在建立 N-grams 的时候，程序提示我的参数有问题：
import argparse
import pathlib
import nltk
# Marker tokens inserted into the token stream.
# SOS carries a trailing space: add_sentence_tokens() repeats it with
# ``SOS * (n - 1)`` and places it directly before the sentence text, so the
# space is what keeps the start markers separated from each other and from
# the first word.
SOS = "<s> "
EOS = "</s>"
# Substituted for tokens that occur only once (see replace_singletons()).
UNK = "<UNK>"
def add_sentence_tokens(sentences, n):
    """Wrap every sentence in start-of-sentence and end-of-sentence markers.

    Args:
        sentences: Iterable of sentence strings.
        n: Order of the n-gram model. For ``n > 1`` the SOS marker is
            repeated ``n - 1`` times; otherwise a single SOS is used.

    Returns:
        A list of strings of the form ``"<SOS...><sentence> <EOS>"``.
    """
    # SOS is placed immediately before the sentence with no extra separator,
    # so any spacing between them comes from the SOS constant itself.
    sos = SOS * (n - 1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]
def replace_singletons(tokens):
    """Replace every token that occurs exactly once with the UNK marker.

    Args:
        tokens: Sequence of token strings (iterated twice: once to count,
            once to build the result).

    Returns:
        A new list of the same length in which tokens with a count of 1
        are replaced by ``UNK`` and all other tokens are kept unchanged.
    """
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]
def preprocess(sentences, n):
    """Turn raw sentences into a flat token list ready for n-gram counting.

    The sentences are wrapped in SOS/EOS markers, joined into one string,
    split on single spaces, and finally tokens seen only once are replaced
    by the UNK marker.

    Args:
        sentences: Iterable of sentence strings.
        n: Order of the n-gram model (controls how many SOS markers are
            prepended to each sentence).

    Returns:
        A flat list of token strings.
    """
    sentences = add_sentence_tokens(sentences, n)
    # Splitting on a literal ' ' (not arbitrary whitespace) means runs of
    # consecutive spaces produce empty-string tokens.
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)
    return tokens
def load_data(train_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        train_path: Path of the file to read (anything ``open`` accepts).

    Returns:
        A list with one stripped string per line of the file; blank lines
        are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    with open(train_path, 'r') as f:
        # Iterating the file object yields the same lines as readlines()
        # without first building an intermediate list.
        train_ = [line.strip() for line in f]
    return train_
class LanguageModel(object):
    """Holds the preprocessed training tokens and vocabulary for an n-gram model."""

    def __init__(self, train_data, n, laplace=1):
        """Preprocess the training data and build the vocabulary.

        Args:
            train_data: Iterable of sentence strings.
            n: Order of the n-gram model.
            laplace: Smoothing parameter; it is only stored on the instance
                by the code visible here.
        """
        self.n = n
        self.laplace = laplace
        # Flat token list with SOS/EOS markers added and singletons mapped
        # to the unknown-word marker.
        self.tokens = preprocess(train_data, n)
        # Frequency distribution over the tokens; len(self.vocab) is the
        # number of distinct tokens.
        self.vocab = nltk.FreqDist(self.tokens)
if __name__ == '__main__':
    # The first positional argument of ArgumentParser is `prog`, so this
    # string is what appears as the program name in usage and error output.
    parser = argparse.ArgumentParser("N-gram Language Model")
    # --data and --n are declared required: running the script without them
    # makes argparse exit with "the following arguments are required".
    # Example invocation: python <script>.py --data train.txt --n 2
    parser.add_argument('--data', type=str, required=True,
                        help='Location of the training data file (one sentence per line)')
    parser.add_argument('--n', type=int, required=True,
                        help='Order of N-gram model to create (i.e. 1 for unigram, 2 for bigram, etc.)')
    parser.add_argument('--laplace', type=float, default=0.01,
                        help='Lambda parameter for Laplace smoothing (default is 0.01 -- use 1 for add-1 smoothing)')
    args = parser.parse_args()

    # Load and prepare the training data. The path is passed straight to
    # open() inside load_data(), so it must point to a file.
    data_path = pathlib.Path(args.data)
    train = load_data(data_path)

    print("Loading {}-gram model...".format(args.n))
    lm = LanguageModel(train, args.n, laplace=args.laplace)
    print("Vocabulary size: {}".format(len(lm.vocab)))
usage: N-gram Language Model [-h] --data DATA --n N [--laplace LAPLACE]
N-gram Language Model: error: the following arguments are required: --data, --n