# 关键在于食物成分有很多带有“-”,比如“frozen-limeade-concentrate”,在GoogleNews-vectors-negative300中是查不到的,需采用合理、准确的方法训练得到这一类成分的词向量,并导出到源文件中。请高人指点,并附上Python源码!谢谢啦!!
# 可以用字符串按“-”分割成子词,分别查出各子词的词向量后取平均,作为整个成分的词向量,望采纳,谢谢
# 望采纳,谢谢
"""
利用GoogleNews-vectors-negative300提取词语的词向量
"""
import numpy as np
# Loads 300-dim word vectors from a word2vec-format binary file.
def load_bin_vec(fname, vocab):
    """Extract pre-trained word2vec vectors for the words in ``vocab``.

    Parameters
    ----------
    fname : str
        Path to a word2vec binary file (e.g. GoogleNews-vectors-negative300.bin):
        an ASCII header line ``"<vocab_size> <vector_size>\\n"`` followed by
        ``word<space><vector_size float32 values>`` records.
    vocab : container of str
        Words whose vectors should be kept; all others are skipped.

    Returns
    -------
    dict
        Maps word (str) -> np.ndarray of float32 with shape (vector_size,).
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())  # e.g. 3000000 300
        binary_len = np.dtype('float32').itemsize * layer1_size  # bytes per vector
        for _ in range(vocab_size):
            chars = []
            while True:
                ch = f.read(1)
                # f.read(1) yields bytes in Python 3 — compare against bytes,
                # and stop on EOF (b'') to avoid spinning forever on a short file.
                if ch == b' ' or ch == b'':
                    break
                if ch != b'\n':  # some records are preceded by a newline; skip it
                    chars.append(ch)
            word = b''.join(chars).decode('utf-8', errors='replace')
            if word in vocab:
                # np.frombuffer replaces the deprecated np.fromstring.
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)  # skip the vector of an unwanted word
    return word_vecs
# add random vectors of unknown words which are not in pre-trained vector file.
# if pre-trained vectors are not used, then initialize all words in vocab with random value.
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """Add uniform random vectors for vocab words missing from ``word_vecs``.

    Parameters
    ----------
    word_vecs : dict
        Mutated in place; maps word -> vector.
    vocab : dict or iterable of str
        Either word -> document frequency (only words with frequency
        >= ``min_df`` are added) or a plain iterable of words (every
        missing word is added).
    min_df : int
        Minimum document frequency for a word to get a random vector.
    k : int
        Dimensionality of the random vectors (default 300 to match
        GoogleNews-vectors-negative300).
    """
    for word in vocab:
        # A plain iterable carries no counts, so treat every word as
        # frequent enough; a dict is indexed for its real count.
        count = vocab[word] if isinstance(vocab, dict) else min_df
        if word not in word_vecs and count >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
if __name__ == '__main__':
    vectors_file = './GoogleNews-vectors-negative300.bin'
    # vocab maps each word to its document frequency: add_unknown_words
    # indexes vocab by word, so a plain list would raise TypeError.
    vocab = {'I': 1, 'can': 1, 'do': 1}
    vectors = load_bin_vec(vectors_file, vocab)  # pre-trained vectors
    # Words absent from the pre-trained file get random fallback vectors.
    add_unknown_words(vectors, vocab)
    print(vectors['I'])
    print('*' * 40)
    print(vectors['can'])
    print('*' * 40)
    print(vectors['do'])