import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import re
#import csv
import string
from gensim.test.utils import common_texts,get_tmpfile
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.word2vec import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
#from keras.models import Model
#from sklearn import metrics
from keras.models import load_model
from keras.models import Sequential
import matplotlib.pyplot as plt
# Load the datasets; lineterminator is the line separator. By default notebook files are saved under the user folder on the C: drive.
# newTrain.csv and newTest.csv sit in the same directory as this notebook (Untitled5.ipynb).
train_data = pd.read_csv('newTrain.csv', lineterminator='\n')
test_data = pd.read_csv('newTest.csv', lineterminator='\n')
# Data preprocessing:
# Use LabelEncoder to normalize the labels
def encodeLabel(data):
    listLable = []
    # I typed the column name as "lable" everywhere; I know the correct spelling is "label".
    # If you want to fix it, remember to also rename the "lable" header in the CSV files loaded above.
    for lable in data['lable']:
        listLable.append(lable)
    # Everything above just collects the labels; below they are encoded
    le = LabelEncoder()
    resultLable = le.fit_transform(listLable)
    return resultLable

trainLable = encodeLabel(train_data)
testLable = encodeLabel(test_data)
# Print testLable to see what it looks like
print(testLable)

# Collect all the reviews into one list:
def getReview(data):
    listReview = []
    for review in data['review']:
        listReview.append(review)
    return listReview

trainReview = getReview(train_data)
testReview = getReview(test_data)
# Print testReview to see what it looks like
print(testReview)
def stopwordslist():
    # Load the stop-word list. 中文停用词表.txt must also sit in the same directory, since relative paths are used throughout.
    stopwords = [line.strip() for line in open('中文停用词表.txt', encoding='UTF-8').readlines()]
    return stopwords

def deleteStop(sentence):
    # Remove stop words
    stopwords = stopwordslist()
    outstr = ""
    for i in sentence:
        if i not in stopwords and i != "\n":
            outstr += i
    return outstr
def wordCut(Review):
    Mat = []
    for rec in Review:
        seten = []
        rec = re.sub('[%s]' % re.escape(string.punctuation), '', rec)
        fenci = jieba.lcut(rec)   # segment with jieba in accurate mode
        stc = deleteStop(fenci)   # remove stop words
        # sentence = list(map(lambda x: x.strip().lower() if len(x.strip().lower()) > 0 else None, jieba.cut(stc)))  # pull the words out of each sentence
        seg_list = pseg.cut(stc)  # part-of-speech tagging
        for word, flag in seg_list:
            if flag not in ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"]:  # drop words with these POS tags (person names, place names, etc.)
                seten.append(word)
        Mat.append(seten)
    return Mat

trainCut = wordCut(trainReview)
testCut = wordCut(testReview)
# Print testCut to see what it looks like
print(testCut)
wordCut = trainCut + testCut  # note: this rebinds the name wordCut from the function above to the combined token list
# The lines below save all the tokens to wordCut.txt so the same preprocessing can be applied to incoming data
# when the model is deployed with Flask (explained in the video).
fileDic = open('wordCut.txt', 'w', encoding='UTF-8')
for i in wordCut:
    fileDic.write(" ".join(i))
    fileDic.write('\n')
fileDic.close()
# We could print what gets read back, but there is far too much of it
words = [line.strip().split(" ") for line in open('wordCut.txt', encoding='UTF-8').readlines()]
#print(words)
maxLen = 100
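maxLen is presumably the padded sequence length for the Keras side of the pipeline hinted at by the Tokenizer / pad_sequences / to_categorical imports at the top. A minimal sketch of that step, not part of the original excerpt and with illustrative variable names (tokenizer, trainSeq, trainIndex, trainCate), assuming the standard Keras text API:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)                        # words: the token lists read back from wordCut.txt
trainSeq = tokenizer.texts_to_sequences(trainCut)    # map every token to its integer index
trainIndex = pad_sequences(trainSeq, maxlen=maxLen)  # pad/truncate each review to maxLen tokens
trainCate = to_categorical(trainLable)               # one-hot encode the labels for Keras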
# Train word2vec:
# word-vector dimensionality
num_features = 100
# minimum frequency for a word to be kept
min_word_count = 3
# number of CPU cores used for parallel training
num_workers = 4
# context window size
context = 4
model = Word2Vec(wordCut, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context)
# Force unit normalization of the vectors (init_sims is deprecated in recent gensim versions)
model.init_sims(replace=True)
# Save the trained vectors; if you save into a directory such as ./data/model it must exist beforehand
model.wv.save_word2vec_format("word2vec.bin", binary=False)
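Since binary=False writes the vectors in the plain-text word2vec format, they can be read back later, for example when building an embedding matrix for the Keras model. A minimal sketch of the reload, assuming gensim's standard KeyedVectors API; w2vModel is an illustrative name, not from the original code:
from gensim.models import KeyedVectors
w2vModel = KeyedVectors.load_word2vec_format("word2vec.bin", binary=False)  # reload the vectors saved above
print(w2vModel.vector_size)  # should print 100, the dimensionality set above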
What error are you getting? Paste the error message.
That's a problem with how the module is being called!
Here is a script I used before:
import collections
from gensim.models import word2vec
from gensim.models import KeyedVectors

def stat_words(file_path, freq_path):
    '''
    Count word frequencies and save them to a file, to get a feel for the dataset.
    Args:
        file_path: path to the corpus file
        freq_path: path where the word-frequency file is saved
    Return:
        word_freq_list = [[word, count], ...]
    '''
    fr = open(file_path, 'r', encoding='utf-8')  # read the corpus and count word frequencies
    lines = fr.readlines()
    text = [line.strip().split(' ') for line in lines]
    fr.close()
    word_counts = collections.Counter()  # count word frequencies
    for content in text:
        word_counts.update(content)
    word_freq_list = sorted(word_counts.most_common(), key=lambda x: x[1], reverse=True)
    fw = open(freq_path, 'w', encoding='utf-8')  # save the frequency data to a file
    for i in range(len(word_freq_list)):
        content = ' '.join(str(word_freq_list[i][j]) for j in range(len(word_freq_list[i])))
        fw.write(content + '\n')
    fw.close()
    return word_freq_list  # return the frequency list, as the docstring and the caller below expect

def get_word_embedding(input_corpus, model_path):
    '''
    Train word embeddings for the corpus with gensim.
    Args:
        input_corpus: path to the corpus file
        model_path: path where the trained word embedding file is saved
    '''
    sentences = word2vec.Text8Corpus(input_corpus)  # load the corpus
    # Common parameters: size = vector dimensionality, window = maximum context distance, min_count = minimum word frequency, iter = number of training iterations
    model = word2vec.Word2Vec(sentences, size=100, window=8, min_count=3, iter=8)
    #model.save(model_path)
    model.wv.save_word2vec_format(model_path, binary=False)

if __name__ == '__main__':
    corpus_path = 'data/toutiao_word_corpus.txt'    # path to the Chinese corpus file
    freq_path = 'data/words_freq_info.txt'          # path where the word-frequency info is saved
    word_list = stat_words(corpus_path, freq_path)  # count and save the word frequencies in the corpus
    model_path = 'toutiao_word_embedding.bin'       # path where the trained word vectors are saved
    get_word_embedding(corpus_path, model_path)     # train word vectors on the corpus
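A note on the snippet above: the size and iter keyword arguments belong to the pre-4.0 gensim API; gensim 4.x renamed them, which is the likely reason the same code runs in one environment and fails in another. A roughly equivalent call for gensim 4.x would be:
# gensim >= 4.0 renamed size -> vector_size and iter -> epochs
model = word2vec.Word2Vec(sentences, vector_size=100, window=8, min_count=3, epochs=8)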
I just ran it as well, no errors.
import codecs
import numpy
import gensim
import numpy as np
from keyword_extract import *
wordvec_size=192
def get_char_pos(string, char):
    chPos = []
    try:
        chPos = list(((pos) for pos, val in enumerate(string) if (val == char)))
    except:
        pass
    return chPos

def word2vec(file_name, model):
    with codecs.open(file_name, 'r', encoding="utf-8") as f:
        word_vec_all = numpy.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word = data[0:space_pos[0]]
            if model.wv.__contains__(str(first_word)):
                word_vec_all = word_vec_all + model[str(first_word)]  # the model[...] lookup here is what raises the TypeError below
            for i in range(len(space_pos) - 1):
                word = data[space_pos[i]:space_pos[i + 1]]
                if model.wv.__contains__(str(first_word)):  # note: the inner loop still tests and adds first_word rather than word
                    word_vec_all = word_vec_all + model[str(first_word)]
        return word_vec_all

def simlarityCalu(vector1, vector2):
    vector1Mod = np.sqrt(vector1.dot(vector1))
    vector2Mod = np.sqrt(vector2.dot(vector2))
    if vector2Mod != 0 and vector1Mod != 0:
        simlarity = (vector1.dot(vector2)) / (vector1Mod * vector2Mod)
    else:
        simlarity = 0
    return simlarity

if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec = word2vec(p1_keywords, model)
    p2_vec = word2vec(p2_keywords, model)
    print(simlarityCalu(p1_vec, p2_vec))
D:\python\Anaconda3\envs\tensorflow\python.exe E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py
D:\python\Anaconda3\envs\tensorflow\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 1.419 seconds.
Prefix dict has been built successfully.
Traceback (most recent call last):
File "E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py", line 49, in <module>
p1_vec=word2vec(p1_keywords,model)
File "E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py", line 24, in word2vec
word_vec_all= word_vec_all+model[str(first_word)]
TypeError: 'Word2Vec' object is not subscriptable
Process finished with exit code 1
I want to use word2vec to compute the similarity between two texts. How can I fix this error? Any help would be much appreciated.
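For reference, the TypeError in the traceback comes from indexing the Word2Vec object itself; since gensim 4.0, word lookups go through the model's .wv KeyedVectors. A minimal sketch of the change inside word2vec(), assuming a gensim 4.x install (the inner loop presumably also means to look up word rather than first_word):
if str(first_word) in model.wv:                               # membership test on the KeyedVectors
    word_vec_all = word_vec_all + model.wv[str(first_word)]  # index model.wv, not model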