Problem saving a word2vec model with gensim 4.0

import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
import re
#import csv
import string
from gensim.test.utils import common_texts,get_tmpfile
from keras import models
from keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.word2vec import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
#from keras.models import Model
#from sklearn import metrics
from keras.models import load_model
from keras.models import Sequential
import matplotlib.pyplot as plt
#Read in the datasets; lineterminator is the line separator. By default the notebook saves its files under C:\Users
#newTrain.csv and newTest.csv sit in the same directory as this notebook file (Untitled5.ipynb)
train_data = pd.read_csv('newTrain.csv', lineterminator='\n')
test_data=pd.read_csv('newTest.csv', lineterminator='\n')

#Data preprocessing:
#Use LabelEncoder to encode the data labels
def encodeLabel(data):
    listLable=[]
    #The label column is spelled "lable" throughout; if you prefer to rename it to "label", remember to also change the column header in the CSV files loaded above
    for lable in data['lable']:
        listLable.append(lable)
    #Up to here the labels are just collected into a list; below they are encoded
    le = LabelEncoder()
    resultLable=le.fit_transform(listLable)
    return resultLable

trainLable=encodeLabel(train_data)
testLable=encodeLabel(test_data)
#Print testLable to see what it looks like
print(testLable)
#This returns the collection of all reviews:
def getReview(data):
    listReview=[]
    for review in data['review']:
        listReview.append(review)
    return listReview

trainReview=getReview(train_data)
testReview=getReview(test_data)
#Print testReview to see what it looks like
print(testReview)

def stopwordslist():#Load the stopword list; 中文停用词表.txt must also be in the same directory as this file, since relative paths are used here
    stopwords = [line.strip() for line in open('中文停用词表.txt',encoding='UTF-8').readlines()]
    return stopwords

def deleteStop(sentence):     #Remove stopwords
    stopwords=stopwordslist()
    outstr=""
    for i in sentence:
        if i not in stopwords and i!="\n":
            outstr+=i
    return outstr
def wordCut(Review):
    Mat=[]
    for rec in Review:
        seten=[]
        rec = re.sub('[%s]' % re.escape(string.punctuation), '',rec)
        fenci=jieba.lcut(rec)    #Segment with jieba's precise mode
        stc=deleteStop(fenci)     #Remove stopwords
#         sentence = list(map(lambda x: x.strip().lower() if len(x.strip().lower()) > 0 else None, jieba.cut(stc)))  # pull out the words of each sentence
        seg_list=pseg.cut(stc)    #POS-tag the words
        for word,flag in seg_list:
            if flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"]:  #Drop words with these POS tags (person names, place names, etc.)
                seten.append(word)
        Mat.append(seten)
    return Mat
trainCut=wordCut(trainReview)
testCut=wordCut(testReview)
#Print testCut to see what it looks like
print(testCut)
wordCut=trainCut+testCut
#The next few lines store all the tokens in wordCut.txt so that the same preprocessing can be applied to incoming data when the model is deployed with Flask (also explained in the video)
fileDic=open('wordCut.txt','w',encoding='UTF-8')
for i in wordCut:
    fileDic.write(" ".join(i))
    fileDic.write('\n')
fileDic.close()
#You can print what gets read back, but there is far too much output to show
words = [line.strip().split(" ") for line in open('wordCut.txt',encoding='UTF-8').readlines()]
#print(words)
maxLen=100
#Train word2vec:
# Word vector dimensionality
num_featrues = 100
# Minimum frequency for a word to be considered
min_word_count = 3
# Number of CPU cores used for parallel training
num_workers =4
# Context window size
context = 4
model = Word2Vec(wordCut, workers=num_workers,vector_size=num_featrues, min_count=min_word_count,window=context)
# Force unit normalization of the vectors
model.init_sims(replace=True)
# Save the trained vectors to a path (if you save into a directory such as ./data/model, it must already exist)
model.wv.save_word2vec_format("word2vec.bin",binary=False)
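
If the failure is in this normalize/save step: under gensim 4.0, init_sims() is deprecated and may no longer exist on the Word2Vec model itself, while save_word2vec_format() and save() are unchanged. A minimal sketch of the same step without init_sims, assuming gensim >= 4.0:

model = Word2Vec(wordCut, workers=num_workers, vector_size=num_featrues, min_count=min_word_count, window=context)
# init_sims(replace=True) is deprecated in gensim 4.x; if unit-length vectors are needed,
# get_normed_vectors() returns a normalized copy instead of overwriting the raw vectors
normed_vectors = model.wv.get_normed_vectors()
model.wv.save_word2vec_format("word2vec.bin", binary=False)   # plain-text word vectors
model.save("word2vec.model")                                  # full model, reloadable with Word2Vec.load()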
 

What exactly went wrong? Please paste the error message.

This is a problem with how you're calling the module!

Here is one I used before:

import collections
from gensim.models import word2vec
from gensim.models import KeyedVectors

def stat_words(file_path, freq_path):
    '''
    Count word frequencies and save them to a file, to get a feel for the dataset
    Args:
        file_path: path to the corpus file
        freq_path: path where the word-frequency file is saved
    Return:
        word_list = [[word:count],...]
    '''
    fr = open(file_path, 'r',encoding='utf-8') #Read the corpus file and count word frequencies
    lines = fr.readlines()
    text = [line.strip().split(' ') for line in lines]
    fr.close()
    word_counts = collections.Counter()  #Count word frequencies
    for content in text:
        word_counts.update(content)
    word_freq_list = sorted(word_counts.most_common(), key=lambda x:x[1], reverse=True)
    fw = open(freq_path, 'w',encoding='utf-8') #Save the word-frequency data to a file
    for i in range(len(word_freq_list)):
        content = ' '.join(str(word_freq_list[i][j]) for j in range(len(word_freq_list[i])))
        fw.write(content + '\n')
    fw.close()

def get_word_embedding(input_corpus, model_path):
    '''
    Use gensim to train word embeddings for the corpus
    Args:
        input_corpus: path to the corpus file
        model_path: path where the pretrained word-embedding file is saved
    '''
    sentences = word2vec.Text8Corpus(input_corpus)  # Load the corpus
    #Common parameters: size = vector dimensionality, window = maximum context distance, min_count = minimum word frequency, iter = number of SGD training iterations
    model = word2vec.Word2Vec(sentences, size=100, window=8, min_count=3, iter=8)
    #model.save(model_path)
    model.wv.save_word2vec_format(model_path, binary=False)

if __name__ == '__main__':
    corpus_path = 'data/toutiao_word_corpus.txt' #Path to the Chinese corpus file
    freq_path = 'data/words_freq_info.txt' #Path where the word-frequency file is saved
    word_list = stat_words(corpus_path, freq_path) #Count and save word-frequency info for the corpus
    
    model_path = 'toutiao_word_embedding.bin' #Path where the trained word vectors are saved
    get_word_embedding(corpus_path, model_path) #Train word vectors for the corpus

I just ran it myself and there were no errors.
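
One caveat: the snippet above uses the gensim 3.x parameter names. Under gensim 4.0, which the question targets, size was renamed to vector_size and iter to epochs, so the equivalent call would roughly be (a sketch, assuming gensim >= 4.0):

# gensim 4.x equivalent of word2vec.Word2Vec(sentences, size=100, window=8, min_count=3, iter=8)
model = word2vec.Word2Vec(sentences, vector_size=100, window=8, min_count=3, epochs=8)
model.wv.save_word2vec_format(model_path, binary=False)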


import codecs
import numpy
import gensim
import numpy as np
from keyword_extract import *

wordvec_size=192
def get_char_pos(string,char):
    chPos=[]
    try:
        chPos=list(((pos) for pos,val in enumerate(string) if(val == char)))
    except:
        pass
    return chPos

def word2vec(file_name,model):
    with codecs.open(file_name, 'r',encoding="utf-8") as f:
        word_vec_all = numpy.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word=data[0:space_pos[0]]
            if model.wv.__contains__(str(first_word)):
                word_vec_all= word_vec_all+model[str(first_word)]

            for i in range(len(space_pos) - 1):
                word = data[space_pos[i]:space_pos[i + 1]]
                if model.wv.__contains__(str(first_word)):
                    word_vec_all = word_vec_all+model[str(first_word)]
        return word_vec_all

def simlarityCalu(vector1,vector2):
    vector1Mod=np.sqrt(vector1.dot(vector1))
    vector2Mod=np.sqrt(vector2.dot(vector2))
    if vector2Mod!=0 and vector1Mod!=0:
        simlarity=(vector1.dot(vector2))/(vector1Mod*vector2Mod)
    else:
        simlarity=0
    return simlarity

if __name__ == '__main__':
    model = gensim.models.Word2Vec.load('zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    getKeywords(p1, p1_keywords)
    getKeywords(p2, p2_keywords)
    p1_vec=word2vec(p1_keywords,model)
    p2_vec=word2vec(p2_keywords,model)

    print(simlarityCalu(p1_vec,p2_vec))





D:\python\Anaconda3\envs\tensorflow\python.exe E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py
D:\python\Anaconda3\envs\tensorflow\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 1.419 seconds.
Prefix dict has been built successfully.
Traceback (most recent call last):
  File "E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py", line 49, in <module>
    p1_vec=word2vec(p1_keywords,model)
  File "E:/daima/python2/learning-nlp-master/chapter-7/word2vec训练与相似度计算/word2vec_sim.py", line 24, in word2vec
    word_vec_all= word_vec_all+model[str(first_word)]
TypeError: 'Word2Vec' object is not subscriptable

Process finished with exit code 1

I want to use word2vec to compute the similarity between two texts. How do I fix this error? Any help is much appreciated.
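
For the TypeError: in gensim 4.0 the Word2Vec model can no longer be indexed like a dictionary; the vectors live on model.wv, so model[str(first_word)] has to become model.wv[str(first_word)]. The inner loop also looks up first_word where it presumably means word. A minimal fix sketch for the word2vec() helper, assuming gensim >= 4.0 and keyword files with space-separated tokens per line:

def word2vec(file_name, model):
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        word_vec_all = numpy.zeros(wordvec_size)
        for data in f:
            for word in data.strip().split(' '):
                # index the KeyedVectors (model.wv), not the Word2Vec model itself
                if word in model.wv:
                    word_vec_all = word_vec_all + model.wv[word]
        return word_vec_all

simlarityCalu() can then be used unchanged on the two resulting vectors.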