When running the following code, an error occurred:

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')
pyLDAvis.show(pic, local=False)
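For context, pyLDAvis.sklearn.prepare() expects the fitted topic model, the document-term matrix, and the vectorizer. In the training script below, my reading (not stated explicitly in the original) is:

# lda           -> the fitted LatentDirichletAllocation model
# tf            -> vec_data, the document-term matrix from CountVectorizer
# tf_vectorizer -> the fitted CountVectorizer

The full training script follows.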
import re
import time
import jieba
import jieba.posseg as pseg
import numpy as np
import joblib  # was `from sklearn.externals import joblib`, removed in scikit-learn 0.23
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
def read_data(fname):
    '''
    Each input line is stored as:
        title \t content \t answer \t tag \t tag1
    Only the title and the answer are needed for LDA training.
    :return: list, each item is one question plus all of its answers, joined by '.'
    '''
    dic = {}
    with open(fname, 'r', encoding='utf8') as f:
        for i in f:
            lst = i.strip().split('\t')
            # strip one trailing punctuation mark from the answer
            if lst[0] not in dic:
                dic[lst[0]] = re.sub('[。?!,、….?!:]$', '', lst[2])
            else:
                dic[lst[0]] += '.' + re.sub('[。?!,、….?!:]$', '', lst[2])
    corpus = [re.sub('[。?!,、….?!:]$', '', key) + '.' + dic[key] for key in dic]
    return corpus
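A tiny worked example of what read_data returns (the file contents here are made up):

# demo.tsv, tab-separated:
#   怎么学英语<TAB>内容<TAB>多听多说。<TAB>tag<TAB>tag1
#   怎么学英语<TAB>内容<TAB>坚持背单词!<TAB>tag<TAB>tag1
corpus = read_data('demo.tsv')
print(corpus)  # ['怎么学英语.多听多说.坚持背单词']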
def jieba_cut(corpus, cut_file, stopwords_file):
    # Segment with jieba and remove stopwords (a Chinese stopword list found
    # online), then save the result to cut_file.
    stopwords = set()
    with open(stopwords_file, 'r', encoding='utf8') as f:
        for i in f:
            stopwords.add(i.strip())
    corpus_cut = []
    n = 0
    for s in corpus:
        s_cut = [w for w in jieba.cut(s) if w not in stopwords]
        corpus_cut.append(' '.join(s_cut))
        n += 1
        if n % 10000 == 0:
            print(n)
    # note: append mode, so rerunning will duplicate lines in cut_file
    with open(cut_file, 'a', encoding='utf8') as f1:
        for i in corpus_cut:
            f1.write(i + '\n')
    return corpus_cut
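For instance (actual tokens depend on jieba's dictionary and your stopword list, so this is illustrative only; 'demo_cut.txt' is a hypothetical path):

corpus = ['我想学习自然语言处理']
cut = jieba_cut(corpus, './demo_cut.txt', './stopwords')
# cut[0] might be: '学习 自然语言 处理', after stopwords like 我/想 are removed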
def vec_model(cut_file):
    # Build the term-frequency matrix and persist the fitted vectorizer.
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    # stop_words='english' only affects leftover English tokens;
    # Chinese stopwords were already removed during segmentation
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    x = tf_vectorizer.fit_transform(corpus_cut)
    joblib.dump(tf_vectorizer, tf_ModelPath)
    return x, tf_vectorizer, corpus_cut
def read_vec_model(cut_file, tf_ModelPath):
    # Load a previously fitted vectorizer instead of refitting.
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    tf_vectorizer = joblib.load(tf_ModelPath)
    # use transform, not fit_transform, so the saved vocabulary is kept
    x = tf_vectorizer.transform(corpus_cut)
    return x, tf_vectorizer, corpus_cut
def train(vec_data, tf_model, n_topics=14, max_iter=10, learning_method='batch'):
    '''
    Train and persist the LDA model.
    :param vec_data: document-term matrix
    :param n_topics: number of topics
    :param max_iter: maximum number of EM iterations
    :param learning_method: 'batch' or 'online'
    :return: the fitted LDA model
    '''
    # n_components replaced the old n_topics parameter in scikit-learn >= 0.19
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                    learning_method=learning_method,
                                    max_doc_update_iter=5)
    print('train')
    a = time.time()
    lda.fit(vec_data)
    print(time.time() - a)
    n_top_words = 20
    tf_feature_names = tf_model.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.2
    print_top_words(lda, tf_feature_names, n_top_words)
    joblib.dump(lda, lda_ModelPath)
    return lda
def print_top_words(model, feature_names, n_top_words):
    # Print the highest-weighted terms under each topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
    print(model.components_)
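Its output has the following shape (the terms below are placeholders, not real results):

Topic #0:
手机 屏幕 电池 充电 ...
Topic #1:
快递 物流 发货 签收 ...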
def grid_search(vec_data, tf_vectorizer, parameters):
    # left unfinished in the original; a minimal completion:
    model = GridSearchCV(LatentDirichletAllocation(), param_grid=parameters)
    return model.fit(vec_data)
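With that completion, it could be invoked roughly as follows (the parameter grid values are assumptions, not from the original):

params = {'n_components': [10, 14, 18], 'learning_decay': [0.5, 0.7, 0.9]}
best = grid_search(vec_data, tf_vectorizer, params)
print(best.best_params_)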
def jieba_cut_transform(file):
    # Read the jieba-segmented data and drop all non-Chinese characters
    # (English letters and digits included).
    with open(file, 'r', encoding='utf8') as f:
        ret_list = f.readlines()
    f1 = open('./jieba_cut_all_drop.txt', 'a', encoding='utf8')
    for i in [[re.sub(r'[^\u4e00-\u9fa5]+', '', j) for j in i.strip().split(' ')
               if re.sub(r'[a-zA-Z0-9]+', '', j)] for i in ret_list]:
        f1.write(' '.join(i) + '\n')
    f1.close()
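A quick illustration of the filtering logic on a made-up token list:

line = 'python 学习 abc123 方法2 !!'
kept = [re.sub(r'[^\u4e00-\u9fa5]+', '', j)
        for j in line.split(' ')
        if re.sub(r'[a-zA-Z0-9]+', '', j)]
print(kept)  # ['学习', '方法', ''] -- pure ASCII tokens are dropped, Chinese is kept,
             # and punctuation-only tokens survive the filter but end up empty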
Now call these functions to train the model:
lda_ModelPath = './lda_model2_all'
tf_ModelPath = './tf_model1_all'
fname = '../train_data'
cut_file = './jieba_cut_all.txt'
stopwords_file = './stopwords'
# read the raw data
corpus = read_data(fname)
# segment it with jieba
cut_data = jieba_cut(corpus, cut_file, stopwords_file)
# fit the term-frequency vectorizer
vec_data, tf_vectorizer, cut_data = vec_model(cut_file)
## alternatively, load the saved vectorizer directly:
# vec_data, tf_vectorizer, cut_data = read_vec_model(cut_file, tf_ModelPath)
lda = train(vec_data, tf_vectorizer, max_iter=50)
# compute the perplexity
# lda = joblib.load(lda_ModelPath)
p = lda.perplexity(vec_data)
print(p)
# inspect the topic distribution of the first ten documents
test_data = vec_data[:10]
ret = lda.transform(test_data)
print(ret)
print(ret.argmax(1))
print(cut_data[:10])
print(len(cut_data[:10]))
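The filename 'lda_pass' + str(n_topics) + '.html' at the top suggests several topic counts were compared. A minimal sketch of such a sweep (the candidate values are assumptions, not from the original):

# compare perplexity across topic counts; lower is better, and scoring
# held-out data rather than the training set would be sounder practice
for k in (6, 10, 14, 18):
    m = LatentDirichletAllocation(n_components=k, max_iter=10,
                                  learning_method='batch')
    m.fit(vec_data)
    print(k, m.perplexity(vec_data))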
I asked ChatGPT, and it solved the problem perfectly:

default_term_info = default_term_info.sort_values(
    by='saliency', ascending=False).head(R).drop(['saliency'], axis=1)

Replacing the corresponding line in pyLDAvis's _prepare.py with the code above fixes the error.
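If you are unsure where _prepare.py lives, one way to locate it (a sketch; the exact path depends on your environment):

import os
import pyLDAvis
print(os.path.join(os.path.dirname(pyLDAvis.__file__), '_prepare.py'))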
Separately, when running pyLDAvis.sklearn.prepare() you may see this warning: "FutureWarning: Conversion of the second argument of issubdtype from 'float' to 'np.floating' is deprecated".
It can be suppressed by adding the following two lines:

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

The warning is then ignored and does not affect execution. The complete code:
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda.html')
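One version caveat: in pyLDAvis >= 3.4 the pyLDAvis.sklearn module was renamed, so if the import above fails, the equivalent call (to the best of my knowledge of that release) is:

import pyLDAvis.lda_model  # replaces pyLDAvis.sklearn in pyLDAvis >= 3.4
pic = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)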