When running the following code, an error occurred:

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')
pyLDAvis.show(pic, local=False)
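For context, pyLDAvis.sklearn.prepare() expects the fitted topic model, the document-term matrix, and the vectorizer. In the training script below, my reading (not stated explicitly in the original) is:

# lda           -> the fitted LatentDirichletAllocation model
# tf            -> vec_data, the document-term matrix from CountVectorizer
# tf_vectorizer -> the fitted CountVectorizer

The full training script follows.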
import re
import time
import jieba
import jieba.posseg as pseg
import numpy as np
import joblib  # was `from sklearn.externals import joblib`, removed in scikit-learn 0.23
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
def read_data(fname):
    '''
    Each input line is stored as:
        title \t content \t answer \t tag \t tag1
    Only the title and the answer are needed for LDA training.
    :return: list, each item is one question plus all of its answers, joined by '.'
    '''
    dic = {}
    with open(fname, 'r', encoding='utf8') as f:
        for i in f:
            lst = i.strip().split('\t')
            # strip one trailing punctuation mark from the answer
            if lst[0] not in dic:
                dic[lst[0]] = re.sub('[。?!,、….?!:]$', '', lst[2])
            else:
                dic[lst[0]] += '.' + re.sub('[。?!,、….?!:]$', '', lst[2])
    corpus = [re.sub('[。?!,、….?!:]$', '', key) + '.' + dic[key] for key in dic]
    return corpus
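A tiny worked example of what read_data returns (the file contents here are made up):

# demo.tsv, tab-separated:
#   怎么学英语<TAB>内容<TAB>多听多说。<TAB>tag<TAB>tag1
#   怎么学英语<TAB>内容<TAB>坚持背单词!<TAB>tag<TAB>tag1
corpus = read_data('demo.tsv')
print(corpus)  # ['怎么学英语.多听多说.坚持背单词']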
def jieba_cut(corpus, cut_file, stopwords_file):
    # Segment with jieba and remove stopwords (a Chinese stopword list found
    # online), then save the result to cut_file.
    stopwords = set()
    with open(stopwords_file, 'r', encoding='utf8') as f:
        for i in f:
            stopwords.add(i.strip())
    corpus_cut = []
    n = 0
    for s in corpus:
        s_cut = [w for w in jieba.cut(s) if w not in stopwords]
        corpus_cut.append(' '.join(s_cut))
        n += 1
        if n % 10000 == 0:
            print(n)
    # note: append mode, so rerunning will duplicate lines in cut_file
    with open(cut_file, 'a', encoding='utf8') as f1:
        for i in corpus_cut:
            f1.write(i + '\n')
    return corpus_cut
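For instance (actual tokens depend on jieba's dictionary and your stopword list, so this is illustrative only; 'demo_cut.txt' is a hypothetical path):

corpus = ['我想学习自然语言处理']
cut = jieba_cut(corpus, './demo_cut.txt', './stopwords')
# cut[0] might be: '学习 自然语言 处理', after stopwords like 我/想 are removed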
def vec_model(cut_file):
    # Build the term-frequency matrix and persist the fitted vectorizer.
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    # stop_words='english' only affects leftover English tokens;
    # Chinese stopwords were already removed during segmentation
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    x = tf_vectorizer.fit_transform(corpus_cut)
    joblib.dump(tf_vectorizer, tf_ModelPath)
    return x, tf_vectorizer, corpus_cut
def read_vec_model(cut_file, tf_ModelPath):
    # Load a previously fitted vectorizer instead of refitting.
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    tf_vectorizer = joblib.load(tf_ModelPath)
    # use transform, not fit_transform, so the saved vocabulary is kept
    x = tf_vectorizer.transform(corpus_cut)
    return x, tf_vectorizer, corpus_cut
def train(vec_data, tf_model, n_topics=14, max_iter=10, learning_method='batch'):
    '''
    Train and persist the LDA model.
    :param vec_data: document-term matrix
    :param n_topics: number of topics
    :param max_iter: maximum number of EM iterations
    :param learning_method: 'batch' or 'online'
    :return: the fitted LDA model
    '''
    # n_components replaced the old n_topics parameter in scikit-learn >= 0.19
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                    learning_method=learning_method,
                                    max_doc_update_iter=5)
    print('train')
    a = time.time()
    lda.fit(vec_data)
    print(time.time() - a)
    n_top_words = 20
    tf_feature_names = tf_model.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.2
    print_top_words(lda, tf_feature_names, n_top_words)
    joblib.dump(lda, lda_ModelPath)
    return lda
def print_top_words(model, feature_names, n_top_words):
    # Print the highest-weighted terms under each topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
    print(model.components_)
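Its output has the following shape (the terms below are placeholders, not real results):

Topic #0:
手机 屏幕 电池 充电 ...
Topic #1:
快递 物流 发货 签收 ...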
def grid_search(vec_data, tf_vectorizer, parameters):
    # left unfinished in the original; a minimal completion:
    model = GridSearchCV(LatentDirichletAllocation(), param_grid=parameters)
    return model.fit(vec_data)
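With that completion, it could be invoked roughly as follows (the parameter grid values are assumptions, not from the original):

params = {'n_components': [10, 14, 18], 'learning_decay': [0.5, 0.7, 0.9]}
best = grid_search(vec_data, tf_vectorizer, params)
print(best.best_params_)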
def jieba_cut_transform(file):
    # Read the jieba-segmented data and drop all non-Chinese characters
    # (English letters and digits included).
    with open(file, 'r', encoding='utf8') as f:
        ret_list = f.readlines()
    f1 = open('./jieba_cut_all_drop.txt', 'a', encoding='utf8')
    for i in [[re.sub(r'[^\u4e00-\u9fa5]+', '', j) for j in i.strip().split(' ')
               if re.sub(r'[a-zA-Z0-9]+', '', j)] for i in ret_list]:
        f1.write(' '.join(i) + '\n')
    f1.close()
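A quick illustration of the filtering logic on a made-up token list:

line = 'python 学习 abc123 方法2 !!'
kept = [re.sub(r'[^\u4e00-\u9fa5]+', '', j)
        for j in line.split(' ')
        if re.sub(r'[a-zA-Z0-9]+', '', j)]
print(kept)  # ['学习', '方法', ''] -- pure ASCII tokens are dropped, Chinese is kept,
             # and punctuation-only tokens survive the filter but end up empty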
Now call these functions to train the model:
lda_ModelPath = './lda_model2_all'
tf_ModelPath = './tf_model1_all'
fname = '../train_data'
cut_file = './jieba_cut_all.txt'
stopwords_file = './stopwords'
# read the raw data
corpus = read_data(fname)
# segment it with jieba
cut_data = jieba_cut(corpus, cut_file, stopwords_file)
# fit the term-frequency vectorizer
vec_data, tf_vectorizer, cut_data = vec_model(cut_file)
## alternatively, load the saved vectorizer directly:
# vec_data, tf_vectorizer, cut_data = read_vec_model(cut_file, tf_ModelPath)
lda = train(vec_data, tf_vectorizer, max_iter=50)
# compute the perplexity
# lda = joblib.load(lda_ModelPath)
p = lda.perplexity(vec_data)
print(p)
# inspect the topic distribution of the first ten documents
test_data = vec_data[:10]
ret = lda.transform(test_data)
print(ret)
print(ret.argmax(1))
print(cut_data[:10])
print(len(cut_data[:10]))
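The filename 'lda_pass' + str(n_topics) + '.html' at the top suggests several topic counts were compared. A minimal sketch of such a sweep (the candidate values are assumptions, not from the original):

# compare perplexity across topic counts; lower is better, and scoring
# held-out data rather than the training set would be sounder practice
for k in (6, 10, 14, 18):
    m = LatentDirichletAllocation(n_components=k, max_iter=10,
                                  learning_method='batch')
    m.fit(vec_data)
    print(k, m.perplexity(vec_data))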
I asked ChatGPT, and it solved the problem perfectly:

default_term_info = default_term_info.sort_values(
    by='saliency', ascending=False).head(R).drop(['saliency'], axis=1)

Replacing the corresponding line in pyLDAvis's _prepare.py with the code above fixes the error.
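If you are unsure where _prepare.py lives, one way to locate it (a sketch; the exact path depends on your environment):

import os
import pyLDAvis
print(os.path.join(os.path.dirname(pyLDAvis.__file__), '_prepare.py'))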
Separately, when running pyLDAvis.sklearn.prepare() you may see this warning: "FutureWarning: Conversion of the second argument of issubdtype from 'float' to 'np.floating' is deprecated".
It can be suppressed by adding the following two lines:

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

The warning is then ignored and does not affect execution. The complete code:
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda.html')
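One version caveat: in pyLDAvis >= 3.4 the pyLDAvis.sklearn module was renamed, so if the import above fails, the equivalent call (to the best of my knowledge of that release) is:

import pyLDAvis.lda_model  # replaces pyLDAvis.sklearn in pyLDAvis >= 3.4
pic = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)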