Error when visualizing an LDA model with pyLDAvis

An error occurs when running the following code:

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')
pyLDAvis.show(pic, local=False)

(screenshot of the error message)
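
For context, pyLDAvis.sklearn.prepare expects a fitted scikit-learn LDA model, the document-term matrix it was trained on, and the fitted vectorizer. A minimal sketch of how those three objects are typically built (placeholder corpus and values, not the actual data from the notebook):

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ['placeholder segmented document one',
        'placeholder segmented document two',
        'placeholder segmented document three']   # stand-in corpus
n_topics = 2

tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(docs)            # document-term matrix
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0).fit(tf)

pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass' + str(n_topics) + '.html')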

  • Found a similar question for you, take a look: https://ask.csdn.net/questions/4646672
  • You could also refer to this article: 使用LDA分类器对邮件进行分类 (classifying emails with an LDA classifier)
  • In addition, the "2 完整的模型代码" (complete model code) section of the blog post "LDA 学习笔记" (LDA study notes) may help with your problem; you can read the excerpt below carefully or jump to the source blog post:
  • import re
    import time
    import jieba
    import jieba.posseg as pseg
    import numpy as np
    import joblib  # sklearn.externals.joblib was removed in newer scikit-learn; use the standalone joblib package
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.model_selection import GridSearchCV
    
    def read_data(fname):
        '''
        Each line of the input file is stored as:
            title   content answer  tag tag1
        (tab separated). Only the title and the answer are used when training the LDA model.
        :return: a list in which each item is a question plus all of its answers, joined by '.'
        '''
        dic = {}
        with open(fname, 'r', encoding='utf8') as f:
            for i in f:
                lst = i.strip().split('\t')
                # strip one trailing punctuation mark from the answer before storing it
                if lst[0] not in dic:
                    dic[lst[0]] = re.sub('[。?!,、….?!:]$', '', lst[2])
                else:
                    dic[lst[0]] += '.' + re.sub('[。?!,、….?!:]$', '', lst[2])
        corpus = [re.sub('[。?!,、….?!:]$', '', key) + '.' + dic[key] for key in dic]
        return corpus
    
    def jieba_cut(corpus, cut_file, stopwords_file):
        # Segment each document with jieba and remove stopwords (a Chinese stopword
        # list found online), then append the segmented corpus to cut_file
        stopwords = []
        with open(stopwords_file, 'r', encoding='utf8') as f:
            for i in f:
                stopwords.append(i.strip())
        corpus_cut = []
        n = 0
        for s in corpus:
            s_cut = [w for w in jieba.cut(s) if w not in stopwords]
            corpus_cut.append(' '.join(s_cut))
            n += 1
            if n % 10000 == 0:
                print(n)
        with open(cut_file, 'a', encoding='utf8') as f1:
            for i in corpus_cut:
                f1.write(i + '\n')
        return corpus_cut
    
        
        
    def vec_model(cut_file):
        # Build a term-frequency (bag-of-words) matrix from the segmented corpus
        # and persist the fitted vectorizer
        with open(cut_file, 'r', encoding='utf8') as f:
            corpus_cut = [i.strip() for i in f.readlines()]
        # stop_words='english' only filters English stopwords; Chinese stopwords were already removed during segmentation
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        x = tf_vectorizer.fit_transform(corpus_cut)
        joblib.dump(tf_vectorizer, tf_ModelPath)
        return x, tf_vectorizer, corpus_cut
    
    
    
    
    def read_vec_model(cut_file, tf_ModelPath):
        # Load the previously saved vectorizer instead of refitting it
        with open(cut_file, 'r', encoding='utf8') as f:
            corpus_cut = [i.strip() for i in f.readlines()]
        tf_vectorizer = joblib.load(tf_ModelPath)
        # the loaded vectorizer is already fitted, so transform is enough here
        x = tf_vectorizer.transform(corpus_cut)
        return x, tf_vectorizer, corpus_cut
    
    
    
    def train(vec_data, tf_model, n_topics=14, max_iter=10, learning_method='batch'):
        '''
        Train the LDA model and persist it
        :param vec_data: document-term matrix produced by the vectorizer
        :param n_topics: number of topics
        :param max_iter: maximum number of iterations
        :param learning_method: 'batch' or 'online'
        :return: the trained LDA model
        '''
        # recent scikit-learn uses n_components instead of the removed n_topics parameter
        lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                        learning_method=learning_method, max_doc_update_iter=5)
        print('train')
        a = time.time()
        lda.fit(vec_data)
        print(time.time() - a)
        n_top_words = 20
        # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
        tf_feature_names = tf_model.get_feature_names_out()
        print_top_words(lda, tf_feature_names, n_top_words)
        joblib.dump(lda, lda_ModelPath)
        return lda
    
    
    
    def print_top_words(model, feature_names, n_top_words):
        # Print the highest-weighted terms of each topic
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
        print(model.components_)
    
        
        
    def grid_search(vec_data, tf_vectorizer, parameters):
        # left unimplemented in the original blog post; presumably a GridSearchCV
        # over LDA hyper-parameters (e.g. n_components) was intended here
        GridSearchCV
    
        
        
    def jieba_cut_transform(file):
        # Read the jieba-segmented data and strip everything that is not a Chinese
        # character (English letters, digits, punctuation) from each token
        with open(file, 'r', encoding='utf8') as f:
            ret_list = f.readlines()
        with open('./jieba_cut_all_drop.txt', 'a', encoding='utf8') as f1:
            for i in [[re.sub(r'[^\u4e00-\u9fa5]+', '', j) for j in line.strip().split(' ') if re.sub(r'[a-zA-Z0-9]+', '', j)] for line in ret_list]:
                f1.write(' '.join(i) + '\n')
    

    Call the functions above to train the model

    lda_ModelPath = './lda_model2_all'
    tf_ModelPath = './tf_model1_all'
    fname = '../train_data'
    cut_file = './jieba_cut_all.txt'
    stopwords_file = './stopwords'
    
    # read the data
    corpus = read_data(fname)
    # jieba word segmentation
    cut_data = jieba_cut(corpus, cut_file, stopwords_file)
    # fit the term-frequency (bag-of-words) model
    vec_data, tf_vectorizer, cut_data = vec_model(cut_file)
    ## alternatively, load the previously saved vectorizer
    # vec_data, tf_vectorizer, cut_data = read_vec_model(cut_file, tf_ModelPath)
    lda = train(vec_data, tf_vectorizer, max_iter=50)
    # compute perplexity
    # lda = joblib.load(lda_ModelPath)
    p = lda.perplexity(vec_data)
    print(p)
    test_data = vec_data[:10]
    ret = lda.transform(test_data)
    print(ret)
    print(ret.argmax(1))
    print(cut_data[:10])
    print(len(cut_data[:10]))
    
  • You could also review the LDA数学原理推导 (mathematical derivation of LDA) section of 唐宇迪's 机器学习30天进阶实战 course to consolidate the relevant concepts

I asked ChatGPT, which solved it perfectly:

(screenshots of the ChatGPT answer)


default_term_info = default_term_info.sort_values(
    by='saliency', ascending=False).head(R).drop(['saliency'], axis=1)

Just change the corresponding line in pyLDAvis's _prepare.py to the version above and the error goes away.
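
For reference, the likely reason the edit works (assuming the error is the pandas "drop() takes from 1 to 2 positional arguments but 3 were given" TypeError, i.e. pandas 2.x installed together with an older pyLDAvis such as 3.3.x): the stock _prepare.py passes the drop axis positionally, roughly like this, and pandas 2.0 made the axis argument keyword-only:

default_term_info = default_term_info.sort_values(
    by='saliency', ascending=False).head(R).drop('saliency', 1)  # old positional form, rejected by pandas >= 2.0

Rewriting it with the keyword form drop(['saliency'], axis=1), as in the line above, restores compatibility. Alternatives that avoid editing the installed package are pinning pandas to a version below 2.0, or upgrading pyLDAvis to a release that already contains this fix (note that newer pyLDAvis releases renamed the pyLDAvis.sklearn module to pyLDAvis.lda_model).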

The following content is partly based on the ChatGPT model:


When running pyLDAvis.sklearn.prepare(), you may encounter the warning "FutureWarning: Conversion of the second argument of issubdtype from 'float' to 'np.floating' is deprecated".

This can be resolved by adding the following two lines to the code:

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

With that, the warning is ignored and does not affect the execution of the code. The complete code is as follows:

import pyLDAvis.sklearn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda.html')
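
Since which of the fixes above is needed depends on the installed pandas / pyLDAvis / scikit-learn combination, it may help to print the versions first; a small check (requires Python 3.8+ for importlib.metadata):

import importlib.metadata as md

for pkg in ('pandas', 'pyLDAvis', 'scikit-learn', 'numpy'):
    print(pkg, md.version(pkg))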


If my suggestion helped you, please click Accept. Have a nice day!