PyCharm上运行CoherenceModel的代码时,会出现代码反复从头开始运行的情况,但是用jupyter运行就很正常,只会循环需要循环运行的部分。
代码如下:
import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# NOTE(review): these statements are at module top level, outside the
# `if __name__ == '__main__':` guard. When worker processes are created with
# the "spawn" start method (the default on Windows), every worker re-imports
# this file, so 'start' is printed and the data file is re-read once per
# worker. That is consistent with the repeated "start"/"end" lines in the
# PyCharm output shown below.
print('start')
# Load the review table; the script reads its 'ConsumerReviews' column later.
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Args:
        sentences: Iterable of raw documents. Each item is converted with
            ``str()`` before tokenizing, so non-string cells are accepted.

    Yields:
        The list of tokens returned by ``gensim.utils.simple_preprocess``
        for each document, in input order.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already dropped by simple_preprocess's tokenizer.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
# Tokenize every review once; the token lists feed the dictionary and corpus.
# NOTE(review): like the lines above, everything here runs at import time and
# is therefore repeated in each spawned worker process.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
# Train phrase (collocation) detectors on the tokenized reviews.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
# NOTE(review): bigram_mod / trigram_mod are never applied in the visible
# script; the dictionary below is built from the plain tokens.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Token <-> integer id mapping built from the tokenized reviews.
id2word = corpora.Dictionary(data_words)
texts =data_words
# Bag-of-words representation of every document.
corpus = [id2word.doc2bow(text) for text in texts]
def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: gensim ``Dictionary`` used both as the model's
            ``id2word`` mapping and by the coherence computation.
        texts: Tokenized documents used to estimate coherence.
        k: Number of topics.
        a: Value passed to the model as ``alpha``.
        b: Value passed to the model as ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    # Use the arguments that were passed in rather than the module-level
    # `data_words` / `id2word` globals, so the function honours its signature.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model_lda.get_coherence()
# NOTE(review): `grid` is not read anywhere in the visible script.
grid = {}
grid['Validation_Set'] = {}
# Topics range
# range() excludes max_topics, so the topic counts tried are 4 and 5.
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
# Candidate values later passed to the model as `alpha`.
alpha = list(np.arange(0.01, 0.02, 0.01))
# Beta parameter
# Candidate values later passed to the model as `eta`.
beta = list(np.arange(0.01, 0.1, 0.05))
# Validation sets
# Only the full corpus is active; the clipped variants are commented out.
num_of_docs = len(corpus)
corpus_sets = [ # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
corpus]
# One title per entry of corpus_sets (indexed in parallel by the main loop).
corpus_title = ['100% Corpus']
# Column-oriented accumulator; one value is appended to each list per run.
model_results = {'Validation_Set': [],
'Topics': [],
'Alpha': [],
'Beta': [],
'Coherence': []
}
# Can take a long time to run
# Grid search over (corpus, topics, alpha, beta); results are written to CSV.
if __name__ == '__main__':
    # Size the progress bar from the grid instead of hard-coding it.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
    pbar = tqdm.tqdm(total=total_runs)
    # iterate through validation corpuses
    for i, corpus_set in enumerate(corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # The tokenized reviews do not depend on the grid
                    # parameters, so the module-level `data_words` is reused
                    # instead of re-tokenizing on every iteration.
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                  texts=data_words, k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
    pbar.close()
# NOTE(review): this print is outside the guard, so it also runs in every
# process that re-imports this file (see the repeated "end" lines in the
# PyCharm output). Indent it under the guard to print it only once.
print('end')
pycharm的结果:
start
0%| | 0/4 [00:00<?, ?it/s]start
end
start
end
start
end
start
end
start
end
start
end
25%|██▌ | 1/4 [00:19<00:59, 19.72s/it]start
end
start
end
start
end
start
end
start
end
start
end
50%|█████ | 2/4 [00:39<00:39, 19.87s/it]start
end
start
end
start
end
start
end
start
end
start
end
75%|███████▌ | 3/4 [00:58<00:19, 19.27s/it]start
end
start
end
start
end
start
end
start
end
start
end
100%|██████████| 4/4 [01:14<00:00, 18.72s/it]
end
Process finished with exit code 0
jupyter的结果:
0%| | 0/4 [00:00<?, ?it/s]
star
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00, 4.95s/it]
end
可以看到pycharm每次都会从第一行代码开始运行,而jupyter就只会循环需要的部分。
不懂为什么pycharm和jupyter运行的结果不一样,我希望让pycharm也能运行像jupyter一样的结果,希望可以帮我解答一下
然后我自己实验了,只要把关于计算Coherence的代码删掉,pycharm就能正常运行,比如这样:
# Runs at import time (module top level, outside the __main__ guard).
print('start')
# Load the review table; the script reads its 'ConsumerReviews' column later.
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Args:
        sentences: Iterable of raw documents. Each item is converted with
            ``str()`` before tokenizing, so non-string cells are accepted.

    Yields:
        The list of tokens returned by ``gensim.utils.simple_preprocess``
        for each document, in input order.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already dropped by simple_preprocess's tokenizer.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
# Tokenize every review once; the token lists feed the dictionary and corpus.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
# Train phrase (collocation) detectors on the tokenized reviews.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
# NOTE(review): bigram_mod / trigram_mod are never applied in the visible
# script; the dictionary below is built from the plain tokens.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Token <-> integer id mapping built from the tokenized reviews.
id2word = corpora.Dictionary(data_words)
texts =data_words
# Bag-of-words representation of every document.
corpus = [id2word.doc2bow(text) for text in texts]
def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: gensim ``Dictionary`` used both as the model's
            ``id2word`` mapping and by the coherence computation.
        texts: Tokenized documents used to estimate coherence.
        k: Number of topics.
        a: Value passed to the model as ``alpha``.
        b: Value passed to the model as ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    # Use the arguments that were passed in rather than the module-level
    # `data_words` / `id2word` globals, so the function honours its signature.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model_lda.get_coherence()
# NOTE(review): `grid` is not read anywhere in the visible script.
grid = {}
grid['Validation_Set'] = {}
# Topics range
# range() excludes max_topics, so the topic counts tried are 4 and 5.
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 0.02, 0.01))
# Beta parameter
beta = list(np.arange(0.01, 0.1, 0.05))
# Validation sets
# Only the full corpus is active; the clipped variants are commented out.
num_of_docs = len(corpus)
corpus_sets = [ # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
corpus]
# One title per entry of corpus_sets (indexed in parallel by the main loop).
corpus_title = ['100% Corpus']
# Column-oriented accumulator; the 'Coherence' column is disabled in this
# variant of the script because the coherence computation is commented out.
model_results = {'Validation_Set': [],
'Topics': [],
'Alpha': [],
'Beta': [],
#'Coherence': []
}
# Can take a long time to run
# Same grid loop as the full script, with the coherence computation disabled.
if __name__ == '__main__':
    # Size the progress bar from the grid instead of hard-coding it.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
    pbar = tqdm.tqdm(total=total_runs)
    # iterate through validation corpuses
    for i, _corpus_set in enumerate(corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # Coherence computation intentionally disabled for this
                    # experiment; the module-level `data_words` would be its
                    # `texts` argument, so nothing is re-tokenized here.
                    #cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
                    #                              k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    #model_results['Coherence'].append(cv)
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
    pbar.close()
# Outside the guard: executed by every process that imports this file.
print('end')
修改了下代码顺序,你试下这样可以不
另外
你上边是所有代码吗?本地跑了下会报错 KeyError: 'ConsumerReviews',
方便说下你的 Python 版本和其他库的版本吗?
import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Args:
        sentences: Iterable of raw documents. Each item is converted with
            ``str()`` before tokenizing, so non-string cells are accepted.

    Yields:
        The list of tokens returned by ``gensim.utils.simple_preprocess``
        for each document, in input order.
    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is
        # already dropped by simple_preprocess's tokenizer.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus the model is trained on.
        dictionary: gensim ``Dictionary`` used both as the model's
            ``id2word`` mapping and by the coherence computation.
        texts: Tokenized documents used to estimate coherence.
        k: Number of topics.
        a: Value passed to the model as ``alpha``.
        b: Value passed to the model as ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    # Use the arguments that were passed in rather than the `data_words` /
    # `id2word` globals, which only exist once the __main__ block has run.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model_lda.get_coherence()
# Can take a long time to run
# All real work is kept under the __main__ guard. gensim's LdaMulticore and
# CoherenceModel start worker processes; with the "spawn" start method (the
# Windows default) each worker re-imports this file, and anything left at
# module top level would run again in every worker. Guarded code is skipped
# on re-import, so 'start' / 'end' are printed exactly once.
if __name__ == '__main__':
    print('start')
    TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
    # Tokenize every review once; reused for the dictionary, the corpus and
    # the coherence computation.
    data_words = list(sent_to_words(TEXT['ConsumerReviews']))
    bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    # Faster way to get a sentence clubbed as a trigram/bigram
    # NOTE(review): bigram_mod / trigram_mod are never applied below; the
    # dictionary is built from the plain tokens.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    id2word = corpora.Dictionary(data_words)
    texts = data_words
    corpus = [id2word.doc2bow(text) for text in texts]

    # NOTE(review): `grid` is not read anywhere in the visible script.
    grid = {}
    grid['Validation_Set'] = {}
    # Topics range (range() excludes max_topics, so 4 and 5 are tried)
    min_topics = 4
    max_topics = 6
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)
    # Alpha parameter
    alpha = list(np.arange(0.01, 0.02, 0.01))
    # Beta parameter
    beta = list(np.arange(0.01, 0.1, 0.05))
    # Validation sets
    num_of_docs = len(corpus)
    corpus_sets = [
        # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
        # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
        # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
        corpus,
    ]
    corpus_title = ['100% Corpus']
    model_results = {
        'Validation_Set': [],
        'Topics': [],
        'Alpha': [],
        'Beta': [],
        'Coherence': [],
    }

    # Size the progress bar from the grid instead of hard-coding it.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)
    pbar = tqdm.tqdm(total=total_runs)
    # iterate through validation corpuses
    for i, corpus_set in enumerate(corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters;
                    # `data_words` is loop-invariant, so it is not rebuilt here
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                  texts=data_words, k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
    pbar.close()
    print('end')
TXT文本是这样的
'''
FilePath: D:/0527/reviews.TXT
Text OR
0 Room was comfortable and run down some, but it... 2
1 We took our dog to visit her favorite girl awa... 5
2 This is the cleanest Super 8 I've ever stayed ... 5
3 Stayed here for one night while visiting OU fo... 3
4 We attended a ceremony for our daughter at OU ... 4
.. ... ..
928 So the room looks fine. One of the first rooms... 3
929 We try to come once a month or so to visit my ... 1
930 This was a great place to stop for a quick ove... 5
931 This place didn't turn out to be as bad as we ... 3
932 Air conditioner didn't work right and I had to... 2
[933 rows x 2 columns]
'''
能多次执行 start 确实奇怪。你代码分的这几段也挺奇怪的。你把内容都放在 if __name__ == '__main__':
试试呢
把开始的 print('start') 和结束的 print('end'),放到 if __name__ == '__main__': 里面试试
您好,我是有问必答小助手,您的问题已经有小伙伴帮您解答,感谢您对有问必答的支持与关注!