我现在在写分词程序:对用户评论数据进行分词,统计出评论中词频最高的 100 个词。在实现时,从抓取评论到分词统计中间的过渡环节我不太明白,思路比较乱。
代码
import requests
import json

# Request headers are the same for every page, so they are built once,
# outside the paging loop.
# NOTE(review): the cookie is a captured browser session (it carries a
# csrfToken); presumably it expires and has to be refreshed -- confirm before
# relying on it.
headers = {
    "accept": "application/json, text/plain, */*",
    # The cookie is one logical string; it is split into two adjacent literals
    # only to keep it a valid Python string (a raw line break inside the
    # quotes is a syntax error).
    "cookie": (
        "QN1=00008500306c4420a0b8b0f8; QN300=s%3Dbaidu; QN99=408; QunarGlobal=10.66.75.142_d32936_181f2d17d57_506b|1657638423568; QN205=s%3Dbaidu; QN277=s%3Dbaidu; QN601=c43d1ade59f094ae25a5102fec04f930; _i=DFiEZn84Y4OwqWHwtM4R6zxXF-Lw; QN48=tc_5e42b7a972d174fa_181f2f30d77_dcf7; QN269=49BA567301F411ED9EB4FA163E03BB9E; fid=4875b795-ec79-4862-a949-c79bf2651426; HN1=v18dfb735ba8db1fca5d7db8aa2880d3d8; HN2=quruklugszszu; ctt_june=1654604625968##iK3waSXwWhPwawPwasj8VKasWs3maS2%2BaSGRVK0haDWGVRX%2BVDaOXstOaKvAiK3siK3saKjAWsjsVRX%3DWs3AaUPwaUvt; qunar-assist={%22version%22:%2220211215173359.925%22%2C%22show%22:false%2C%22audio%22:false%2C%22speed%22:%22middle%22%2C%22zomm%22:1%2C%22cursor%22:false%2C%22pointer%22:false%2C%22bigtext%22:false%2C%22overead%22:false%2C%22readscreen%22:false%2C%22theme%22:%22default%22}; csrfToken=1vxgc5lKfN6GjdxP8uuh82i6Q7VR8fC9; QN163=0; checkInDate=2022-07-19; cityName=%25E6%25B1%25A0%25E5%25B7%259E; cityUrl=chizhou; tabIndex=0; checkOutDate=2022-07-20; _vi=BW2J654JRUGW-HlikPSP4xPKPtfsddFwcOhZWGaKcGysFXv0d4vDTJfHQsDd8oUB9rrbezAw0Ww_3f0NTUqySsYu6ZPOV_FG2CcLWbfgSKssAjSdfUldAmXWP18s9AruzecNndZGfYkX5xkEknSJEl86W7tJe4YStvL6bEl_2KAw; QN271=d2945a34-aef7-468a-9d5e-5ae15d0e51c3; QN267=03930745825af8f1b9; ariaDefaultTheme=undefined; "
        "__qt=v1%7CVTJGc2RHVmtYMStTWU4vRG5sTzEvdlBnV0ZLNGhNeWg0Wjd0VDlTQU9zR05OL1JMTU5DV1BySEwvQzBCbFFJcGcyMmg1VVBFOTB2QktMOCtWVHVIdE1ObGo5S1VJMVR1cklnMlg0dTVpRnVYWDQ2SHByeE5FYllIVk56dDU3RzBDdjUxNWRQd0xDRHhpb0s1N3ptaDBJOXN3WDlZNDBQaWtpY0VQNnhBQzdFPQ%3D%3D%7C1657943601609%7CVTJGc2RHVmtYMTh5ZUFjd2tMYkpFRWVNTmlwMWtBWWpMQXR4c2NCR2VreFRDak5qeXlHVGUrbVlZV3hzbDNpOUtQVklYUkY4ZjlXcG5zd2toZlJhZEE9PQ%3D%3D%7CVTJGc2RHVmtYMS9KVWN2RG9SMzluU2JYbnMvSGcrOExYMi9QU0gxbUF2RWNLVlZTTEp5RCtWeWNzSnlzRGU1NndMdHVuNnMxYktRQlE0L0kzaS9YQjhIVk5DOEUrZkYyS1VjNzVZcEtZTk5lZTZvcXFHSEJSdmFwOERTSERsa0xSelU4SUQ4Yi92U2VYQXQ4N3hGVUxCL2IxaTBEaDNEUTA0czYzVi9nalhWbW5BM2h1VXQwNlQyU0ROQzYwdGpycFBhZ2Q2eDJBZXNWQzV1c2k5WG5Pd3g2UVduZVVHSzNMT0k4MittaEV6dWIwMFBNQTM1SkROeUFUV2JZVGpjN2dQaG9OSFRaWTl0OGY3dzBDU01OZzQxMkdvdzhEa0Z3TFlIL0JiNGpOZ2YrYmdCNzlkSnBycm4zNkFRZ0p1YzJ6aE1GZkZiVFo1cTlHamIwekZvRWJJSm1aM1MrY1Jab0txdEtsd0MrOXVXU0hQMjJmQ2d2c0xqSTNyTWRDZEdrazFDQlFZOXhnd3ZIc2wxNVNpN0JsQVdCWEExSitxcERnVUJzejlwSSsvbFA1V08rZFhVYVVjWVdVckQxVGxEbURaclBpOUFEVHV3OENYRGdDYTFuZFJnZW9PN3ROM2l1c05kOWJhTG5uV1lEVWRsTG94d0RKTW5Nb2hvdTBCUDRTeUk0Z3ZYOFBBYVMwMURrZlBQbURRN1duUURHUFFqNkdjRjFKcE9nKzRrWEg0T2FOVkp4Mmt6UGV0RzVmT1VjSjVwSlpxWVV6cWN1UHJneWJzRWZ4L0lrWjJvLzlhV29JZUtCQzJiL0diZ3lQTk5EdFdWd3RyOElHM3pTVnZ5SU5PN2VsRk9ybE5McGdmVDFLVDNwa0Q1UmtiN2FxNkVtUGxwdWVaNUpiM3M9; ctf_june=1654604625968##iK3wWRX8WUPwawPwa%3D3OXKtsWDWDWs3nVRoDVKXNasX%2BXsGDWRvOaDD%2BER0IiK3siK3saKjAWs2%3Dasj8aKj%2BawPwaUvt; cs_june=823fa8d5f718d20bdc3ac14bf6f1a8c5f6089326481932e48da57ed6f9b3045da22d7f720fbec48e82e0fce181cd50e2855e69e24902d919dec81154fbfdabe2b17c80df7eee7c02a9c1a6a5b97c11799ceefc8ad37fbbbf057c7b9d6858b77c5a737ae180251ef5be23400b098dd8ca"
    ),
    "referer": "https://hotel.qunar.com/cn/chizhou/dt-5402/?fromDate=2022-07-19&toDate=2022-07-20&highQuality=false&bellaId=57iYI5FAfGbp0unR0hGZ_",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}

# Every comment text fetched, in page order. Collecting them (instead of only
# printing) gives later steps something to work on, e.g. writing
# "\n".join(comments) to a text file for word counting.
comments = []

# Pages 1..11 of the comment list endpoint.
for i in range(1, 12):
    url = f"https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=chizhou_5402&page={i}&onlyGuru=false&rate=all&sort=hot"
    # A timeout keeps the script from hanging forever on a stalled connection.
    a = requests.get(url, headers=headers, timeout=10).json()
    # Walk whatever the page actually returned; indexing a fixed 0..9 raises
    # IndexError on a page holding fewer than ten comments.
    for item in a['data']['list']:
        b = item['content']
        # 'content' is parsed as JSON text. json.loads understands the JSON
        # literals true/false/null natively, so no dummy `true`/`false`
        # variables are needed, and -- unlike eval() -- it cannot execute code
        # that arrives from the network.
        c = json.loads(b)
        d = c['feedContent']
        comments.append(d)
        print(d)
import jieba
def cut_word(file_path, word_path='word_count.txt', top_n=20, encoding=None):
    """Segment a text file with jieba and save its most frequent words.

    Single-character tokens and a fixed list of domain words are ignored.
    The remaining words are ranked by frequency; the top ``top_n`` are
    printed and written to ``word_path`` as ``word<TAB>count`` lines.

    Args:
        file_path: Path of the text file to analyse.
        word_path: Path of the output file (written with GBK encoding).
        top_n: How many of the most frequent words to report, e.g. 100.
            If fewer distinct words exist, all of them are reported.
        encoding: Encoding used to read ``file_path``. ``None`` keeps the
            platform default; pass ``'utf-8'`` for UTF-8 files.

    Returns:
        ``word_path``, the path of the file that was written.
    """
    from collections import Counter

    # The context manager closes the input file even if reading fails.
    with open(file_path, 'r', encoding=encoding) as file:
        txt = file.read()

    # A set gives O(1) membership tests while filtering.
    exclude = {"酒店", "民宿", "服务", "九华山", "店家"}
    count = Counter(
        word
        for word in jieba.lcut(txt)
        if len(word) != 1 and word not in exclude
    )

    # most_common() sorts by descending frequency and simply returns fewer
    # entries when the text has fewer than top_n distinct words, so no
    # IndexError can occur on short inputs.
    top_words = count.most_common(top_n)

    for word, number in top_words:
        print("关键字:{:-<5}频次:{}".format(word, number))

    with open(word_path, 'w', encoding='gbk') as f:
        for word, number in top_words:
            f.write('{}\t{}\n'.format(word, number))
    return word_path
def draw_wordcloud(text, image_mask, max_words=1000,
                   font_path='C:\\Windows\\MSYH(1)',
                   output_path="wordcloud_jiajia1111.jpg"):
    """Render a mask-shaped word cloud, save it, and show it beside the mask.

    Relies on ``np``, ``Image``, ``WordCloud``, ``ImageColorGenerator`` and
    ``plt`` being available at module level; none of them is defined inside
    this function.

    Args:
        text: Text handed to ``WordCloud.generate``.
        image_mask: Path of the image used both as the cloud's shape mask
            and as the colour source for the displayed cloud.
        max_words: Upper bound on the number of words drawn (use 100 for a
            top-100 cloud).
        font_path: Font file used to draw the words.
        output_path: Where the generated cloud image is saved.
    """
    # NOTE(review): the default font_path has no file extension -- verify
    # that it points at an existing font file on the target machine.
    sanguo_mask = np.array(Image.open(image_mask))
    # The closing parenthesis of this call was missing, which made the next
    # statement fail with "SyntaxError: invalid syntax".
    cloud = WordCloud(background_color='white', mask=sanguo_mask,
                      max_words=max_words, font_path=font_path)
    cloud.generate(text)
    image_colors = ImageColorGenerator(sanguo_mask)
    # Saved before recolor() is applied below, so the file on disk keeps the
    # generator's own colours; only the on-screen figure is recoloured.
    cloud.to_file(output_path)

    plt.figure(figsize=(14, 8))
    # Left panel: the cloud recoloured from the mask image.
    plt.subplot(1, 2, 1)
    plt.imshow(cloud.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis('off')
    # Right panel: the mask image itself, for comparison.
    plt.subplot(1, 2, 2)
    plt.imshow(sanguo_mask, interpolation='bilinear')
    plt.axis('off')
    plt.show()
# Driver: read the prepared text, preprocess it, and draw the word cloud.
# NOTE(review): split_four_text is not defined in the visible part of this
# file -- confirm it exists and what it expects. It receives the open file
# object here, exactly as before.
# The context manager closes text2.txt once preprocessing has finished.
with open('text2.txt', encoding='UTF-8') as text_content:
    str1 = split_four_text(words=text_content)
draw_wordcloud(text=str(str1), image_mask='jiajia1111.jpg')
C:\Users\jiajia530\pythonProject5\venv\Scripts\python.exe C:/Users/jiajia530/pythonProject5/main.py
File "C:/Users/jiajia530/pythonProject5/main.py", line 57
wordcloud.generate(text)
^
SyntaxError: invalid syntax
进程已结束,退出代码1
我在不停地改路径和重写代码,有点乱。
成功做出前100词的词云
我有一段代码可以满足你的需求:词频统计可以用 collections 库;前面两个函数用于机械压缩去词;分词后不需要的词,可以根据自己的需求修改。
这是我的代码,想修改为100词的词云,我的好像只能生成一种图片
import jieba
import requests
import wordcloud
import json

# Keep jieba's start-up messages at INFO level.
jieba.setLogLevel(jieba.logging.INFO)

# Request headers are the same for every page, so they are built once,
# outside the paging loop.
# NOTE(review): the cookie is a captured browser session (it carries a
# csrfToken); presumably it expires and has to be refreshed -- confirm before
# relying on it.
headers = {
    "accept": "application/json, text/plain, */*",
    # The cookie is one logical string; it is split into two adjacent literals
    # only to keep it a valid Python string (a raw line break inside the
    # quotes is a syntax error).
    "cookie": (
        "QN1=00008500306c4420a0b8b0f8; QN300=s%3Dbaidu; QN99=408; QunarGlobal=10.66.75.142_d32936_181f2d17d57_506b|1657638423568; QN205=s%3Dbaidu; QN277=s%3Dbaidu; QN601=c43d1ade59f094ae25a5102fec04f930; _i=DFiEZn84Y4OwqWHwtM4R6zxXF-Lw; QN48=tc_5e42b7a972d174fa_181f2f30d77_dcf7; QN269=49BA567301F411ED9EB4FA163E03BB9E; fid=4875b795-ec79-4862-a949-c79bf2651426; HN1=v18dfb735ba8db1fca5d7db8aa2880d3d8; HN2=quruklugszszu; ctt_june=1654604625968##iK3waSXwWhPwawPwasj8VKasWs3maS2%2BaSGRVK0haDWGVRX%2BVDaOXstOaKvAiK3siK3saKjAWsjsVRX%3DWs3AaUPwaUvt; qunar-assist={%22version%22:%2220211215173359.925%22%2C%22show%22:false%2C%22audio%22:false%2C%22speed%22:%22middle%22%2C%22zomm%22:1%2C%22cursor%22:false%2C%22pointer%22:false%2C%22bigtext%22:false%2C%22overead%22:false%2C%22readscreen%22:false%2C%22theme%22:%22default%22}; csrfToken=1vxgc5lKfN6GjdxP8uuh82i6Q7VR8fC9; QN163=0; checkInDate=2022-07-19; cityName=%25E6%25B1%25A0%25E5%25B7%259E; cityUrl=chizhou; tabIndex=0; checkOutDate=2022-07-20; _vi=BW2J654JRUGW-HlikPSP4xPKPtfsddFwcOhZWGaKcGysFXv0d4vDTJfHQsDd8oUB9rrbezAw0Ww_3f0NTUqySsYu6ZPOV_FG2CcLWbfgSKssAjSdfUldAmXWP18s9AruzecNndZGfYkX5xkEknSJEl86W7tJe4YStvL6bEl_2KAw; QN271=d2945a34-aef7-468a-9d5e-5ae15d0e51c3; QN267=03930745825af8f1b9; ariaDefaultTheme=undefined; "
        "__qt=v1%7CVTJGc2RHVmtYMStTWU4vRG5sTzEvdlBnV0ZLNGhNeWg0Wjd0VDlTQU9zR05OL1JMTU5DV1BySEwvQzBCbFFJcGcyMmg1VVBFOTB2QktMOCtWVHVIdE1ObGo5S1VJMVR1cklnMlg0dTVpRnVYWDQ2SHByeE5FYllIVk56dDU3RzBDdjUxNWRQd0xDRHhpb0s1N3ptaDBJOXN3WDlZNDBQaWtpY0VQNnhBQzdFPQ%3D%3D%7C1657943601609%7CVTJGc2RHVmtYMTh5ZUFjd2tMYkpFRWVNTmlwMWtBWWpMQXR4c2NCR2VreFRDak5qeXlHVGUrbVlZV3hzbDNpOUtQVklYUkY4ZjlXcG5zd2toZlJhZEE9PQ%3D%3D%7CVTJGc2RHVmtYMS9KVWN2RG9SMzluU2JYbnMvSGcrOExYMi9QU0gxbUF2RWNLVlZTTEp5RCtWeWNzSnlzRGU1NndMdHVuNnMxYktRQlE0L0kzaS9YQjhIVk5DOEUrZkYyS1VjNzVZcEtZTk5lZTZvcXFHSEJSdmFwOERTSERsa0xSelU4SUQ4Yi92U2VYQXQ4N3hGVUxCL2IxaTBEaDNEUTA0czYzVi9nalhWbW5BM2h1VXQwNlQyU0ROQzYwdGpycFBhZ2Q2eDJBZXNWQzV1c2k5WG5Pd3g2UVduZVVHSzNMT0k4MittaEV6dWIwMFBNQTM1SkROeUFUV2JZVGpjN2dQaG9OSFRaWTl0OGY3dzBDU01OZzQxMkdvdzhEa0Z3TFlIL0JiNGpOZ2YrYmdCNzlkSnBycm4zNkFRZ0p1YzJ6aE1GZkZiVFo1cTlHamIwekZvRWJJSm1aM1MrY1Jab0txdEtsd0MrOXVXU0hQMjJmQ2d2c0xqSTNyTWRDZEdrazFDQlFZOXhnd3ZIc2wxNVNpN0JsQVdCWEExSitxcERnVUJzejlwSSsvbFA1V08rZFhVYVVjWVdVckQxVGxEbURaclBpOUFEVHV3OENYRGdDYTFuZFJnZW9PN3ROM2l1c05kOWJhTG5uV1lEVWRsTG94d0RKTW5Nb2hvdTBCUDRTeUk0Z3ZYOFBBYVMwMURrZlBQbURRN1duUURHUFFqNkdjRjFKcE9nKzRrWEg0T2FOVkp4Mmt6UGV0RzVmT1VjSjVwSlpxWVV6cWN1UHJneWJzRWZ4L0lrWjJvLzlhV29JZUtCQzJiL0diZ3lQTk5EdFdWd3RyOElHM3pTVnZ5SU5PN2VsRk9ybE5McGdmVDFLVDNwa0Q1UmtiN2FxNkVtUGxwdWVaNUpiM3M9; ctf_june=1654604625968##iK3wWRX8WUPwawPwa%3D3OXKtsWDWDWs3nVRoDVKXNasX%2BXsGDWRvOaDD%2BER0IiK3siK3saKjAWs2%3Dasj8aKj%2BawPwaUvt; cs_june=823fa8d5f718d20bdc3ac14bf6f1a8c5f6089326481932e48da57ed6f9b3045da22d7f720fbec48e82e0fce181cd50e2855e69e24902d919dec81154fbfdabe2b17c80df7eee7c02a9c1a6a5b97c11799ceefc8ad37fbbbf057c7b9d6858b77c5a737ae180251ef5be23400b098dd8ca"
    ),
    "referer": "https://hotel.qunar.com/cn/chizhou/dt-5402/?fromDate=2022-07-19&toDate=2022-07-20&highQuality=false&bellaId=57iYI5FAfGbp0unR0hGZ_",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}

# One space-separated, jieba-segmented string per comment.
segmented = []

# Pages 1..11 of the comment list endpoint.
for i in range(1, 12):
    url = f"https://hotel.qunar.com/napi/ugcCmtList?hotelSeq=chizhou_5402&page={i}&onlyGuru=false&rate=all&sort=hot"
    # A timeout keeps the script from hanging forever on a stalled connection.
    a = requests.get(url, headers=headers, timeout=10).json()
    # Walk whatever the page actually returned; indexing a fixed 0..9 raises
    # IndexError on a page holding fewer than ten comments.
    for item in a['data']['list']:
        b = item['content']
        # json.loads understands true/false/null natively and, unlike eval(),
        # cannot execute code that arrives from the network.
        c = json.loads(b)
        d = c['feedContent']
        print(d)
        # Segment this comment and keep it. The accumulator is never reset
        # inside the loop, so every comment contributes to the final cloud.
        segmented.append(' '.join(jieba.cut(d)))

# Whole corpus as one space-separated string of words for WordCloud.generate.
g = ' '.join(segmented)
print(g)

# `mask` was referenced without ever being defined (NameError). None means no
# mask is applied.
# NOTE(review): to shape the cloud, load an image into an array and assign it
# here; contour_width presumably only matters when a mask is set -- confirm.
mask = None

wc = wordcloud.WordCloud(background_color="white",
                         font_path="msyh.ttc",
                         max_words=100,  # keep only the 100 most frequent words
                         max_font_size=200,
                         width=800,
                         height=600,
                         mask=mask,
                         contour_width=4,
                         stopwords={"酒店","用户已","!","非常","了","已","很"})
wc.generate(g)
wc.to_file(r'D:\python work\6.jpg')