代码没有把中文词语切分出来,我不知道怎么用 jieba 来完成中文分词:
import re,openpyxl,pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
from collections import Counter

# Matches a token made up entirely of characters in the \u4e00-\u9fa5 range.
CHINESE_WORD = re.compile(r'[\u4e00-\u9fa5]+')


def top_words(tokens, limit=40, min_len=2):
    """Return the most frequent Chinese words among *tokens* as a DataFrame.

    Args:
        tokens: Iterable of already-segmented strings (e.g. jieba output).
        limit: Maximum number of rows to keep.
        min_len: Minimum token length; the default of 2 drops single
            characters, which would otherwise crowd out real words.

    Returns:
        A DataFrame indexed by '词语' with one '词频' column, sorted by
        descending frequency. Empty when no token qualifies.
    """
    counts = Counter(
        tok for tok in tokens
        # fullmatch rejects punctuation, digits, Latin text and whitespace.
        if len(tok) >= min_len and CHINESE_WORD.fullmatch(tok)
    )
    table = pd.DataFrame(counts.most_common(limit), columns=['词语', '词频'])
    return table.set_index('词语')


def main():
    """Count words in the report, save the top 40 to Excel and plot them."""
    # Select a font that can render the Chinese tick labels.
    plt.rcParams['font.family'] = 'STXiHei'

    with open(r'D:\政府工作报告.txt', 'r', encoding='utf-8') as f:
        txt = f.read()

    # Default (precise) mode yields one segmentation of the text; full mode
    # (cut_all=True) emits overlapping candidates and inflates the counts.
    # The tokens are counted directly -- re-running a regex over the raw text
    # here would discard the segmentation and count whole character runs.
    df = top_words(jieba.cut(txt))
    if df.empty:
        raise SystemExit('no Chinese words found in the input text')

    # Write the spreadsheet first: plt.show() blocks until the window closes.
    df.to_excel(r'D:\result3.xlsx')

    # A categorical y with a numeric x gives a horizontal bar chart.
    sns.barplot(x=df['词频'], y=df.index)
    plt.show()


if __name__ == '__main__':
    main()
目的:统计政府工作报告中出现频率最高的40个词,绘制横向条形图,并将结果存入Excel文件中
可参考:https://blog.csdn.net/September_nine/article/details/107478291
改成类似这样试试?
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
import re
from collections import Counter

# Matches a token made up entirely of characters in the \u4e00-\u9fa5 range.
CHINESE_WORD = re.compile(r'[\u4e00-\u9fa5]+')


def top_words(tokens, limit=40, min_len=2):
    """Return the most frequent Chinese words among *tokens* as a DataFrame.

    Args:
        tokens: Iterable of already-segmented strings (e.g. jieba output).
        limit: Maximum number of rows to keep.
        min_len: Minimum token length; the default of 2 drops single
            characters, which would otherwise crowd out real words.

    Returns:
        A DataFrame indexed by '词语' with one '词频' column, sorted by
        descending frequency. Empty when no token qualifies.
    """
    counts = Counter(
        tok for tok in tokens
        # fullmatch rejects punctuation, digits, Latin text and whitespace,
        # all of which jieba passes through as tokens of their own.
        if len(tok) >= min_len and CHINESE_WORD.fullmatch(tok)
    )
    table = pd.DataFrame(counts.most_common(limit), columns=['词语', '词频'])
    return table.set_index('词语')


def main():
    """Count words in the text, save the top 40 to Excel and plot them."""
    # Use SimHei for sans-serif text so the Chinese labels are displayed.
    plt.rcParams['font.sans-serif'] = ['SimHei']

    with open(r'sanguo.txt', 'r', encoding='utf-8') as f:
        txt = f.read()

    # Default (precise) mode yields one segmentation of the text; full mode
    # (cut_all=True) emits overlapping candidates and inflates the counts.
    df = top_words(jieba.cut(txt))
    if df.empty:
        raise SystemExit('no Chinese words found in the input text')

    # Write the spreadsheet first: plt.show() blocks until the window closes.
    df.to_excel(r'result4.xlsx')

    # A categorical y with a numeric x gives a horizontal bar chart.
    sns.barplot(x=df['词频'], y=df.index)
    plt.show()


if __name__ == '__main__':
    main()