利用jieba提取高频词汇,但是提示IndexError: list index out of range,尝试解决未果,寻求大佬帮助
以下是我运行的代码
import glob
import random
import jieba
# 读取文章的函数
def get_content(content_path):
with open(content_path, 'r', encoding="gbk", errors="ignore") as f:
content = ''
for i in f:
i = i.strip()
content += i
return content
# 提取topK个高频词的函数
# TF:计算某个词在文章中出现的总次数
def get_TF(k,words):
tf_dic = {}
for i in words:
tf_dic[i] = tf_dic.get(i, 0)+1
return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:k]
# 去掉停用词(包括标点)
def stop_words(path):
with open(path, encoding='UTF-8') as f:
return [l.strip() for l in f]
# 主函数
if __name__ == "__main__":
files = glob.glob("C:/Users/Dell/Desktop/十七岁.txt")
corpus = [get_content(x) for x in files]
sample_inx = random.randint(0, len(corpus))
split_words = [x for x in jieba.cut(corpus[sample_inx]) if x not in stop_words("D:/PY/work/ml02/baidu_stopwords.utf8")]
print("样本的topk10词为:" + str(get_TF(10, split_words)))
IndexError: list index out of range