import jieba
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)
import re
stop_file = "stopwordlist.txt"
try:
    stopword_list = open(stop_file, encoding='utf-8')
except:
    stopword_list = []
    print("error in stop_file")
stop_list = []
for line in stopword_list:
    line = re.sub(u'\n|\\r', '', line)
    stop_list.append(line)
f=open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt","r",encoding='utf-8')
txt=f.read()
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x:x[1], reverse=True)
items
I want to write code that first removes stopwords, then segments the text and counts word frequencies. However, the frequency counts this code produces still include the stopwords. How should I modify it?
Based on GPT and my own thinking: your code already reads the stopword file and builds the list stop_list successfully, so you can add a check while counting so that any word found in the stopword list is excluded from the count. Here is the modified code:
import jieba
import re
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)
stop_file = "stopwordlist.txt"
try:
    stopword_list = open(stop_file, encoding='utf-8')
except:
    stopword_list = []
    print("error in stop_file")
stop_list = []
for line in stopword_list:
    line = re.sub(u'\n|\r', '', line)
    stop_list.append(line)
f = open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt", "r", encoding='utf-8')
txt = f.read()
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1 or word in stop_list:  # skip if the word has length 1 or appears in the stopword list
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
print(items)  # print the word-frequency results
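A small optional refinement, not part of the original answer: if the stopword list is long, turning it into a set makes each membership test O(1) instead of O(n). A minimal sketch, assuming stop_list and words are built as above:
stop_set = set(stop_list)  # set lookup is much faster than list lookup
counts = {}
for word in words:
    if len(word) == 1 or word in stop_set:
        continue
    counts[word] = counts.get(word, 0) + 1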
When counting word frequencies, simply check whether the word is in the stopword list and skip it if so:
for word in words:
    if len(word) == 1 or word in stop_list:
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
Written by “Devil组” with reference to GPT:
Modified as follows:
import jieba
import re
# Load the user-defined dictionary
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)
# Load the stopword list
stop_file = "stopwordlist.txt"
try:
    stopword_list = open(stop_file, encoding='utf-8')
    stop_list = [line.strip() for line in stopword_list]  # read the stopword file into a list
except:
    stop_list = []
    print("error in stop_file")
# Open the text file and read its contents
f = open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt", "r", encoding='utf-8')
txt = f.read()
# Segment the text and count word frequencies
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1 or word in stop_list:
        continue
    else:
        counts[word] = counts.get(word, 0) + 1
# Sort by frequency and print the results
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for item in items:
    print(item[0], item[1])
line.strip() reads each line of the stopword file without its trailing newline, and the continue statement then skips any word defined as a stopword, so stopwords are never counted.
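For illustration, strip() removes the trailing newline (and any surrounding whitespace), which is exactly what makes the membership test match. A toy check:
line = "的\n"                # a raw line as read from the stopword file
print(line == "的")          # False: the newline is still attached
print(line.strip() == "的")  # True: strip() removes it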
This answer references ChatGPT
After segmenting the text, you can add a conditional that checks whether each word is a stopword; stopwords are then excluded from the frequency count. Here is the modified code:
import jieba
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)
import re
stop_file = "stopwordlist.txt"
try:
    stopword_list = open(stop_file, encoding='utf-8')
except:
    stopword_list = []
    print("error in stop_file")
stop_list = []
for line in stopword_list:
    line = re.sub(u'\n|\\r', '', line)
    stop_list.append(line)
f = open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt", "r", encoding='utf-8')
txt = f.read()
words = jieba.lcut(txt)
counts = {}
for word in words:
    if len(word) == 1:
        continue
    if word in stop_list:
        continue
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x:x[1], reverse=True)
items
In the code above, the added statement if word in stop_list: continue checks whether a word is a stopword; if it is, the word is skipped and not counted.
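As a toy illustration of the two checks, with hypothetical data rather than the original corpus: the length test drops single-character tokens, the membership test drops stopwords.
words = ["自由", "贸易", "的", "贸易"]  # hypothetical token list
stop_list = ["贸易"]                    # hypothetical stopword
counts = {}
for word in words:
    if len(word) == 1:
        continue
    if word in stop_list:
        continue
    counts[word] = counts.get(word, 0) + 1
print(counts)  # {'自由': 1}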
This answer references GPTᴼᴾᴱᴺᴬᴵ
You can remove the stopwords from the token list before counting frequencies, for example:
import jieba
import re
# Load the user-defined dictionary
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)
# Load the stopword list
stop_file = "stopwordlist.txt"
try:
    stopword_list = open(stop_file, encoding='utf-8')
except:
    stopword_list = []
    print("error in stop_file")
stop_list = []
for line in stopword_list:
    line = re.sub(u'\n|\\r', '', line)
    stop_list.append(line)
# Open the text file and read its contents
with open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt", "r", encoding='utf-8') as f:
    txt = f.read()
# Segment with jieba and remove the stopwords
words = jieba.lcut(txt)
words = [word for word in words if word not in stop_list and len(word) > 1]
# Count word frequencies
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
# Sort the frequency results by count
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
# Print the results
print(items)
In this code, we build a list named stop_list that holds all the stopwords. A list comprehension then filters the token list words, keeping only tokens that are not stopwords and are longer than one character. Frequencies are counted the same way as before, and the results are printed.
Make sure each stopword in your stopword file sits on its own line; otherwise the regular expression in the code may fail to strip the newline characters correctly.
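As an aside, collections.Counter from the standard library expresses the same counting and sorting more compactly; a sketch, assuming words and stop_list are built as in the answers above:
from collections import Counter

# Keep only multi-character tokens that are not stopwords, then count.
filtered = [w for w in words if len(w) > 1 and w not in stop_list]
counts = Counter(filtered)
print(counts.most_common())  # (word, count) pairs, most frequent first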