用 Python jieba 库统计《三国演义》中出场次数最多的前 20 个人物

python嵩天p176 实例代码10.4
如何完善程序,排除无关词汇干扰,总结出场最多的20个人物?

img

如果手头有人物姓名列表,并且不担心全文字符串过长,那么只用 Python 的字符串方法 str.count() 也可以办到:定义一个出场次数统计字典或列表——字典用 字典[name] = 全文字符串.count(name);列表用 列表.append(f"{name}:{全文字符串.count(name)}"),但列表要考虑去重,可以用 if 判断或 set() 去重。😜😜 例如——

  • Python 代码
#!/sur/bin/nve python
# coding: utf-8
from re import findall
my_path = '/sdcard/Documents/'
names_text = open(f"{my_path}三国演义人物姓名列表.txt").read()
texts = open(f"{my_path}三国演义.txt").read()
names = []
name_sort = {}

names += findall(r'(\w+)、', names_text)
names += findall(r'(\w+)\(', names_text)
names += findall(r'字(\w+)、', names_text)
names += findall(r'字(\w+))', names_text)
names += findall(r'字(\w+)\)', names_text)
names += findall(r'([\u4E00-\u9FA5]+)\n', names_text)

for k,name in enumerate(names):
    print(' '*50, end='\r')
    print(f"{' '*(k%39)}finding ...", end='\r')
    name_sort[name] = texts.count(name)

print(' '*50, end='\r')
names = [(name, times) for name,times in name_sort.items()]
names.sort(key=lambda x: x[1], reverse=True)

for i in range(20):
    print(f"{names[i][0]:>18}{names[i][1]}")

print(len(names), len(name_sort), len(texts))

  • 代码运行效果截屏图片

    img

我的《三国演义》文本是文言文版。



import jieba
 
# Frequent words that are not character names; add more entries as needed.
# A frozenset gives constant-time membership tests inside the word loop.
EXCLUDED_WORDS = frozenset([
    '将军', '却说', '二人', '不可', '荆州', '如此', '不能',
    '商议', '如何', '主公', '军士', '左右', '军马', '引兵',
    '次日', '大喜', '天下', '于是', '东吴', '今日', '不敢',
    '魏兵', '人马', '不知', '汉中', '陛下', '一人', '众将',
    '只见', '蜀兵', '大叫', '上马', '此人', '后人', '城中',
    '背后', '一面', '先主', '太守', '大军', '何不', '然后',
    '忽报', '先生', '夫人', '不如', '先锋', "何故", '江东',
    '原来', '令人', '天子', '赶来', '徐州', '正是', '忽然',
    '下马', '因此', '大败', '未知', '百姓', '成都', '大事',
    '一军', '之后', '起兵', '喊声', '不见', '接应', '引军',
    '进兵', '军中', '大怒', '大惊', '可以', '谋反',
    '心中', '以为', '军民', '不得', '休走', '帐中', '可得',
])

# Alternative forms (titles, courtesy names, segmentation artefacts such as
# "孔明曰") that are counted under one canonical name.
ALIASES = {
    '曹操': ['丞相', '曹孟德', '孟德'],
    '孔明': ['孔明曰', '诸葛亮', '卧龙', '伏龙', '武乡侯', '忠武侯', '蜀相'],
    '刘备': ['玄德曰', '玄德', '刘豫州', '汉中王', '汉昭烈帝', '平原相',
             '汉室宗亲', '中山靖王之后', '刘皇叔'],
    '关羽': ['关公', '云长', '寿亭侯', '关云长'],
    '周瑜': ['都督', '周郎', '公瑾'],
    '吕布': ['飞将', '吕温侯', '奉先', '吕奉先'],
    '赵云': ['常胜将军', '子龙', '赵子龙'],
}

# Flattened alias -> canonical-name lookup built once from ALIASES.
ALIAS_TO_NAME = {
    alias: name for name, aliases in ALIASES.items() for alias in aliases
}


def count_characters(words):
    """Count word occurrences, merging known aliases into one name.

    Single-character words and words in ``EXCLUDED_WORDS`` are skipped.
    A word listed in ``ALIASES`` is counted under its canonical name; any
    other word is counted under itself.

    Args:
        words: iterable of word strings (e.g. the output of ``jieba.lcut``).

    Returns:
        dict mapping name -> number of occurrences, in first-seen order.
    """
    nums = {}
    for word in words:
        if len(word) == 1 or word in EXCLUDED_WORDS:
            continue
        name = ALIAS_TO_NAME.get(word, word)
        nums[name] = nums.get(name, 0) + 1
    return nums


def main_jieba():
    """Segment the novel with jieba and print the 20 most frequent names."""
    with open('三国演义.txt', 'r', encoding='utf-8') as f:
        article = f.read()
    words = jieba.lcut(article)
    nums = count_characters(words)

    # Sort by count, highest first.
    numslist = sorted(nums.items(), key=lambda x: x[1], reverse=True)
    # Slicing instead of range(20) avoids IndexError on short inputs.
    for word, count in numslist[:20]:
        print("{} {}".format(word, count))


if __name__ == "__main__":
    main_jieba()

原文链接:https://blog.csdn.net/WYYYYkkk/article/details/102470500?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522168449950016800211526978%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fblog.%2522%257D&request_id=168449950016800211526978&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~blog~first_rank_ecpm_v1~rank_v31_ecpm-7-102470500-null-null.blog_rank_default&utm_term=python%20jieba%E5%BA%93%E7%BB%9F%E8%AE%A1%E4%B8%89%E5%9B%BD%E6%BC%94%E4%B9%89%E5%89%8D20%E4%B8%AA%E5%87%BA%E5%9C%BA%E7%9A%84%E4%BA%BA%E7%89%A9&spm=1018.2226.3001.4450