Python 如何每次读取两个汉字并统计出现次数?目前只能逐个读取单个字符(自然语言处理实验)



def getText():
    """Read ``Ci.txt`` and return its text with separator characters removed.

    The file is decoded as UTF-8. Newlines, spaces, backslashes, slashes and
    the punctuation marks listed in ``strip_chars`` are deleted from the text.

    Returns:
        str: The file contents without any character from ``strip_chars``.
    """
    # The backslash is escaped explicitly: the bare "\ " form is an invalid
    # escape sequence that newer Python versions warn about.
    strip_chars = '\n \\ / ,。、:!?”“#¥%'
    # The context manager guarantees the file handle is closed after reading.
    with open('Ci.txt', 'r', encoding="utf-8") as f:
        txt = f.read()
    # Delete every unwanted character in a single pass over the text.
    return txt.translate(str.maketrans('', '', strip_chars))

from collections import Counter

# Cleaned text returned by getText().
test = getText()

# Frequency of every single character. Iterating a str always yields
# one-character strings, so no length check is needed.
counts = Counter(test)

# (character, count) pairs, most frequent first. The sort is stable, so
# characters with equal counts stay in first-seen order.
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Write with an explicit encoding so the output does not depend on the
# platform's default locale encoding.
with open('2.txt', 'w', encoding="utf-8") as a:
    a.write(str(items))

数据文件就不上传了。上面的代码只能逐个统计单个汉字,无法每次读取两个字符(汉字)来统计;我又不想使用 jieba 包,请问应该如何处理?求解决方案。

可以像下面这样,每次取相邻的两个字符(滑动窗口)来统计:

def getText():
    """Read ``1.txt`` and return its text with separator characters removed.

    The file is decoded as UTF-8. Newlines, spaces, backslashes, slashes and
    the punctuation marks listed in ``strip_chars`` are deleted from the text.

    Returns:
        str: The file contents without any character from ``strip_chars``.
    """
    # The backslash is escaped explicitly: the bare "\ " form is an invalid
    # escape sequence that newer Python versions warn about.
    strip_chars = '\n \\ / ,。、:!?”“#¥%'
    # The context manager guarantees the file handle is closed after reading.
    with open('1.txt', 'r', encoding="utf-8") as f:
        txt = f.read()
    # Delete every unwanted character in a single pass over the text.
    return txt.translate(str.maketrans('', '', strip_chars))

from collections import Counter

# Cleaned text returned by getText(), echoed for inspection.
test = getText()
print(test)

# Count every pair of adjacent characters (overlapping window of width 2).
# zip() pairs each character with its successor and stops at the shorter
# sequence, so empty or one-character text simply yields no pairs.
counts = Counter(first + second for first, second in zip(test, test[1:]))

items = list(counts.items())
print(items)  # printed before sorting, i.e. in first-seen order

# Most frequent pairs first. The sort is stable, so pairs with equal counts
# stay in first-seen order.
items.sort(key=lambda x: x[1], reverse=True)

# Write with an explicit encoding so the output does not depend on the
# platform's default locale encoding.
with open('2.txt', 'w', encoding="utf-8") as a:
    a.write(str(items))

img