import jieba.posseg as pseg
import operator
import warnings
import os
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Install a process-wide filter that ignores every warning, including those
# raised by the third-party libraries imported above.
warnings.filterwarnings('ignore')
class Word:
    """Occurrence counters attached to one string key.

    Attributes:
        char: the string this record was created for.
        freq: how many occurrences have been recorded.
        deg: sum of the ``phraseLength`` values passed to ``updateOccur``.
    """

    def __init__(self, char, freq=0, deg=0):
        self.freq = freq
        self.deg = deg
        self.char = char

    def __repr__(self):
        return (
            f"{type(self).__name__}"
            f"({self.char!r}, freq={self.freq!r}, deg={self.deg!r})"
        )

    def returnScore(self):
        """Return ``deg / freq``; 0.0 while no occurrence has been recorded."""
        # A fresh record has freq == 0; dividing would raise ZeroDivisionError.
        if not self.freq:
            return 0.0
        return self.deg / self.freq

    def updateOccur(self, phraseLength):
        """Record one occurrence inside a phrase of ``phraseLength`` words."""
        self.freq += 1
        self.deg += phraseLength

    def getChar(self):
        """Return the string this record was created for."""
        return self.char

    def updateFreq(self):
        """Record one occurrence without changing ``deg``."""
        self.freq += 1

    def getFreq(self):
        """Return the number of recorded occurrences."""
        return self.freq
def notNumStr(instr):
    """Return True when *instr* holds no ASCII letter and no digit character.

    A character disqualifies the string when it lies in ``A``-``Z`` or
    ``a``-``z``, or when ``str.isdigit()`` is true for it. An empty string
    has no disqualifying character, so it yields True.
    """
    return not any(
        'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch.isdigit()
        for ch in instr
    )
def _load_word_set(path, encoding):
    """Return the lines of the text file at *path* (newline stripped) as a set."""
    with open(path, 'r', encoding=encoding) as fin:
        return {line.rstrip('\n') for line in fin}


def run(rawText):
    """Extract the highest-scoring candidate key phrases from *rawText*.

    The text is segmented with ``pseg.cut``; every token that acts as a
    boundary splits the token stream, and each run of remaining tokens
    between two boundaries is one candidate phrase. A token is a boundary
    when it is listed in either word-list file, contains an ASCII letter or
    a digit, carries one of the part-of-speech flags below, or is a newline.

    A phrase's score is the sum of ``deg / freq`` over its words, where
    ``deg`` accumulates the length of every phrase a word occurred in and
    ``freq`` counts its occurrences.

    Args:
        rawText: the text to analyse.

    Returns:
        A list of at most 20 ``(phrase, score)`` tuples, highest score
        first. ``phrase`` is the phrase's words concatenated with no
        separator.

    Raises:
        OSError: if a word-list file under ``./dataset/`` cannot be opened.
    """
    # Sets give O(1) membership tests for the per-token checks below.
    stop_words = _load_word_set('./dataset/1893(utf8).txt', 'utf-8')
    conj_words = _load_word_set('./dataset/spw.txt', 'GB2312')
    # Part-of-speech flags treated as boundaries (presumably numerals,
    # punctuation, particles, verbs, ... -- confirm against jieba's tag set).
    boundary_flags = {'m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f'}

    registry = {}  # word, or '|'-joined phrase key -> Word record
    phrases = []
    current = []
    meaningful_count = 0
    for token, flag in pseg.cut(rawText):
        is_boundary = (
            token in conj_words
            or token in stop_words
            or flag in boundary_flags
            or token == '\n'
            or not notNumStr(token)
        )
        if is_boundary:
            # Close the running phrase; leading or consecutive boundaries
            # leave `current` empty and must not produce an empty phrase.
            if current:
                phrases.append(current)
                current = []
            continue
        current.append(token)
        meaningful_count += 1
        if token not in registry:
            registry[token] = Word(token)
    # Keep the final phrase even when the text does not end on a boundary.
    if current:
        phrases.append(current)

    for phrase in phrases:
        length = len(phrase)
        for word in phrase:
            registry[word].updateOccur(length)
        key = '|'.join(phrase)
        if key not in registry:
            registry[key] = Word(key)
        # For a one-word phrase the key equals the word itself, so this
        # increments the very record updated just above.
        registry[key].updateFreq()

    scores = {}
    for phrase in phrases:
        if len(phrase) > 5:
            continue
        freq = registry['|'.join(phrase)].getFreq()
        # Discard phrases that are rare both relative to the amount of
        # meaningful text and in absolute terms.
        if meaningful_count != 0 and freq / meaningful_count < 0.01 and freq < 3:
            continue
        scores[''.join(phrase)] = sum(
            registry[word].returnScore() for word in phrase
        )

    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:20]
def plot(x, y, title):
    """Draw *y* against *x* as a line chart titled *title* and display it.

    Args:
        x: tick labels for the horizontal axis (labelled 'word').
        y: values for the vertical axis (labelled 'score').
        title: text shown as the chart title.
    """
    # NOTE(review): 'STZhongsong' is presumably chosen so that CJK labels
    # render; confirm the font is installed on the target machine.
    mpl.rcParams['font.sans-serif'] = ['STZhongsong']
    mpl.rcParams['axes.unicode_minus'] = False
    # The label gives plt.legend() an entry to show; without any labelled
    # artist the legend is empty.
    plt.plot(x, y, label='score')
    plt.xticks(rotation=-25)
    plt.xlabel('word')
    plt.ylabel('score')
    plt.title(title)
    plt.legend()
    plt.show()
def get_tags():
    """Extract and plot key phrases for every file in ``./processed_data/``.

    Each regular file is read as UTF-8, its stripped lines are joined with
    single spaces, the text is passed to ``run``, and the resulting
    ``(phrase, score)`` pairs are handed to ``plot`` with the file name as
    the chart title.
    """
    path = './processed_data/'
    # Skip sub-directories (open() would fail on them) and sort so files
    # are processed in a reproducible order.
    file_list = sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    for file in tqdm(file_list):
        with open(os.path.join(path, file), 'r', encoding='utf-8') as fin:
            # Every line, including the last, is followed by one space.
            data2p = ''.join(line.strip() + ' ' for line in fin)
        res = run(data2p)
        word = [item[0] for item in res]
        score = [item[1] for item in res]
        plot(word, score, file)
# Run the extraction-and-plot pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    get_tags()
# TODO: what about random sampling, with the sample size equal to the original size?
# random.shuffle?