前提描述:
最近在做中文分词方法设计,其中有一个是用hmm的方法。不过不管用什么方法,都需要把x_train(数据集,每一个中文字符)和y_train(标注集,中文字符所对应的标注,分为b、m、e、s)一一对应起来训练。训练出的结果是给定中文字符,能预测出它的标签,b是词首,m是中间,e是词尾,s是单独成词。然后,就可以把b和e之间的字符连接成词,s也单独成词。这就是中文分词要做的内容。
问题描述:
在跑代码时,遇到以下问题:
File "e:/毕业设计/代码/msr/HMM/hmm_model.py", line 156, in <module>
train()
File "e:/毕业设计/代码/msr/HMM/hmm_model.py", line 77, in train
if sentence[i] not in Emit[tags[i]] :
IndexError: list index out of range
于是对sentence长度和tags长度进行输出:
(sentence是x_train的随机一句话,tags是这句话每个字对应的标签,理论上应该相等)
for sentence, tags in zip(x_train, y_train):
print(len(sentence),len(tags))
#输出结果为 24 60,也就是说 sentence 的长度小于 tags 的长度
但是对x_train和y_train长度进行输出,结果是
69534 69534,是相等的。
问题:为什么sentence长度会比tags小?二者是从x_train和y_train一一配对来的,应该不存在不相等的情况啊?
并且,此hmm代码在人民日报数据集上跑的时候,并没有任何问题,换上msr数据集和pku数据集,就会出现list index out of range错误。但是用crf+bilstm等其它方式跑msr数据集和pku数据集时,也有for sentence, tags in zip(x_train, y_train)代码段,也没有任何报错。因此可认为不是数据集的问题。
最后附上全部代码:
import pickle

# Global HMM parameter tables, filled in by init() and train():
Trans = {}      # transition table: Trans[tag_id][next_tag_id] -> count, then probability
Emit = {}       # emission table: Emit[tag_id][word_id] -> count, then probability
Count_dic = {}  # total number of occurrences per tag_id
Start = {}      # initial-state table: Start[tag_id] -> count, then probability

with open('E:/毕业设计/代码/msr/data/datasave.pkl', 'rb') as inp:
    '''
    读取数据处理结果
    '''
    # Load the preprocessed dataset. The load order must match the order in
    # which the data-preparation script pickled the objects into this file.
    word2id = pickle.load(inp)
    id2word = pickle.load(inp)
    tag2id = pickle.load(inp)
    id2tag = pickle.load(inp)
    x_train = pickle.load(inp)   # training sentences as word-ID sequences
    y_train = pickle.load(inp)   # training tag-ID sequences (B/M/E/S)
    x_test = pickle.load(inp)
    y_test = pickle.load(inp)
print(len(x_train), len(y_train))
def calculate(x, y, id2word, id2tag, res=None):
    '''
    Decode a BMES tag sequence into word segments.

    :param x: sentence as a sequence of word IDs
    :param y: tag sequence as word-aligned tag IDs
    :param id2word: mapping from word ID to character
    :param id2tag: mapping from tag ID to tag string ('B'/'M'/'E'/'S')
    :param res: list the segments are appended to; a fresh list is created
                when omitted
    :return: res, extended with one list of characters per recognised word
    '''
    # FIX: the original signature used a mutable default argument (res=[]),
    # which is shared across calls and silently accumulates results; res=None
    # is backward compatible for every caller that passes res explicitly.
    if res is None:
        res = []
    entity = []
    for j in range(len(x)):
        tag = id2tag[y[j]]
        if tag == 'B':
            entity = [id2word[x[j]]]       # start a new multi-character word
        elif tag == 'M' and len(entity) != 0:
            entity.append(id2word[x[j]])   # extend only if a word is open
        elif tag == 'E' and len(entity) != 0:
            entity.append(id2word[x[j]])   # close the open word and emit it
            res.append(entity)
            entity = []
        elif tag == 'S':
            entity = [id2word[x[j]]]       # single-character word
            res.append(entity)
            entity = []
        else:
            entity = []                    # malformed sequence: drop the partial word
    return res
def init():
    '''
    Zero-initialise the global HMM tables (Trans, Start, Emit, Count_dic)
    with one entry per tag id found in tag2id.
    '''
    tag_ids = [tag2id[t] for t in tag2id]
    for tid in tag_ids:
        # one transition row per tag, every target tag starting at zero
        Trans[tid] = {other: 0.0 for other in tag_ids}
    for tid in tag_ids:
        Start[tid] = 0.0     # initial-state count
        Emit[tid] = {}       # per-tag emission counts, filled lazily by train()
        Count_dic[tid] = 0   # total occurrences of this tag
def train():
    '''
    Accumulate Start/Trans/Emit counts from (x_train, y_train), then
    normalise each table into probabilities in place.

    Fixes over the original:
      * the first occurrence of a (tag, word) emission pair was recorded as
        0.0 instead of 1.0, so every emission count was off by one;
      * the per-sentence debug print is removed.
    :return: None (the module-level tables are mutated in place)
    '''
    for sentence, tags in zip(x_train, y_train):
        # NOTE(review): indexing sentence[i] over range(len(tags)) assumes the
        # two sequences are word-aligned. The reported IndexError means some
        # pickled x_train/y_train rows are NOT the same length — that mismatch
        # must be fixed in the data-preparation step, not here.
        for i in range(len(tags)):
            if i == 0:
                Start[tags[0]] += 1
                Count_dic[tags[0]] += 1
            else:
                Trans[tags[i - 1]][tags[i]] += 1
                Count_dic[tags[i]] += 1
            # Count the emission at every position; a first sighting starts at
            # 1, not 0 as in the original. (The pasted source lost its
            # indentation — presumably this update sat at loop level so the
            # first character is counted too; confirm against the original.)
            Emit[tags[i]][sentence[i]] = Emit[tags[i]].get(sentence[i], 0.0) + 1
    # Normalise counts into probabilities.
    for tag in Start:
        Start[tag] = Start[tag] * 1.0 / len(x_train)
    for tag in Trans:
        for tag1 in Trans[tag]:
            Trans[tag][tag1] = Trans[tag][tag1] / Count_dic[tag]
    for tag in Emit:
        for word in Emit[tag]:
            Emit[tag][word] = Emit[tag][word] / Count_dic[tag]
    print(Start)
    print(Trans)
def viterbi(sentence, tag_list):
    '''
    Viterbi decoding over the globally trained Start/Trans/Emit tables.

    :param sentence: observation sequence (word IDs)
    :param tag_list: every candidate tag ID
    :return: (prob, best_path) — probability of the best tag path and the
             path itself as a list of tag IDs
    '''
    EPS = 0.00000001  # smoothing for unseen emissions / transitions
    n = len(sentence)
    # scores[t][tag] = best probability of any path ending in `tag` at step t
    scores = [{tag: Start[tag] * Emit[tag].get(sentence[0], EPS) for tag in tag_list}]
    # The t=0 frame maps every tag to itself; it acts as a sentinel that is
    # popped off the reconstructed path below.
    backpointers = [{tag: tag for tag in tag_list}]
    for t in range(1, n):
        prev = scores[-1]
        step_scores = {}
        step_back = {}
        for tag in tag_list:
            emit_p = Emit[tag].get(sentence[t], EPS)
            best_p, best_prev = max(
                (prev[p] * Trans[p].get(tag, EPS) * emit_p, p) for p in tag_list
            )
            step_scores[tag] = best_p
            step_back[tag] = best_prev
        scores.append(step_scores)
        backpointers.append(step_back)
    # Best final state, then walk the backpointers from last frame to first.
    prob, state = max((scores[n - 1][tag], tag) for tag in tag_list)
    best_path = [state]
    for frame in reversed(backpointers):
        state = frame[state]
        best_path.append(state)
    best_path.pop()  # drop the sentinel entry contributed by the t=0 frame
    best_path.reverse()
    return (prob, best_path)
def test():
    '''
    Decode x_test with viterbi and print precision / recall / F-score
    against y_test, where a predicted segment counts as correct when its
    character sequence also occurs among the gold segments.

    Improvement: gold membership is checked against a set of tuples instead
    of `segment in list` (lists are unhashable, forcing a linear scan per
    segment), turning an accidental O(n^2) pass into O(n) with identical
    counts.
    '''
    taglist = [tag2id[tag] for tag in tag2id]
    entityres = []  # predicted segments, accumulated over all test sentences
    entityall = []  # gold segments
    for sentence, tags in zip(x_test, y_test):
        score, predict = viterbi(sentence, taglist)
        entityres = calculate(sentence, predict, id2word, id2tag, entityres)
        entityall = calculate(sentence, tags, id2word, id2tag, entityall)
    # NOTE(review): matching by content only (no sentence/position identity)
    # over-counts repeated words; kept as-is for parity with the original metric.
    gold = set(map(tuple, entityall))
    rightpre = [seg for seg in entityres if tuple(seg) in gold]
    if len(rightpre) != 0:
        precision = float(len(rightpre)) / len(entityres)
        recall = float(len(rightpre)) / len(entityall)
        print("precision: ", precision)
        print("recall: ", recall)
        print("fscore: ", (2 * precision * recall) / (precision + recall))
    else:
        print("precision: ", 0)
        print("recall: ", 0)
        print("fscore: ", 0)
if __name__ == "__main__":
    # Build the count tables, train the HMM on x_train/y_train, then
    # evaluate segmentation quality on x_test/y_test.
    init()
    train()
    test()
脱离数据集讨论题主的问题,可能不会有答案。不过,题主认为x_train和y_train长度相等,因此zip(x_train, y_train)中的每个二元组的元素长度就应该相等,这是错误的。比如:
>>> x_train = ['abc','d'] # 长度2
>>> y_train = 'xy' # 长度2
>>> for a, b in zip(x_train, y_train):
print(a, len(a), b, len(b))
abc 3 x 1
d 1 y 1