前提描述:
最近在做中文分词方法设计,其中有一个是用hmm的方法。不过不管用什么方法,都需要把x_train(数据集,每一个中文字符)和y_train(标注集,中文字符所对应的标注,分为b、m、e、s)一一对应起来训练。训练出的结果是给定中文字符,能预测出它的标签,b是词首,m是中间,e是词尾,s是单独成词。然后,就可以把b和e之间的字符连接成词,s也单独成词。这就是中文分词要做的内容。
问题描述:
在跑代码时,遇到以下问题:
File "e:/毕业设计/代码/msr/HMM/hmm_model.py", line 156, in <module>
train()
File "e:/毕业设计/代码/msr/HMM/hmm_model.py", line 77, in train
if sentence[i] not in Emit[tags[i]] :
IndexError: list index out of range
于是对sentence长度和tags长度进行输出:
(sentence是x_train的随机一句话,tags是这句话每个字对应的标签,理论上应该相等)
for sentence, tags in zip(x_train, y_train):
print(len(sentence),len(tags))
#输出结果为 24 60,也就是说 sentence 的长度小于 tags 的长度
但是对x_train和y_train长度进行输出,结果是
69534 69534,是相等的。
问题:为什么sentence长度会比tags小?二者是从x_train和y_train一一配对来的,应该不存在不相等的情况啊?
并且,此hmm代码在人民日报数据集上跑的时候,并没有任何问题,换上msr数据集和pku数据集,就会出现list index out of range错误。但是用crf+bilstm等其它方式跑msr数据集和pku数据集时,也有for sentence, tags in zip(x_train, y_train)代码段,也没有任何报错。因此可认为不是数据集的问题。
最后附上全部代码:
import pickle

# Global HMM parameter tables, filled in by init() and train():
Trans = {}      # transition table: Trans[tag_id][next_tag_id] -> count, then probability
Emit = {}       # emission table: Emit[tag_id][word_id] -> count, then probability
Count_dic = {}  # total number of occurrences per tag_id
Start = {}      # initial-state table: Start[tag_id] -> count, then probability

with open('E:/毕业设计/代码/msr/data/datasave.pkl', 'rb') as inp:
    '''
    读取数据处理结果
    '''
    # Load the preprocessed dataset. The load order must match the order in
    # which the data-preparation script pickled the objects into this file.
    word2id = pickle.load(inp)
    id2word = pickle.load(inp)
    tag2id = pickle.load(inp)
    id2tag = pickle.load(inp)
    x_train = pickle.load(inp)   # training sentences as word-ID sequences
    y_train = pickle.load(inp)   # training tag-ID sequences (B/M/E/S)
    x_test = pickle.load(inp)
    y_test = pickle.load(inp)
print(len(x_train), len(y_train))
def calculate(x, y, id2word, id2tag, res=None):
    '''
    Decode a BMES tag sequence into word segments.

    :param x: sentence as a sequence of word IDs
    :param y: tag sequence as word-aligned tag IDs
    :param id2word: mapping from word ID to character
    :param id2tag: mapping from tag ID to tag string ('B'/'M'/'E'/'S')
    :param res: list the segments are appended to; a fresh list is created
                when omitted
    :return: res, extended with one list of characters per recognised word
    '''
    # FIX: the original signature used a mutable default argument (res=[]),
    # which is shared across calls and silently accumulates results; res=None
    # is backward compatible for every caller that passes res explicitly.
    if res is None:
        res = []
    entity = []
    for j in range(len(x)):
        tag = id2tag[y[j]]
        if tag == 'B':
            entity = [id2word[x[j]]]       # start a new multi-character word
        elif tag == 'M' and len(entity) != 0:
            entity.append(id2word[x[j]])   # extend only if a word is open
        elif tag == 'E' and len(entity) != 0:
            entity.append(id2word[x[j]])   # close the open word and emit it
            res.append(entity)
            entity = []
        elif tag == 'S':
            entity = [id2word[x[j]]]       # single-character word
            res.append(entity)
            entity = []
        else:
            entity = []                    # malformed sequence: drop the partial word
    return res
def init():
    '''
    Zero-initialise the global HMM tables (Trans, Start, Emit, Count_dic)
    with one entry per tag id found in tag2id.
    '''
    tag_ids = [tag2id[t] for t in tag2id]
    for tid in tag_ids:
        # one transition row per tag, every target tag starting at zero
        Trans[tid] = {other: 0.0 for other in tag_ids}
    for tid in tag_ids:
        Start[tid] = 0.0     # initial-state count
        Emit[tid] = {}       # per-tag emission counts, filled lazily by train()
        Count_dic[tid] = 0   # total occurrences of this tag
def train():
    '''
    Accumulate Start/Trans/Emit counts from (x_train, y_train), then
    normalise each table into probabilities in place.

    Fixes over the original:
      * the first occurrence of a (tag, word) emission pair was recorded as
        0.0 instead of 1.0, so every emission count was off by one;
      * the per-sentence debug print is removed.
    :return: None (the module-level tables are mutated in place)
    '''
    for sentence, tags in zip(x_train, y_train):
        # NOTE(review): indexing sentence[i] over range(len(tags)) assumes the
        # two sequences are word-aligned. The reported IndexError means some
        # pickled x_train/y_train rows are NOT the same length — that mismatch
        # must be fixed in the data-preparation step, not here.
        for i in range(len(tags)):
            if i == 0:
                Start[tags[0]] += 1
                Count_dic[tags[0]] += 1
            else:
                Trans[tags[i - 1]][tags[i]] += 1
                Count_dic[tags[i]] += 1
            # Count the emission at every position; a first sighting starts at
            # 1, not 0 as in the original. (The pasted source lost its
            # indentation — presumably this update sat at loop level so the
            # first character is counted too; confirm against the original.)
            Emit[tags[i]][sentence[i]] = Emit[tags[i]].get(sentence[i], 0.0) + 1
    # Normalise counts into probabilities.
    for tag in Start:
        Start[tag] = Start[tag] * 1.0 / len(x_train)
    for tag in Trans:
        for tag1 in Trans[tag]:
            Trans[tag][tag1] = Trans[tag][tag1] / Count_dic[tag]
    for tag in Emit:
        for word in Emit[tag]:
            Emit[tag][word] = Emit[tag][word] / Count_dic[tag]
    print(Start)
    print(Trans)
def viterbi(sentence, tag_list):
    '''
    Viterbi decoding over the globally trained Start/Trans/Emit tables.

    :param sentence: observation sequence (word IDs)
    :param tag_list: every candidate tag ID
    :return: (prob, best_path) — probability of the best tag path and the
             path itself as a list of tag IDs
    '''
    EPS = 0.00000001  # smoothing for unseen emissions / transitions
    n = len(sentence)
    # scores[t][tag] = best probability of any path ending in `tag` at step t
    scores = [{tag: Start[tag] * Emit[tag].get(sentence[0], EPS) for tag in tag_list}]
    # The t=0 frame maps every tag to itself; it acts as a sentinel that is
    # popped off the reconstructed path below.
    backpointers = [{tag: tag for tag in tag_list}]
    for t in range(1, n):
        prev = scores[-1]
        step_scores = {}
        step_back = {}
        for tag in tag_list:
            emit_p = Emit[tag].get(sentence[t], EPS)
            best_p, best_prev = max(
                (prev[p] * Trans[p].get(tag, EPS) * emit_p, p) for p in tag_list
            )
            step_scores[tag] = best_p
            step_back[tag] = best_prev
        scores.append(step_scores)
        backpointers.append(step_back)
    # Best final state, then walk the backpointers from last frame to first.
    prob, state = max((scores[n - 1][tag], tag) for tag in tag_list)
    best_path = [state]
    for frame in reversed(backpointers):
        state = frame[state]
        best_path.append(state)
    best_path.pop()  # drop the sentinel entry contributed by the t=0 frame
    best_path.reverse()
    return (prob, best_path)
def test():
    '''
    Decode x_test with viterbi and print precision / recall / F-score
    against y_test, where a predicted segment counts as correct when its
    character sequence also occurs among the gold segments.

    Improvement: gold membership is checked against a set of tuples instead
    of `segment in list` (lists are unhashable, forcing a linear scan per
    segment), turning an accidental O(n^2) pass into O(n) with identical
    counts.
    '''
    taglist = [tag2id[tag] for tag in tag2id]
    entityres = []  # predicted segments, accumulated over all test sentences
    entityall = []  # gold segments
    for sentence, tags in zip(x_test, y_test):
        score, predict = viterbi(sentence, taglist)
        entityres = calculate(sentence, predict, id2word, id2tag, entityres)
        entityall = calculate(sentence, tags, id2word, id2tag, entityall)
    # NOTE(review): matching by content only (no sentence/position identity)
    # over-counts repeated words; kept as-is for parity with the original metric.
    gold = set(map(tuple, entityall))
    rightpre = [seg for seg in entityres if tuple(seg) in gold]
    if len(rightpre) != 0:
        precision = float(len(rightpre)) / len(entityres)
        recall = float(len(rightpre)) / len(entityall)
        print("precision: ", precision)
        print("recall: ", recall)
        print("fscore: ", (2 * precision * recall) / (precision + recall))
    else:
        print("precision: ", 0)
        print("recall: ", 0)
        print("fscore: ", 0)
if __name__ == "__main__":
    # Build the count tables, train the HMM on x_train/y_train, then
    # evaluate segmentation quality on x_test/y_test.
    init()
    train()
    test()
脱离数据集讨论题主的问题,可能不会有答案。不过,题主认为x_train和y_train长度相等,因此zip(x_train, y_train)中的每个二元组的元素长度就应该相等,这是错误的。比如:
>>> x_train = ['abc','d'] # 长度2
>>> y_train = 'xy' # 长度2
>>> for a, b in zip(x_train, y_train):
print(a, len(a), b, len(b))
abc 3 x 1
d 1 y 1