I want to modify the program below so that it only produces the forward and the reverse (backward) segmentation results — how should I do this? (A sketch of one idea I have is included after the program.)

# -*- coding: utf-8 -*-

import re
import sys

# Punctuation and whitespace characters treated as segmentation break points.
StopWordtmp = [' ', u'\u3000', u'\u3001', u'\u300a', u'\u300b', u'\uff1b', u'\uff02', u'\u30fb', u'\u25ce',
               u'\u3002', u'\uff0c', u'\uff01', u'\uff1f', u'\uff1a', u'\u201c', u'\u201d', u'\u2018', u'\u2019',
               u'\uff08', u'\uff09', u'\u3010', u'\u3011', u'\uff5b', u'\uff5d', u'-', u'\uff0d', u'\uff5e', u'\uff3b',
               u'\uff3d', u'\u3014', u'\u3015', u'\uff0e', u'\uff20', u'\uffe5', u'\u2022', u'.']

WordDic = {}        # dictionary words, used as a set: WordDic[word] = 1
StopWord = []       # break characters, filled from StopWordtmp by InitStopword()
StatisticDic = {}   # bigram transition counts: StatisticDic[w1][w2] = count
span = 16           # maximum word length (in characters) tried by the matching window


# Load the stop-word (break) characters into the StopWord list.
def InitStopword():
    for key in StopWordtmp:
        StopWord.append(key)


def InitDic(Dicfile):
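    """Load the word list from Dicfile (one word per line) into WordDic."""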
    f = open(Dicfile, encoding='utf8')
    for line in f:
        line = line.strip()
        WordDic[line] = 1
    f.close()


def InitStatisticDic(StatisticDicfile):
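    """Build bigram transition counts from a pre-segmented training corpus.

    Each line of StatisticDicfile contains words separated by two spaces; the
    virtual tokens <BEG> and <END> mark the start and end of a line, so a line
    such as "今天  天气  很  好" contributes the transitions <BEG>->今天,
    今天->天气, 天气->很, 很->好 and 好-><END>.
    """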
    StatisticDic['<BEG>'] = {}
    f = open(StatisticDicfile, encoding='utf8')
    for line in f:
        chunk = line.strip().split('  ')  # words are separated by two spaces
        if chunk[0] != '':
            if chunk[0] not in StatisticDic['<BEG>']:
                StatisticDic['<BEG>'][chunk[0]] = 1
            else:
                StatisticDic['<BEG>'][chunk[0]] += 1

        for i in range(len(chunk) - 1):
            if chunk[i] not in StatisticDic and chunk[i] != '':
                StatisticDic[chunk[i]] = {}
            if chunk[i] != '':
                if chunk[i + 1] not in StatisticDic[chunk[i]]:
                    StatisticDic[chunk[i]][chunk[i + 1]] = 1
                else:
                    StatisticDic[chunk[i]][chunk[i + 1]] += 1
        if chunk[-1] not in StatisticDic and chunk[-1] != '':
            StatisticDic[chunk[-1]] = {}
        if chunk[-1] != '':
            if '<END>' not in StatisticDic[chunk[-1]]:
                StatisticDic[chunk[-1]]['<END>'] = 1
            else:
                StatisticDic[chunk[-1]]['<END>'] += 1
    f.close()


def WordSeg(Inputfile, Outputfile):
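    """Segment each line of Inputfile and write the result to Outputfile.

    Runs of characters between break (stop-word) characters are segmented both
    by forward and by backward maximum matching, and the candidate with the
    higher bigram probability (see P below) is written out, tokens separated
    by two spaces.
    """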
    f = open(Inputfile, encoding='utf8')
    w = open(Outputfile, 'w', encoding='utf8')
    dic_size = 0
    for key in StatisticDic:
        for keys in StatisticDic[key]:
            dic_size += StatisticDic[key][keys]
    for line in f:
        line = line.strip()
        senList = []
        newsenList = []
        tmpword = ''
        for i in range(len(line)):
            if line[i] in StopWord:
                senList.append(tmpword)
                senList.append(line[i])
                tmpword = ''
            else:
                tmpword += line[i]
                if i == len(line) - 1:
                    senList.append(tmpword)
        # For each chunk, keep whichever of the forward / backward segmentation
        # has the higher bigram probability.
        for key in senList:
            if key in StopWord:
                newsenList.append(key)
            else:
                Pretmplist = PreSenSeg(key, span)
                Posttmplist = PostSenSeg(key, span)
                tmp_pre = P(Pretmplist, dic_size)
                tmp_post = P(Posttmplist, dic_size)
                tmplist = []
                if tmp_pre > tmp_post:
                    tmplist = Pretmplist
                else:
                    tmplist = Posttmplist
                for keyseg in tmplist:
                    newsenList.append(keyseg)
        writeline = ''
        for key in newsenList:
            writeline = writeline + key + '  '
        writeline = writeline.strip('  ')
        w.write(writeline + '\n')

    f.close()
    w.close()


# Probability of a word sequence under the bigram model (chain rule of probability).
def P(tmplist, dic_size):
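    """Return the bigram probability of the token list tmplist:

        P(w1 ... wn) ~ P(w1|<BEG>) * P(w2|w1) * ... * P(wn|w(n-1)) * P(<END>|wn)
    """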
    rev = 1
    if len(tmplist) < 1:
        return 0
    rev *= Pword(tmplist[0], '<BEG>', dic_size)
    rev *= Pword('<END>', tmplist[-1], dic_size)
    for i in range(len(tmplist) - 1):
        a = Pword(tmplist[i + 1], tmplist[i], dic_size)
        rev *= a
    return rev


# Estimate the conditional bigram probability P(word1 | word2) from corpus frequencies.
def Pword(word1, word2, dic_size):
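    """Add-one (Laplace) smoothed estimate of P(word1 | word2):

        (count(word2 -> word1) + 1) / (total count out of word2 + dic_size)
    """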
    div_up = 0
    div_down = 0
    if word2 in StatisticDic:
        for key in StatisticDic[word2]:
            div_down += StatisticDic[word2][key]  # total number of transitions out of word2
            if key == word1:
                div_up = StatisticDic[word2][key]  # count of the word2 -> word1 transition
    return (div_up + 1) / (div_down + dic_size)  # add-one (Laplace) smoothing


def PreSenSeg(sen, span):
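    """Forward maximum matching.

    Scan sen from left to right; at each position take the longest substring
    (at most `span` characters) found in WordDic, after first consuming whole
    runs of digits and Latin letters via the regexes below.
    """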
    post = span
    if len(sen) < span:
        post = len(sen)
    cur = 0
    revlist = []
    while 1:
        if cur >= len(sen):
            break
        # Consume a leading run of digits (ASCII, full-width, or Chinese numerals).
        s = re.search(
            u"^[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6\u5341\u767e\u5343\u4e07\u4ebf\u5146\uff2f]+",
            sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
            cur = cur + len(s.group())
            post = cur + span
            if post > len(sen):
                post = len(sen)
        # Consume a leading run of Latin letters (ASCII or full-width).
        s = re.search(
            u"^[a-zA-Z\uff41-\uff5a\uff21-\uff3a]+",
            sen[cur:])
        if s:
            if s.group() != '':
                revlist.append(s.group())
            cur = cur + len(s.group())
            post = cur + span
            if post > len(sen):
                post = len(sen)
        if (sen[cur:post] in WordDic) or (cur + 1 == post):
            if sen[cur:post] != '':
                revlist.append(sen[cur:post])
            cur = post
            post = post + span
            if post > len(sen):
                post = len(sen)
        else:
            post -= 1

    return revlist


def PostSenSeg(sen, span):
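    """Backward (reverse) maximum matching.

    Scan sen from right to left; at each position take the longest substring
    (at most `span` characters) found in WordDic. Tokens are collected in
    reverse order and the list is flipped before being returned.
    """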
    cur = len(sen)
    pre = cur - span
    if pre < 0:
        pre = 0
    revlist = []
    while 1:
        if cur <= 0:
            break
        # Consume a trailing run of digits (ASCII, full-width, or Chinese numerals).
        s = re.search(
            u"[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6\u5341\u767e\u5343\u4e07\u4ebf\u5146\uff2f]+$",
            sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
            cur = cur - len(s.group())
            pre = cur - span
            if pre < 0:
                pre = 0
        # Consume a trailing run of Latin letters (ASCII or full-width), anchored
        # at the end of the slice to match the right-to-left scan.
        s = re.search(
            u"[a-zA-Z\uff41-\uff5a\uff21-\uff3a]+$",
            sen[pre:cur])
        if s:
            if s.group() != '':
                revlist.append(s.group())
            cur = cur - len(s.group())
            pre = cur - span
            if pre < 0:
                pre = 0

        if (sen[pre:cur] in WordDic) or (cur - 1 == pre):
            if sen[pre:cur] != '':
                revlist.append(sen[pre:cur])
            cur = pre
            pre = pre - span
            if pre < 0:
                pre = 0
        else:
            pre += 1
    return revlist[::-1]


if __name__ == "__main__":
    # if len(sys.argv) != 5:
    #     print("Usage: python wordseg.py Dicfile StatisticDicfile Inputfile Outputfile")
    Dicfile = r'D:\pythonProject\Ngram\dic.txt'  # sys.argv[1]
    StatisticDicfile = r'D:\pythonProject\Ngram\traindata.txt'  # sys.argv[2]
    Inputfile = r'D:\pythonProject\Ngram\test.txt'  # sys.argv[3]
    Outputfile = r'D:\pythonProject\Ngram\out1.txt'  # sys.argv[4]
    InitDic(Dicfile)
    InitStatisticDic(StatisticDicfile)

    # print "Dic:", StatisticDic
    InitStopword()
    WordSeg(Inputfile, Outputfile)
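
One idea I have is to skip the P()-based comparison in WordSeg and write both candidate segmentations to separate files. Below is a minimal sketch of such a variant; it reuses PreSenSeg, PostSenSeg, StopWord and span from the program above, while the function name WordSegBoth and the two output paths are just placeholders I made up, not part of the original code.

def WordSegBoth(Inputfile, PreOutfile, PostOutfile):
    """Write the forward result to PreOutfile and the reverse result to PostOutfile."""
    f = open(Inputfile, encoding='utf8')
    wpre = open(PreOutfile, 'w', encoding='utf8')
    wpost = open(PostOutfile, 'w', encoding='utf8')
    for line in f:
        line = line.strip()
        # Split the line into chunks at break characters, exactly as WordSeg does.
        senList = []
        tmpword = ''
        for i in range(len(line)):
            if line[i] in StopWord:
                senList.append(tmpword)
                senList.append(line[i])
                tmpword = ''
            else:
                tmpword += line[i]
                if i == len(line) - 1:
                    senList.append(tmpword)
        preList = []
        postList = []
        for key in senList:
            if key in StopWord:
                preList.append(key)
                postList.append(key)
            else:
                preList.extend(PreSenSeg(key, span))    # forward maximum matching
                postList.extend(PostSenSeg(key, span))  # reverse maximum matching
        wpre.write('  '.join(k for k in preList if k != '') + '\n')
        wpost.write('  '.join(k for k in postList if k != '') + '\n')
    f.close()
    wpre.close()
    wpost.close()

Since the bigram scoring is no longer used, only the dictionary and the break characters would need to be initialised, e.g. in __main__:

    InitDic(Dicfile)
    InitStopword()
    # output paths below are examples
    WordSegBoth(Inputfile, r'D:\pythonProject\Ngram\out_forward.txt', r'D:\pythonProject\Ngram\out_backward.txt')

Is this the right way to do it, or is there a simpler change to the existing WordSeg?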
 
