本博文为介绍如果采用二元词图以及Viterbi算法分词的系列博文之一,为主体算法模块,欢迎有此方面学习需要的朋友按顺序阅读。
下面讲解算法主体实现部分:
首先给个定义:未登录词
在我的程序设计体系中未登录词分为两种:“单词未登录词”,即某个词没有在“单词”词典里出现过;“双词未登录词”即某两个词从来没有连续出现过。
之所以要考虑未登录词,是因为要对0概率进行平滑,否则计算中会有错误。
在实验过程中,我采用了两种0概率平滑方法,第一种方法:直接让未登录词的概率为1/N。N为“双词”词典或者“单词”词典中总共的词型数(Word Type:即每个词不管重复出现多少次,也只算一词),此方法的概率框架为p(x,y)由双词Trie词典得出,p(x)由 单词Trie词典得出,这种方法看起来似乎很武断。第二种方法是加1平滑法。用到的概率框架为P(x,y)由双词Trie词典得出。P(x)=sum P(x,y) for y=....。也就是说这里没有用到单词词典,某个单词的概率用联合概率算出。
第二种方法比较繁琐,且有理论依据。通过实验观察第二种方法的性能要好一点,大概F-值较第一种方法可以提高0.1%。
下面给出加1平滑法的公式(注:此截图以及后续博文中的图片来自中科院计算所刘群老师的《计算语言学》PPT,特此感谢)
利用第一种O概率平滑方法的主算法模块:
代码
# -*- coding: cp936 -*-
class Viterbi:
######################################################################################
def __init__(self):
import cPickle as mypickle
self.singleWordDict=mypickle.load(file('mySingleWordTrie.dat'))
self.doubleWordDict=mypickle.load(file('myDoubleWordTrie.dat'))
#########################################################################################
def calWordLength(self,string):
'''计算一个词含有多少个汉字
'''
import re
p=re.compile(r' ')
tmp=p.findall(string)
return len(tmp)+1
########################################################################################
def CreateWordGraph(self,sentence):#sentence为一个list,每一个元素为一个字
'''对于一个给定的句子生成词图
'''
NodesCollection={}#每一个节点信息为一个元组(start_p,max_p,hierachy) hierachy initialized 0
length=len(sentence)
for i in range(0,length):
expandlen=self.GetMaxLengthFromTrie(sentence[i])#句子中每个字,粗切分窗的最大长度
if expandlen==0:#说明以sentence[i]为首字的词在字典里不存在
NodesCollection[sentence[i]]=(1,1,None)
continue
else:
for k in range(1,expandlen+1):
sent=self.FormatSent(sentence[i:i+k])
if self.singleWordDict.has_key(sentence[i]):
tmpdict=self.singleWordDict[sentence[i]]
if tmpdict.has_key(sent):
prob=tmpdict[sent]
NodesCollection[sent]=(prob,prob,None)
#i
#print prob
#print sent
else:
if k==1:#以sentence[i]为首字的词在字典中存在,但是字典中不存在此单字
NodesCollection[sent]=(1,1,None)
return NodesCollection
#############################################################################################
def TopologicalSortWordGraph(self,sentence):
'''对原词图生成一个拓扑序,以后缀字为Hierachy'''
import re
NodesCollection=self.CreateWordGraph(sentence)
length=len(sentence)
TopologicalOrderNodes={}
for i in range(0,length):
TopologicalOrderNodes[i]={}
p=re.compile(sentence[i]+'$')
for key ,val in NodesCollection.iteritems():#从点集合中取出以此字为后缀的词,加入该阶梯
if p.search(key):
TopologicalOrderNodes[i][key]=val
return TopologicalOrderNodes
###########################################################################################
def GetMaxLengthFromTrie(self,key):
'''给出首字,查看以此字为首字的词的最大长度
'''
maxlen=0
if self.singleWordDict.has_key(key):
maxlen=max([self.calWordLength(subkey) for subkey in self.singleWordDict[key].iterkeys()])
return maxlen
############################################################################################
def FormatSent(self,sentence):
'''由字拼成短句'''
delimiter=' '
sent=delimiter.join(sentence)
return sent
##################################################################################################
def Segment(self,sentence):
'''如果句子中只有一个字不分词'''
#print 'segment'
if len(sentence)==1:
return sentence
else:
result=self.MyViterbi(sentence)
return result
#############################################################################################
def MyViterbi(self,sentence):
'''在词图上利用动态规划算法,维特比求最优解'''
import math
import re
#N=10001600
N=1000150
smooth=1.0/N#如果是稀缺字则概率值平滑为一个小值
TopologicalOrderNodes=self.TopologicalSortWordGraph(sentence)#获得一个具有拓扑序的图
optimalCandidates={}#存放每一个hierachy内最优的候选节点的键值,元素形式为(key,p,hierachy)
p=smooth#如果双字典中不存在以此字为首字的词,则概率平滑到一个小值
#首先求词网格lattice第一级 hierachy 0的最优解
if self.doubleWordDict.has_key(sentence[0]):
tmpp=self.doubleWordDict[sentence[0]].get(sentence[0]+'|'+'S')
if tmpp>0:
p=tmpp#如果双字典中存在以此单字为句首的情况,则p值取这个概率值
optimalCandidates[0]=(sentence[0],math.log(p,2),0)
for i in range(1,len(sentence)):#对于 hierachy 1 to len(sentence-1)
keysToRemove=[]#待去除的键值,以免它的存在影响最大概率的计算
for key in TopologicalOrderNodes[i].iterkeys():
keylen=self.calWordLength(key)#对于每一个hierachy遍历所有键
flag=i-keylen#flag表示回溯到标号为flag 的hierachy
p=re.compile('^\d+')
if flag<-1:
keysToRemove.append(key)
else:
if flag==-1:#表示词键值可以成为句首词
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
prob=self.calConditionPFirst(key)
prob=math.log(prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,None)
else:
keysToRemove.append(key)
else:
firstword=optimalCandidates[flag][0]#取出标号为flag的hierachy对应的词
#看看在句子中位置为flag+1的字,是不是等于key的首字,如果相等则利用标号为flag的hierachy的最优结果进行计算,否则将该key加入待删除集合
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
jointseg=firstword+'|'+key
con_prob=self.calConditionP(jointseg,firstword)
prob=optimalCandidates[flag][1]+math.log(con_prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,flag)
else:
keysToRemove.append(key)
for toremove in keysToRemove:
TopologicalOrderNodes[i].pop(toremove)
maxP=max([value[1] for value in TopologicalOrderNodes[i].itervalues()])
for cankey in TopologicalOrderNodes[i].iterkeys():
if TopologicalOrderNodes[i][cankey][1]==maxP:
candidatekey=cankey
break
hierachyP=TopologicalOrderNodes[i][candidatekey][2]
optimalCandidates[i]=(candidatekey,maxP,hierachyP)
print 'finishcalculating'
finalresult=[]
pieceOfWord=optimalCandidates[len(sentence)-1][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[len(sentence)-1][pieceOfWord][2]
while index!=None:
pieceOfWord=optimalCandidates[index][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[index][pieceOfWord][2]
finalresult.reverse()
return finalresult #print items
#############################################################################################
def calConditionPFirst(self,seg):
'''计算第能成为首词的候选节点的条件概率,输入参数seg为句子的一个片段'''
import re
p=re.compile('\d+')
# N=10002000;
N=1000023
smooth=1.0/N
tmp=p.findall(seg)
probability=smooth
if self.doubleWordDict.has_key(tmp[0]):
prob=self.doubleWordDict[tmp[0]].get(seg,0)
if prob>0:
probability=prob
#print'call calConditionPFirst probability%s' %probability
return probability
#######################################################################################
def calConditionP(self,jointseg,prevseg):
'''计算其他节点的条件概率 jointseg两个词合一起的形式,prevseg为前一个词'''
import re
p=re.compile('\d+')
#N=10001600
N=1000023
smooth=1.0/N
conditionP=smooth
tmp=p.findall(jointseg)
if self.doubleWordDict.has_key(tmp[0]) and self.singleWordDict.has_key(tmp[0]):
prob_joint=self.doubleWordDict[tmp[0]].get(jointseg,0)
prob_prev=self.singleWordDict[tmp[0]].get(prevseg)
if prob_joint>0 and prob_prev>0:
conditionP=prob_joint/prob_prev
#print 'call calConditonP probability=%s'%conditionP
return conditionP
###########################################################################################
def getSingleWordDict(self):
return self.singleWordDict
def getDoubleWordDict(self):
return self.doubleWordDict
class Viterbi:
######################################################################################
def __init__(self):
import cPickle as mypickle
self.singleWordDict=mypickle.load(file('mySingleWordTrie.dat'))
self.doubleWordDict=mypickle.load(file('myDoubleWordTrie.dat'))
#########################################################################################
def calWordLength(self,string):
'''计算一个词含有多少个汉字
'''
import re
p=re.compile(r' ')
tmp=p.findall(string)
return len(tmp)+1
########################################################################################
def CreateWordGraph(self,sentence):#sentence为一个list,每一个元素为一个字
'''对于一个给定的句子生成词图
'''
NodesCollection={}#每一个节点信息为一个元组(start_p,max_p,hierachy) hierachy initialized 0
length=len(sentence)
for i in range(0,length):
expandlen=self.GetMaxLengthFromTrie(sentence[i])#句子中每个字,粗切分窗的最大长度
if expandlen==0:#说明以sentence[i]为首字的词在字典里不存在
NodesCollection[sentence[i]]=(1,1,None)
continue
else:
for k in range(1,expandlen+1):
sent=self.FormatSent(sentence[i:i+k])
if self.singleWordDict.has_key(sentence[i]):
tmpdict=self.singleWordDict[sentence[i]]
if tmpdict.has_key(sent):
prob=tmpdict[sent]
NodesCollection[sent]=(prob,prob,None)
#i
#print prob
#print sent
else:
if k==1:#以sentence[i]为首字的词在字典中存在,但是字典中不存在此单字
NodesCollection[sent]=(1,1,None)
return NodesCollection
#############################################################################################
def TopologicalSortWordGraph(self,sentence):
'''对原词图生成一个拓扑序,以后缀字为Hierachy'''
import re
NodesCollection=self.CreateWordGraph(sentence)
length=len(sentence)
TopologicalOrderNodes={}
for i in range(0,length):
TopologicalOrderNodes[i]={}
p=re.compile(sentence[i]+'$')
for key ,val in NodesCollection.iteritems():#从点集合中取出以此字为后缀的词,加入该阶梯
if p.search(key):
TopologicalOrderNodes[i][key]=val
return TopologicalOrderNodes
###########################################################################################
def GetMaxLengthFromTrie(self,key):
'''给出首字,查看以此字为首字的词的最大长度
'''
maxlen=0
if self.singleWordDict.has_key(key):
maxlen=max([self.calWordLength(subkey) for subkey in self.singleWordDict[key].iterkeys()])
return maxlen
############################################################################################
def FormatSent(self,sentence):
'''由字拼成短句'''
delimiter=' '
sent=delimiter.join(sentence)
return sent
##################################################################################################
def Segment(self,sentence):
'''如果句子中只有一个字不分词'''
#print 'segment'
if len(sentence)==1:
return sentence
else:
result=self.MyViterbi(sentence)
return result
#############################################################################################
def MyViterbi(self,sentence):
'''在词图上利用动态规划算法,维特比求最优解'''
import math
import re
#N=10001600
N=1000150
smooth=1.0/N#如果是稀缺字则概率值平滑为一个小值
TopologicalOrderNodes=self.TopologicalSortWordGraph(sentence)#获得一个具有拓扑序的图
optimalCandidates={}#存放每一个hierachy内最优的候选节点的键值,元素形式为(key,p,hierachy)
p=smooth#如果双字典中不存在以此字为首字的词,则概率平滑到一个小值
#首先求词网格lattice第一级 hierachy 0的最优解
if self.doubleWordDict.has_key(sentence[0]):
tmpp=self.doubleWordDict[sentence[0]].get(sentence[0]+'|'+'S')
if tmpp>0:
p=tmpp#如果双字典中存在以此单字为句首的情况,则p值取这个概率值
optimalCandidates[0]=(sentence[0],math.log(p,2),0)
for i in range(1,len(sentence)):#对于 hierachy 1 to len(sentence-1)
keysToRemove=[]#待去除的键值,以免它的存在影响最大概率的计算
for key in TopologicalOrderNodes[i].iterkeys():
keylen=self.calWordLength(key)#对于每一个hierachy遍历所有键
flag=i-keylen#flag表示回溯到标号为flag 的hierachy
p=re.compile('^\d+')
if flag<-1:
keysToRemove.append(key)
else:
if flag==-1:#表示词键值可以成为句首词
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
prob=self.calConditionPFirst(key)
prob=math.log(prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,None)
else:
keysToRemove.append(key)
else:
firstword=optimalCandidates[flag][0]#取出标号为flag的hierachy对应的词
#看看在句子中位置为flag+1的字,是不是等于key的首字,如果相等则利用标号为flag的hierachy的最优结果进行计算,否则将该key加入待删除集合
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
jointseg=firstword+'|'+key
con_prob=self.calConditionP(jointseg,firstword)
prob=optimalCandidates[flag][1]+math.log(con_prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,flag)
else:
keysToRemove.append(key)
for toremove in keysToRemove:
TopologicalOrderNodes[i].pop(toremove)
maxP=max([value[1] for value in TopologicalOrderNodes[i].itervalues()])
for cankey in TopologicalOrderNodes[i].iterkeys():
if TopologicalOrderNodes[i][cankey][1]==maxP:
candidatekey=cankey
break
hierachyP=TopologicalOrderNodes[i][candidatekey][2]
optimalCandidates[i]=(candidatekey,maxP,hierachyP)
print 'finishcalculating'
finalresult=[]
pieceOfWord=optimalCandidates[len(sentence)-1][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[len(sentence)-1][pieceOfWord][2]
while index!=None:
pieceOfWord=optimalCandidates[index][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[index][pieceOfWord][2]
finalresult.reverse()
return finalresult #print items
#############################################################################################
def calConditionPFirst(self,seg):
'''计算第能成为首词的候选节点的条件概率,输入参数seg为句子的一个片段'''
import re
p=re.compile('\d+')
# N=10002000;
N=1000023
smooth=1.0/N
tmp=p.findall(seg)
probability=smooth
if self.doubleWordDict.has_key(tmp[0]):
prob=self.doubleWordDict[tmp[0]].get(seg,0)
if prob>0:
probability=prob
#print'call calConditionPFirst probability%s' %probability
return probability
#######################################################################################
def calConditionP(self,jointseg,prevseg):
'''计算其他节点的条件概率 jointseg两个词合一起的形式,prevseg为前一个词'''
import re
p=re.compile('\d+')
#N=10001600
N=1000023
smooth=1.0/N
conditionP=smooth
tmp=p.findall(jointseg)
if self.doubleWordDict.has_key(tmp[0]) and self.singleWordDict.has_key(tmp[0]):
prob_joint=self.doubleWordDict[tmp[0]].get(jointseg,0)
prob_prev=self.singleWordDict[tmp[0]].get(prevseg)
if prob_joint>0 and prob_prev>0:
conditionP=prob_joint/prob_prev
#print 'call calConditonP probability=%s'%conditionP
return conditionP
###########################################################################################
def getSingleWordDict(self):
return self.singleWordDict
def getDoubleWordDict(self):
return self.doubleWordDict
采用第二种O概率平滑方法的主体算法部分
class Viterbi:
######################################################################################
def __init__(self):
import cPickle as mypickle
self.singleWordDict=mypickle.load(file('mySingleWordTrie.dat'))
self.doubleWordDict=mypickle.load(file('myDoubleWordTrie2.dat'))
#########################################################################################
def calWordLength(self,string):
'''计算一个词含有多少个汉字
'''
import re
p=re.compile(r' ')
tmp=p.findall(string)
return len(tmp)+1
########################################################################################
def CreateWordGraph(self,sentence):#sentence为一个list,每一个元素为一个字
'''对于一个给定的句子生成词图
'''
NodesCollection={}#每一个节点信息为一个元组(start_p,max_p,hierachy) hierachy initialized 0
length=len(sentence)
for i in range(0,length):
expandlen=self.GetMaxLengthFromTrie(sentence[i])#句子中每个字,粗切分窗的最大长度
if expandlen==0:#说明以sentence[i]为首字的词在字典里不存在
NodesCollection[sentence[i]]=(1,1,None)
continue
else:
for k in range(1,expandlen+1):
sent=self.FormatSent(sentence[i:i+k])
if self.singleWordDict.has_key(sentence[i]):
tmpdict=self.singleWordDict[sentence[i]]
if tmpdict.has_key(sent):
prob=tmpdict[sent]
NodesCollection[sent]=(prob,prob,None)
#i
#print prob
#print sent
else:
if k==1:#以sentence[i]为首字的词在字典中存在,但是字典中不存在此单字
NodesCollection[sent]=(1,1,None)
return NodesCollection
#############################################################################################
def TopologicalSortWordGraph(self,sentence):
'''对原词图生成一个拓扑序,以后缀字为Hierachy'''
import re
NodesCollection=self.CreateWordGraph(sentence)
length=len(sentence)
TopologicalOrderNodes={}
for i in range(0,length):
TopologicalOrderNodes[i]={}
p=re.compile(sentence[i]+'$')
for key ,val in NodesCollection.iteritems():#从点集合中取出以此字为后缀的词,加入该阶梯
if p.search(key):
TopologicalOrderNodes[i][key]=val
return TopologicalOrderNodes
###########################################################################################
def GetMaxLengthFromTrie(self,key):
'''给出首字,查看以此字为首字的词的最大长度
'''
maxlen=0
if self.singleWordDict.has_key(key):
maxlen=max([self.calWordLength(subkey) for subkey in self.singleWordDict[key].iterkeys()])
return maxlen
############################################################################################
def FormatSent(self,sentence):
'''由字拼成短句'''
delimiter=' '
sent=delimiter.join(sentence)
return sent
##################################################################################################
def Segment(self,sentence):
'''如果句子中只有一个字不分词'''
#print 'segment'
if len(sentence)==1:
return sentence
else:
result=self.MyViterbi(sentence)
return result
#############################################################################################
def MyViterbi(self,sentence):
'''在词图上利用动态规划算法,维特比求最优解'''
import math
import re
#N=10001600
N=1000150
smooth=1.0/N#如果是稀缺字则概率值平滑为一个小值
TopologicalOrderNodes=self.TopologicalSortWordGraph(sentence)#获得一个具有拓扑序的图
optimalCandidates={}#存放每一个hierachy内最优的候选节点的键值,元素形式为(key,p,hierachy)
p=smooth#如果双字典中不存在以此字为首字的词,则概率平滑到一个小值
#首先求词网格lattice第一级 hierachy 0的最优解
if self.doubleWordDict.has_key(sentence[0]):
tmpp=self.doubleWordDict[sentence[0]].get(sentence[0]+'|'+'S')
if tmpp>0:
p=tmpp#如果双字典中存在以此单字为句首的情况,则p值取这个概率值
optimalCandidates[0]=(sentence[0],math.log(p,2),0)
for i in range(1,len(sentence)):#对于 hierachy 1 to len(sentence-1)
keysToRemove=[]#待去除的键值,以免它的存在影响最大概率的计算
for key in TopologicalOrderNodes[i].iterkeys():
keylen=self.calWordLength(key)#对于每一个hierachy遍历所有键
flag=i-keylen#flag表示回溯到标号为flag 的hierachy
p=re.compile('^\d+')
if flag<-1:
keysToRemove.append(key)
else:
if flag==-1:#表示词键值可以成为句首词
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
prob=self.calConditionPFirst(key)
prob=math.log(prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,None)
else:
keysToRemove.append(key)
else:
firstword=optimalCandidates[flag][0]#取出标号为flag的hierachy对应的词
#看看在句子中位置为flag+1的字,是不是等于key的首字,如果相等则利用标号为flag的hierachy的最优结果进行计算,否则将该key加入待删除集合
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
jointseg=firstword+'|'+key
con_prob=self.calConditionP(jointseg,firstword)
prob=optimalCandidates[flag][1]+math.log(con_prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,flag)
else:
keysToRemove.append(key)
for toremove in keysToRemove:
TopologicalOrderNodes[i].pop(toremove)
maxP=max([value[1] for value in TopologicalOrderNodes[i].itervalues()])
for cankey in TopologicalOrderNodes[i].iterkeys():
if TopologicalOrderNodes[i][cankey][1]==maxP:
candidatekey=cankey
break
hierachyP=TopologicalOrderNodes[i][candidatekey][2]
optimalCandidates[i]=(candidatekey,maxP,hierachyP)
print 'finishcalculating'
finalresult=[]
pieceOfWord=optimalCandidates[len(sentence)-1][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[len(sentence)-1][pieceOfWord][2]
while index!=None:
pieceOfWord=optimalCandidates[index][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[index][pieceOfWord][2]
finalresult.reverse()
return finalresult #print items
#############################################################################################
def calConditionPFirst(self,seg):
'''计算第能成为首词的候选节点的条件概率,输入参数seg为句子的一个片段'''
import re
p=re.compile('\d+')
# N=10002000;
N=1000023
smooth=1.0/N
tmp=p.findall(seg)
probability=smooth
if self.doubleWordDict.has_key(tmp[0]):
count=self.doubleWordDict[tmp[0]].get(seg,0)
if count>0:
probability=count/N
#print'call calConditionPFirst probability%s' %probability
return probability
#######################################################################################
def calConditionP(self,jointseg,prevseg):
'''计算其他节点的条件概率 jointseg两个词合一起的形式,prevseg为前一个词'''
import re
p=re.compile('\d+')
#N=10001600
#N=886000
V=52287.0 #词类数number of word type
smooth=1.0/V
conditionP=smooth
tmp=p.findall(jointseg)
assist=jointseg.split('|')[0]#找到第一个词
p_fist=re.compile(assist)
if self.doubleWordDict.has_key(tmp[0]):
count_joint=self.doubleWordDict[tmp[0]].get(jointseg,0)
count_sum_for_edge=sum([val for (key,val) in self.doubleWordDict[tmp[0]].iteritems() if p_fist.match(key)])
conditionP=(1.0+count_joint)/(V+count_sum_for_edge)
#prob_prev=self.singleWordDict[tmp[0]].get(prevseg)
#if prob_joint>0 and prob_prev>0:
#conditionP=prob_joint/prob_prev
#print 'call calConditonP probability=%s'%conditionP
return conditionP
###########################################################################################
def getSingleWordDict(self):
return self.singleWordDict
def getDoubleWordDict(self):
return self.doubleWordDict
######################################################################################
def __init__(self):
import cPickle as mypickle
self.singleWordDict=mypickle.load(file('mySingleWordTrie.dat'))
self.doubleWordDict=mypickle.load(file('myDoubleWordTrie2.dat'))
#########################################################################################
def calWordLength(self,string):
'''计算一个词含有多少个汉字
'''
import re
p=re.compile(r' ')
tmp=p.findall(string)
return len(tmp)+1
########################################################################################
def CreateWordGraph(self,sentence):#sentence为一个list,每一个元素为一个字
'''对于一个给定的句子生成词图
'''
NodesCollection={}#每一个节点信息为一个元组(start_p,max_p,hierachy) hierachy initialized 0
length=len(sentence)
for i in range(0,length):
expandlen=self.GetMaxLengthFromTrie(sentence[i])#句子中每个字,粗切分窗的最大长度
if expandlen==0:#说明以sentence[i]为首字的词在字典里不存在
NodesCollection[sentence[i]]=(1,1,None)
continue
else:
for k in range(1,expandlen+1):
sent=self.FormatSent(sentence[i:i+k])
if self.singleWordDict.has_key(sentence[i]):
tmpdict=self.singleWordDict[sentence[i]]
if tmpdict.has_key(sent):
prob=tmpdict[sent]
NodesCollection[sent]=(prob,prob,None)
#i
#print prob
#print sent
else:
if k==1:#以sentence[i]为首字的词在字典中存在,但是字典中不存在此单字
NodesCollection[sent]=(1,1,None)
return NodesCollection
#############################################################################################
def TopologicalSortWordGraph(self,sentence):
'''对原词图生成一个拓扑序,以后缀字为Hierachy'''
import re
NodesCollection=self.CreateWordGraph(sentence)
length=len(sentence)
TopologicalOrderNodes={}
for i in range(0,length):
TopologicalOrderNodes[i]={}
p=re.compile(sentence[i]+'$')
for key ,val in NodesCollection.iteritems():#从点集合中取出以此字为后缀的词,加入该阶梯
if p.search(key):
TopologicalOrderNodes[i][key]=val
return TopologicalOrderNodes
###########################################################################################
def GetMaxLengthFromTrie(self,key):
'''给出首字,查看以此字为首字的词的最大长度
'''
maxlen=0
if self.singleWordDict.has_key(key):
maxlen=max([self.calWordLength(subkey) for subkey in self.singleWordDict[key].iterkeys()])
return maxlen
############################################################################################
def FormatSent(self,sentence):
'''由字拼成短句'''
delimiter=' '
sent=delimiter.join(sentence)
return sent
##################################################################################################
def Segment(self,sentence):
'''如果句子中只有一个字不分词'''
#print 'segment'
if len(sentence)==1:
return sentence
else:
result=self.MyViterbi(sentence)
return result
#############################################################################################
def MyViterbi(self,sentence):
'''在词图上利用动态规划算法,维特比求最优解'''
import math
import re
#N=10001600
N=1000150
smooth=1.0/N#如果是稀缺字则概率值平滑为一个小值
TopologicalOrderNodes=self.TopologicalSortWordGraph(sentence)#获得一个具有拓扑序的图
optimalCandidates={}#存放每一个hierachy内最优的候选节点的键值,元素形式为(key,p,hierachy)
p=smooth#如果双字典中不存在以此字为首字的词,则概率平滑到一个小值
#首先求词网格lattice第一级 hierachy 0的最优解
if self.doubleWordDict.has_key(sentence[0]):
tmpp=self.doubleWordDict[sentence[0]].get(sentence[0]+'|'+'S')
if tmpp>0:
p=tmpp#如果双字典中存在以此单字为句首的情况,则p值取这个概率值
optimalCandidates[0]=(sentence[0],math.log(p,2),0)
for i in range(1,len(sentence)):#对于 hierachy 1 to len(sentence-1)
keysToRemove=[]#待去除的键值,以免它的存在影响最大概率的计算
for key in TopologicalOrderNodes[i].iterkeys():
keylen=self.calWordLength(key)#对于每一个hierachy遍历所有键
flag=i-keylen#flag表示回溯到标号为flag 的hierachy
p=re.compile('^\d+')
if flag<-1:
keysToRemove.append(key)
else:
if flag==-1:#表示词键值可以成为句首词
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
prob=self.calConditionPFirst(key)
prob=math.log(prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,None)
else:
keysToRemove.append(key)
else:
firstword=optimalCandidates[flag][0]#取出标号为flag的hierachy对应的词
#看看在句子中位置为flag+1的字,是不是等于key的首字,如果相等则利用标号为flag的hierachy的最优结果进行计算,否则将该key加入待删除集合
searchresult=p.findall(key)
if searchresult[0]==sentence[flag+1]:
jointseg=firstword+'|'+key
con_prob=self.calConditionP(jointseg,firstword)
prob=optimalCandidates[flag][1]+math.log(con_prob,2)
TopologicalOrderNodes[i][key]=(TopologicalOrderNodes[i][key][0],prob,flag)
else:
keysToRemove.append(key)
for toremove in keysToRemove:
TopologicalOrderNodes[i].pop(toremove)
maxP=max([value[1] for value in TopologicalOrderNodes[i].itervalues()])
for cankey in TopologicalOrderNodes[i].iterkeys():
if TopologicalOrderNodes[i][cankey][1]==maxP:
candidatekey=cankey
break
hierachyP=TopologicalOrderNodes[i][candidatekey][2]
optimalCandidates[i]=(candidatekey,maxP,hierachyP)
print 'finishcalculating'
finalresult=[]
pieceOfWord=optimalCandidates[len(sentence)-1][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[len(sentence)-1][pieceOfWord][2]
while index!=None:
pieceOfWord=optimalCandidates[index][0]
finalresult.append(pieceOfWord)
index=TopologicalOrderNodes[index][pieceOfWord][2]
finalresult.reverse()
return finalresult #print items
#############################################################################################
def calConditionPFirst(self,seg):
'''计算第能成为首词的候选节点的条件概率,输入参数seg为句子的一个片段'''
import re
p=re.compile('\d+')
# N=10002000;
N=1000023
smooth=1.0/N
tmp=p.findall(seg)
probability=smooth
if self.doubleWordDict.has_key(tmp[0]):
count=self.doubleWordDict[tmp[0]].get(seg,0)
if count>0:
probability=count/N
#print'call calConditionPFirst probability%s' %probability
return probability
#######################################################################################
def calConditionP(self,jointseg,prevseg):
'''计算其他节点的条件概率 jointseg两个词合一起的形式,prevseg为前一个词'''
import re
p=re.compile('\d+')
#N=10001600
#N=886000
V=52287.0 #词类数number of word type
smooth=1.0/V
conditionP=smooth
tmp=p.findall(jointseg)
assist=jointseg.split('|')[0]#找到第一个词
p_fist=re.compile(assist)
if self.doubleWordDict.has_key(tmp[0]):
count_joint=self.doubleWordDict[tmp[0]].get(jointseg,0)
count_sum_for_edge=sum([val for (key,val) in self.doubleWordDict[tmp[0]].iteritems() if p_fist.match(key)])
conditionP=(1.0+count_joint)/(V+count_sum_for_edge)
#prob_prev=self.singleWordDict[tmp[0]].get(prevseg)
#if prob_joint>0 and prob_prev>0:
#conditionP=prob_joint/prob_prev
#print 'call calConditonP probability=%s'%conditionP
return conditionP
###########################################################################################
def getSingleWordDict(self):
return self.singleWordDict
def getDoubleWordDict(self):
return self.doubleWordDict