1. First, build the dictionaries. "Dictionary" here means a data structure built by collecting word-frequency statistics from the training corpus, not a dictionary in the Xinhua Dictionary sense. My implementation builds two of them: a "single-word" dictionary that counts how many times each word occurs, and a "double-word" dictionary that counts how many times each pair of adjacent words occurs (since a bigram language model is used). Each dictionary is then turned into a one-level Trie, which can also be thought of as a dictionary with a "first-character index".
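To make the layout of these structures concrete, here is a minimal sketch of what the three objects end up looking like. The word IDs '19' and '388' and the frequency values are made up purely for illustration; the real entries are produced by the scripts below.

# Hypothetical entries, for illustration only
mySingleWordDict = {'19': 0.012, '388': 0.0007}          # word -> relative frequency
myDoubleWordDict = {'19|388': 0.0003, '19|S': 0.0001}    # 'w1|w2', or 'w1|S' for a sentence-initial word
myTrie = {'19': {'19|388': 0.0003, '19|S': 0.0001}}      # first word -> all entries starting with it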
The code for building them is as follows.
Building the "single-word" dictionary from the training corpus:
# -*- coding: cp936 -*-
import re
import cPickle as mypickle

def datafile(name, sep='|'):
    '''Yield each line of the corpus, split on the separator.'''
    for line in file(name):
        yield line.split(sep)

candidates = datafile(r'C:\Python26\Bigramwordsegemtation\data\training.txt')
p1 = re.compile(r'(^\s+|\s+$)')   # strip leading/trailing whitespace
p2 = re.compile(r'\d')            # words are encoded as numbers, so keep tokens that start with a digit

mySingleWordDict = {}
for m in candidates:
    for e in m:
        e = p1.sub('', e)
        if p2.match(e):
            # count each word; store counts as floats so the later division works under Python 2
            mySingleWordDict[e] = float(mySingleWordDict.get(e, 0) + 1)
            print 'word %s, count %s' % (e, mySingleWordDict[e])

# turn raw counts into relative frequencies
N = sum(mySingleWordDict.itervalues())
for key in mySingleWordDict.iterkeys():
    mySingleWordDict[key] = mySingleWordDict[key] / N

fid = file('SingleWordDictionaryCrossValidation.dat', 'w')
mypickle.dump(mySingleWordDict, fid)
fid.close()
print 'finish'
print N
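As a quick sanity check, the pickled unigram table can be loaded back and queried. This is only a sketch; the word ID '19' is a made-up placeholder.

import cPickle as mypickle

uni = mypickle.load(file('SingleWordDictionaryCrossValidation.dat'))
# the stored values are relative frequencies, so they should sum to roughly 1
print sum(uni.itervalues())
# relative frequency of a single word ('19' is a hypothetical word ID)
print uni.get('19', 0.0)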
Building the "double-word" (bigram) dictionary from the training corpus:
# -*- coding: cp936 -*-
import re
import cPickle as mypickle

delimiter = '|'

def datafile(name, sep='|'):
    '''Use a generator to create an iterable over the corpus lines.'''
    for line in file(name):
        yield line.split(sep)

candidates = datafile(r'c:\python26\Bigramwordsegemtation\data\training.txt')
p1 = re.compile(r'(^\s+|\s+$)')   # strip leading/trailing whitespace
p2 = re.compile(r'\d')            # keep only tokens that start with a digit (numeric word codes)

myDoubleWordDict = {}
for m in candidates:
    singleline = []
    for e in m:
        e = p1.sub('', e)
        if p2.match(e):
            singleline.append(e)
    if len(singleline) >= 2:
        # the first word of a line is also counted with the sentence-start marker, as 'word|S'
        initial = singleline[0] + delimiter + 'S'
        myDoubleWordDict[initial] = float(myDoubleWordDict.get(initial, 0) + 1)
        print 'bigram %s, count %s' % (initial, myDoubleWordDict[initial])
        # count every pair of adjacent words as 'word1|word2'
        for i in range(len(singleline) - 1):
            c = delimiter.join(singleline[i:i + 2])
            myDoubleWordDict[c] = float(myDoubleWordDict.get(c, 0) + 1)
            print 'bigram %s, count %s' % (c, myDoubleWordDict[c])

# turn raw counts into relative frequencies
N = sum(myDoubleWordDict.itervalues())
for key in myDoubleWordDict.iterkeys():
    myDoubleWordDict[key] = myDoubleWordDict[key] / N

fid = file('DoubleWordDictionaryCrossValidation2.dat', 'w')
mypickle.dump(myDoubleWordDict, fid)
fid.close()
print 'finish'
print N
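Both tables store relative frequencies, each normalized by its own total, so dividing a bigram frequency by the corresponding unigram frequency gives a value proportional to the conditional probability P(w2 | w1). The sketch below only illustrates how the stored values relate; the probability model actually used by the segmenter lives in the main algorithm module, and the word IDs '19' and '388' are hypothetical.

import cPickle as mypickle

uni = mypickle.load(file('SingleWordDictionaryCrossValidation.dat'))
bi = mypickle.load(file('DoubleWordDictionaryCrossValidation2.dat'))

def cond_freq_ratio(w1, w2):
    '''Ratio of the stored bigram frequency to the stored unigram frequency.
    Proportional to P(w2 | w1); no smoothing is applied here.'''
    denom = uni.get(w1, 0.0)
    if denom == 0.0:
        return 0.0
    return bi.get(w1 + '|' + w2, 0.0) / denom

print cond_freq_ratio('19', '388')   # '19' and '388' are made-up word IDs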
Building the one-level Trie from a dictionary:
# -*- coding: cp936 -*-
import re
import cPickle as mypickle

p = re.compile(r'\d+')   # extracts the numeric word codes from a 'w1|w2' key
myDict = mypickle.load(file('DoubleWordDictionaryCrossValidation.dat'))

# one-level Trie: group entries under the first word of each key (the first-character index)
myTrie = {}
for key in myDict.iterkeys():
    tmp = p.findall(key)
    if tmp[0] not in myTrie:
        myTrie[tmp[0]] = {}
for (key, val) in myDict.iteritems():
    tmp = p.findall(key)
    myTrie[tmp[0]][key] = val
    print 'first-level key %s, second-level key %s, value %s' % (tmp[0], key, val)

fid = file('myDoubleWordTrieCrossValidation.dat', 'w')
mypickle.dump(myTrie, fid)
fid.close()
print 'finish'
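Once pickled, the one-level Trie is queried by first word: a single lookup returns the sub-dictionary of all entries that begin with that word, instead of scanning the whole bigram table. A minimal sketch, with '19' again standing in for a real word ID:

import cPickle as mypickle

trie = mypickle.load(file('myDoubleWordTrieCrossValidation.dat'))

w1 = '19'                      # hypothetical word ID
followers = trie.get(w1, {})   # all bigram entries whose first word is w1
for key, freq in followers.iteritems():
    print key, freq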
The next part is the main algorithm module. In that module, the data structures we consult are the one-level Trie built from the "single-word" dictionary and the one-level Trie built from the "double-word" dictionary.