[Text Mining] English Word Segmentation with Simulated Annealing
The rough idea is to treat the chunks that repeat most often in the text as the lexicon. The cost function is the sum of the total character length of the lexicon (each distinct word counted once) and the number of word tokens in the segmentation; the annealer iterates toward the segmentation, and the lexicon it implies, that minimizes this cost. The method is simple, and the results it produces are quite interesting.
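To make the cost concrete, here is a minimal hand computation (the toy token list is my own, not from the post). Repeated words are counted only once on the lexicon side, which is what rewards the search for discovering recurring chunks:

# Toy cost: token count plus total length of the distinct words.
words = ['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy']
text_size = len(words)                   # 8 tokens
lexicon_size = len(''.join(set(words)))  # 6 distinct words, 21 characters
print(text_size + lexicon_size)          # cost = 29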
from random import randint


def segment(text, segs):
    # segs is a bit string; '1' at position i puts a word boundary after text[i]
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words


def evaluate(text, segs):
    # Cost = number of word tokens + total length of the distinct words (the lexicon)
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(''.join(set(words)))
    return text_size + lexicon_size


def flip(segs, pos):
    # Toggle a single boundary bit
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]


def flip_n(segs, n):
    # Toggle n randomly chosen boundary bits
    for i in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs


def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            # The number of bits flipped per guess shrinks as the temperature cools
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(temperature, evaluate(text, segs), segment(text, segs))
    print('----------END--------------------')
    return segs
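A usage sketch under assumed inputs (the repeated-phrase text, the all-zeros starting segmentation, and the 5000/1.2 annealing parameters are my own illustrative choices):

# Hypothetical driver: a text with heavy repetition and no initial boundaries.
text = 'doyouseethekittyseethedoggydoyoulikethekittylikethedoggy'
segs = '0' * (len(text) - 1)  # one potential boundary between each pair of characters
anneal(text, segs, 5000, 1.2)

Each cooling step prints the current temperature, cost, and segmentation, so you can watch the cost fall and recurring chunks like the kitty/doggy phrases emerge.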