【算法】基于树形结构分词
#!/usr/bin/env python
#encoding=gbk
import os
import sys
import Queue
G_ENCODING="gbk"
"""
===============================
中文分词
1. 机械分词
2. 统计分词
3. 理解分词
===============================
基于树形结构分词策略(结合机械分词,统计分词)
例:笔记本电脑
dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
-------------------------------
<s>
|-- [笔]
|   `-- [记]
|       `-- [本]
|           |-- [电]
|           |   `-- [脑]
|           `-- [电脑]
|-- [笔记]
|   `-- [本]
|       |-- [电]
|       |   `-- [脑]
|       `-- [电脑]
|-- [笔记本]
|   |-- [电]
|   |   `-- [脑]
|   `-- [电脑]
`-- [笔记本电脑]
-------------------------------
path: 笔 记 本 电 脑 -- score: [0.32768]
path: 笔 记 本 电脑 -- score: [0.4608]
path: 笔记 本 电 脑 -- score: [0.4608]
path: 笔记 本 电脑 -- score: [0.648]
path: 笔记本 电 脑 -- score: [0.576]
path: 笔记本 电脑 -- score: [0.81]
path: 笔记本电脑 -- score: [0.9]
best path: 笔记本电脑 -- score: [0.9]
-------------------------------
1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重)
2、最少切分、OOV、最少单字等策略
==获取最佳分词路径
-------------------------------
Q1、如果句子过长,树非常大,遍历费时(需优化)
Q2、字典加载(需优化)
以下给出该思想的简单实现[python]:
"""
class Stack(object):
    """Minimal last-in/first-out stack.

    Storage grows on demand, so pushing never fails because of a
    pre-allocated capacity, and popping an empty stack leaves the stack
    in a consistent (still empty) state.
    """

    def __init__(self, volume = 0):
        """Create an empty stack.

        volume -- accepted for backward compatibility only; it used to
                  pre-allocate a fixed number of slots and is now ignored
                  because the underlying list grows as needed.
        """
        self.list = []  # stored elements, bottom of the stack first
        self.top = 0    # number of elements currently stored

    def push(self, element):
        """Place ``element`` on top of the stack."""
        self.list.append(element)
        self.top += 1

    def pop(self):
        """Remove and return the top element, or None when the stack is empty.

        None is also a legal element, so use empty() to tell the two apart.
        """
        if self.top == 0:
            return None
        self.top -= 1
        return self.list.pop()

    def empty(self):
        """Return True when the stack holds no elements."""
        return self.top == 0
class Node():
    """A single candidate word inside the segmentation tree.

    Attributes:
        data   -- the word (or sentinel text) carried by this node
        next   -- list of child nodes
        prev   -- parent node, or None when the node has no parent
        depth  -- distance from the root (maintained by Tree.append)
        wlen   -- offset in the sentence at which this word ends
        weight -- score attached to the word
    """

    def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0):
        # Give every node its own child list unless the caller supplied one.
        if next is None:
            next = []
        self.data = data
        self.next = next
        self.prev = prev
        self.depth = depth
        self.wlen = wlen
        self.weight = weight

    def isLeaf(self):
        """Return True when the node has no children."""
        children = self.next
        if children is None:
            return True
        return children == []
class Tree(object):
    """Rooted n-ary tree whose nodes expose ``next``, ``prev`` and ``depth``."""

    def __init__(self, root = None):
        """Create a tree with the given root node (may be None)."""
        self.root = root

    def append(self, node, cnode):
        """Attach ``cnode`` as the last child of ``node``.

        Sets the child's ``prev`` link and its ``depth`` (parent depth + 1).
        Returns 0 on success, -1 when either argument is None.
        """
        if node is None or cnode is None:
            return -1
        node.next.append(cnode)
        cnode.prev = node
        cnode.depth = node.depth + 1
        return 0

    def depth_first_search(self, node):
        """Return the nodes of the subtree rooted at ``node`` in preorder.

        Children are visited in the order they were appended.  Returns an
        empty list when ``node`` is None.
        """
        visited = []
        if node is None:
            return visited
        # A plain list is used as the stack so that the number of pending
        # nodes is not limited by a fixed capacity.
        pending = [node]
        while pending:
            current = pending.pop()
            visited.append(current)
            # Push children right-to-left so the leftmost is popped first.
            pending.extend(reversed(current.next))
        return visited
class Tokenizer(object):
    """Dictionary-driven segmenter that enumerates candidate splits as a tree."""

    def load(self, tree, cache, dict, max_word_len = 5, oov_weight = 0.0):
        """Expand ``tree`` with every candidate segmentation of ``cache``.

        Starting at ``tree.root``, each node receives one child per word
        that begins where the node's own word ends (``node.wlen``).  A span
        becomes a child when it is a key of ``dict`` or when it is a single
        character, so every root-to-leaf path covers the whole sentence.

        tree         -- tree whose root has ``wlen`` 0; extended in place
        cache        -- the sentence (a ``unicode`` object on Python 2)
        dict         -- mapping from word to weight
        max_word_len -- longest span, in characters, that is looked up
        oov_weight   -- weight given to single characters missing from
                        ``dict``; bestpath() treats weights <= 0 as neutral
        Returns 0.
        """
        total = len(cache)
        # Only the children of a given parent need a stable order, so the
        # order in which parents are expanded does not affect the tree.
        pending = [tree.root]
        while pending:
            parent = pending.pop()
            start = parent.wlen
            stop = min(total, start + max_word_len)
            for end in range(start + 1, stop + 1):
                piece = cache[start:end]
                if isinstance(piece, str):
                    # Already the native string type: use it as the key.
                    word = piece
                else:
                    # Python 2 ``unicode``: dictionary keys are byte strings.
                    word = piece.encode(G_ENCODING)
                if word in dict:
                    weight = dict[word]
                elif end - start == 1:
                    # Count characters, not encoded bytes: a CJK character
                    # is two bytes in GBK but is still a single character.
                    weight = oov_weight
                else:
                    continue
                child = Node(word, wlen = end, weight = weight)
                tree.append(parent, child)
                pending.append(child)
        return 0

    def backtrance(self, node, list):
        """Append the ancestors of ``node`` to ``list``, nearest first.

        The walk stops before the "<s>" sentinel (or at a node without a
        parent), so the sentinel itself is never appended.  Returns 0.
        """
        parent = node.prev
        while parent is not None and parent.data != "<s>":
            list.append(parent)
            parent = parent.prev
        return 0

    def bestpath(self, tree):
        """Print every complete path with its score and return the best one.

        A path's score is the product of its nodes' weights; nodes whose
        weight is missing or not positive are treated as neutral (factor 1).
        More elaborate strategies (fewest cuts, OOV handling, fewest single
        characters) are not implemented here.  The first path with the
        highest score wins.  Returns the best path as space-separated words,
        or "" when the tree contains nothing but its root.
        """
        highest = 0
        best = ""
        for node in tree.depth_first_search(tree.root):
            # A childless root means there is nothing to segment; do not
            # report the sentinel as a path.
            if node is tree.root or not node.isLeaf():
                continue
            chain = [node]
            self.backtrance(node, chain)
            chain.reverse()
            score = 1.0
            for item in chain:
                weight = item.weight
                score *= weight if weight and weight > 0 else 1
            text = " ".join([item.data for item in chain]).strip()
            if score > highest:
                highest = score
                best = text
            print("path: %s -- score: [%s]" % (text, score))
        print("\nbest path: %s -- score: [%s]" % (best, highest))
        return best
def example():
    """Segment the sample phrase and print each candidate path with its score."""
    sent = "笔记本电脑"
    # Toy lexicon: every known word mapped to its weight.
    weights = {
        "笔": 0.8,
        "记": 0.8,
        "本": 0.8,
        "电": 0.8,
        "脑": 0.8,
        "笔记": 0.9,
        "笔记本": 0.99,
        "电脑": 0.99,
        "笔记本电脑": 0.97,
    }

    # The "<s>" node is the sentinel root every segmentation starts from.
    tree = Tree(Node("<s>"))
    tokenizer = Tokenizer()

    # Build the candidate tree from the decoded sentence ...
    tokenizer.load(tree, unicode(sent, G_ENCODING), weights)
    # ... then walk back from every leaf and report the best path.
    tokenizer.bestpath(tree)
example()