【算法】基于树形结构分词

#!/usr/bin/env python
#encoding=gbk
import os
import sys
import Queue

G_ENCODING="gbk"
"""
===============================
中文分词
1. 机械分词
2. 统计分词
3. 理解分词
===============================
基于树形结构分词策略(结合机械分词,统计分词)
例:笔记本电脑 
    dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}
         -------------------------------
        |              <s>              |
         -------------------------------
        /         /         \           \
      [笔]     [笔记]    [笔记本]    [笔记本电脑]
       /          /        /   \
     [记]       [本]     [电] [电脑]
      /         /  \       /
    [本]      [电] [电脑] [脑]
    /  \       /
 [电] [电脑] [脑]
  /
[脑] 
-------------------------------
path: 笔 记 本 电 脑  -- score: [0.32768]
path: 笔 记 本 电脑   -- score: [0.4608]
path: 笔记 本 电 脑   -- score: [0.4608]
path: 笔记 本 电脑    -- score: [0.648]
path: 笔记本 电 脑    -- score: [0.576]
path: 笔记本 电脑     -- score: [0.81]
path: 笔记本电脑      -- score: [0.9]

best path: 笔记本电脑 -- score: [0.9]

-------------------------------
1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重)
2、最少切分、OOV、最少单字等策略
==获取最佳分词路径
-------------------------------
Q1、如果句子过长,树非常大,遍历费时(需优化)
Q2、字典加载(需优化)
以下给出该思想的简单实现[python]:
"""

class Stack():
    """LIFO stack backed by a Python list that grows on demand.

    The previous implementation wrote into a pre-sized list starting at
    index 1, so the default ``volume=0`` could not hold a single element and
    any push beyond ``volume - 1`` elements raised ``IndexError``.  Storage
    is now unbounded.

    Attributes:
        list: the stored elements, bottom of the stack first.
        top:  number of elements currently stored.
    """

    def __init__(self, volume=0):
        """Create an empty stack.

        Args:
            volume: accepted for backward compatibility only; the stack no
                longer has a fixed capacity, so the value is ignored.
        """
        self.list = []
        self.top = 0

    def push(self, element):
        """Place ``element`` on top of the stack."""
        self.list.append(element)
        self.top += 1

    def pop(self):
        """Remove and return the top element, or ``None`` if the stack is empty."""
        if self.top == 0:
            return None
        self.top -= 1
        return self.list.pop()

    def empty(self):
        """Return True when the stack holds no elements."""
        return self.top == 0
    
class Node:
    """A single vertex of the segmentation tree.

    Attributes:
        data:   payload stored in the node (the root is created with "<s>").
        next:   list of child nodes; a fresh list is created when omitted.
        prev:   parent node, or None.
        depth:  distance from the root (maintained by Tree.append).
        wlen:   integer offset stored with the node (Tokenizer.load passes
                the end index of the word inside the sentence).
        weight: numeric weight attached to the node.
    """

    def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0):
        # Never share one default list between instances.
        if next is None:
            next = []
        self.data, self.prev = data, prev
        self.next = next
        self.depth, self.wlen, self.weight = depth, wlen, weight

    def isLeaf(self):
        """Return True when the node has no children (``next`` is None or empty)."""
        children = self.next
        if children is None:
            return True
        return children == []

class Tree():
    """N-ary tree addressed through its root node.

    Nodes are expected to expose ``next`` (list of children), ``prev``
    (parent) and ``depth`` attributes.
    """

    def __init__(self, root=None):
        self.root = root

    def append(self, node, cnode):
        """Attach ``cnode`` as the last child of ``node``.

        Sets ``cnode.prev`` to ``node`` and ``cnode.depth`` to
        ``node.depth + 1``.

        Returns:
            0 on success, -1 when either argument is None.
        """
        if node is None or cnode is None:
            return -1
        node.next.append(cnode)
        cnode.prev = node
        cnode.depth = node.depth + 1
        return 0

    def depth_first_search(self, node):
        """Return every node of the subtree rooted at ``node`` in pre-order.

        Children are visited left to right.  The traversal is iterative and
        uses a plain list as the stack, so neither the width nor the depth
        of the tree is limited by a fixed buffer size.

        Returns:
            A list of nodes; empty when ``node`` is None.
        """
        visited = []
        if node is None:
            return visited
        pending = [node]
        while pending:
            current = pending.pop()
            visited.append(current)
            # Push children right-to-left so the leftmost child pops first.
            pending.extend(reversed(current.next))
        return visited

class Tokenizer():
    """Dictionary-driven segmenter that enumerates all segmentations as a tree."""

    def load(self, tree, cache, dict, max_word_len=5):
        """Expand ``tree`` breadth-first with every candidate word of ``cache``.

        Starting at each node's ``wlen`` offset, every slice of up to
        ``max_word_len`` characters is encoded with ``G_ENCODING`` and added
        as a child when it is a dictionary entry.  A single character is
        always added, even when it is not in the dictionary, so that every
        branch can reach the end of the sentence.

        Args:
            tree: Tree whose root is the start-of-sentence node.
            cache: the sentence as a unicode string.
            dict: mapping from encoded word to weight.
            max_word_len: longest candidate word, in characters.

        Returns:
            0, always.
        """
        from collections import deque

        pending = deque([tree.root])
        clen = len(cache)
        while pending:
            node = pending.popleft()
            start = node.wlen
            stop = min(clen, start + max_word_len)
            for end in range(start + 1, stop + 1):
                word = cache[start:end].encode(G_ENCODING)
                # Compare the character count, not the encoded byte length:
                # one multi-byte character is longer than 1 after encoding.
                if word in dict or end - start == 1:
                    child = Node(word, wlen=end, weight=dict.get(word, 0.0))
                    tree.append(node, child)
                    pending.append(child)
        return 0

    def backtrance(self, node, list):
        """Append the ancestors of ``node`` to ``list``, nearest first.

        The walk stops before the root, i.e. at a missing parent or at a
        parent whose data is "<s>".  Implemented iteratively so long paths
        do not hit the recursion limit.

        Returns:
            0, always; the result is delivered through ``list``.
        """
        current = node
        while current.prev is not None and current.prev.data != "<s>":
            list.append(current.prev)
            current = current.prev
        return 0

    def bestpath(self, tree):
        """Print every root-to-leaf path with its score and return the best one.

        The score of a path is the product of the positive weights along
        it; a node whose weight is missing or not positive contributes a
        factor of 1.  Refinements such as frequency-based weighting, fewest
        cuts, OOV handling or fewest single characters are not implemented.

        Returns:
            The words of the highest-scoring path joined by single spaces,
            or "" when the tree holds nothing but its root.
        """
        highest_score = 0
        best = ""

        for node in tree.depth_first_search(tree.root):
            if not node.isLeaf():
                continue
            if node.prev is None:
                # A childless root means an empty sentence: there is no path.
                continue
            # Walk from the leaf back to the root, then restore word order.
            path = [node]
            self.backtrance(node, path)
            path.reverse()

            score = 1.0
            for item in path:
                weight = item.weight
                if weight is not None and weight > 0:
                    score *= weight
            text = " ".join([item.data for item in path]).strip()

            if score > highest_score:
                highest_score = score
                best = text
            print("path: %s -- score: [%s]" % (text, score))
        print("\nbest path: %s -- score: [%s]" % (best, highest_score))
        return best

def example():
    """Segment the sample sentence and print all candidate paths.

    Builds the segmentation tree for "笔记本电脑" from a small in-line
    dictionary, prints every path with its score, and returns the best path.
    """
    sent = "笔记本电脑"
    # Every single character needs its own entry; the multi-character words
    # carry higher weights so that longer matches win.
    lexicon = {
        "笔": 0.8, "记": 0.8, "本": 0.8, "电": 0.8, "脑": 0.8,
        "笔记": 0.9, "笔记本": 0.99, "电脑": 0.99, "笔记本电脑": 0.97,
    }
    cache = unicode(sent, G_ENCODING)
    tokenizer = Tokenizer()
    tree = Tree(Node("<s>"))

    # Build the tree of candidate words.
    tokenizer.load(tree, cache, lexicon)
    # Trace back from every leaf and pick the best-scoring path.
    return tokenizer.bestpath(tree)

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    example()

 

posted on 2014-10-30 16:42  有个姑娘叫小芳  阅读(257)  评论(0)  编辑  收藏  举报