伪 Trie 树:词典存储方式与敏感词检测

伪 Trie:现在机器的内存空间那么大,用不着专门弄个字符集,否则每次找后继结点都太累了。这个实现虽然很简单,但方法本身还是比较可取的,是一种比较经典的词典存储方式。

可以用来做词典、敏感词汇检测等。

我下面这个例子就是敏感词汇检测,很简单,用 C# 写比较方便。

 

使用方法就是先定义一个Trei类

初始化 Trei 的时候,用 List<string> 传入一组要匹配的敏感词,或者你词典中的词。

然后调用 Trei 类的 is_dirty() 方法,检查一段文本中是否包含这些词。

 

另外,把它改成 Trie 图的话,还可以支持容错输入等功能。

 

/// <summary>
/// Character-level trie used to test whether a text contains any word from a
/// fixed word list. Nodes live in a flat list; each node maps a character to
/// the list index of the child reached through that character.
/// </summary>
class Trei
    {
        // Words the trie is built from (never null after construction).
        private readonly List<string> dirty_words;

        // Flat node storage; index 0 is always the root. This is an instance
        // field so that constructing a second Trei cannot replace the tree of
        // an existing instance.
        private readonly List<Node> TreiTree;

        /// <summary>
        /// Builds the trie from the given word list.
        /// </summary>
        /// <param name="dirty_words">
        /// Words to detect. A null list is treated as empty; null or empty
        /// entries are ignored.
        /// </param>
        public Trei(List<string> dirty_words)
        {
            this.dirty_words = dirty_words ?? new List<string>();
            TreiTree = new List<Node>();
            TreiTree.Add(new Node(false, new Dictionary<char, int>(), 0, 0));
            build_trei_tree();
        }

        /// <summary>
        /// Reports whether any word of the list occurs as a contiguous
        /// substring of <paramref name="_words"/>.
        /// </summary>
        /// <param name="_words">Text to scan. Null or empty text is never dirty.</param>
        /// <returns>True when at least one listed word is found; otherwise false.</returns>
        public bool is_dirty(string _words)
        {
            if (string.IsNullOrEmpty(_words))
            {
                return false;
            }

            // Restart the walk from the root at every start position. Simply
            // resetting to the root at the mismatching character (without
            // going back) would miss words that begin inside a failed partial
            // match, e.g. words {"abc", "bd"} against the text "abd".
            for (int start = 0; start < _words.Length; start++)
            {
                Node current = TreiTree[0];
                for (int i = start; i < _words.Length; i++)
                {
                    int childIndex;
                    if (!current.next_nodes.TryGetValue(_words[i], out childIndex))
                    {
                        break;
                    }

                    current = TreiTree[childIndex];
                    if (current.is_dirty)
                    {
                        return true;
                    }
                }
            }

            return false;
        }

        /// <summary>
        /// Inserts every word character by character. The same scheme could be
        /// extended to insert whole words, Chinese characters, and so on.
        /// Writes a progress line to the console before and after building.
        /// </summary>
        private void build_trei_tree()
        {
            Console.WriteLine("Building the tree now.");
            foreach (string word in dirty_words)
            {
                if (string.IsNullOrEmpty(word))
                {
                    continue;
                }

                Node current = TreiTree[0];
                for (int i = 0; i < word.Length; i++)
                {
                    int childIndex;
                    if (current.next_nodes.TryGetValue(word[i], out childIndex))
                    {
                        current = TreiTree[childIndex];
                    }
                    else
                    {
                        // The new node is appended, so its index is the
                        // current list size; its level is the 1-based depth.
                        childIndex = TreiTree.Count;
                        Node child = new Node(false, new Dictionary<char, int>(), childIndex, i + 1);
                        TreiTree.Add(child);
                        current.next_nodes.Add(word[i], childIndex);
                        current = child;
                    }
                }

                // The node reached by the last character marks a complete word.
                current.is_dirty = true;
            }
            Console.WriteLine("Building step done.");
        }

        /// <summary>
        /// Prints the characters of the trie layer by layer (breadth-first),
        /// separated by spaces, starting a new console line whenever the
        /// traversal reaches a deeper level.
        /// </summary>
        public void print_tree()
        {
            Queue<Node> pending = new Queue<Node>();
            pending.Enqueue(TreiTree[0]);
            int currentLevel = 0;
            while (pending.Count > 0)
            {
                Node node = pending.Dequeue();
                if (currentLevel != node.level)
                {
                    Console.WriteLine();
                    currentLevel = node.level;
                }

                foreach (KeyValuePair<char, int> edge in node.next_nodes)
                {
                    pending.Enqueue(TreiTree[edge.Value]);
                    Console.Write(edge.Key + " ");
                }
            }
        }
    }

    class Node
    {
        public bool is_dirty; 、、 是不是最后一个字

        public int idx;
        public int level;
        public Dictionary<char, int> next_nodes;
        public Node(bool _end, Dictionary<char, int> _nn, int _idx, int _l)
        {
            is_dirty = _end;
            next_nodes = _nn;
            idx = _idx;
            level = _l;
        }
    }


posted on 2011-02-18 10:37  amojry  阅读(780)  评论(0)  编辑  收藏  举报