基于.Net Framework 3.5的Lucene.Net 中文词组匹配分词器

可以自己看看是不是很高效。为了加快速度,尽量精简了算法。测试表明,精确度还可以。
由于没有实现完整的一套字典机制,而是普通的文本字典,所以就不提供完整源码下载了,贴出核心的源码。从版本完整度上来说只能算是0.6版。
另外,本分词系统使用的词库是ShootAnalyzer的词库。

使用方法:

参考以下代码

 1         [TestMethod]
 2         public void TestMethod1()
 3         {
 4             //
 5             // TODO: 在此    添加测试逻辑
 6             //
 7 
 8             Participle p = new Participle();
 9             p.Init(@"D:\labs\xxxx");
10             string txt = @"天下真的有神吗?我不是呀";
11             string outstr = string.Empty;
12             Stopwatch st = new Stopwatch();
13             st.Start();
14             outstr = p.TextSpliter(txt);
15             st.Stop();
16 
17             Stopwatch st2 = new Stopwatch();
18             st2.Start();
19             List<string> hs = p.TextArray(txt);
20             st2.Stop();
21             Console.WriteLine(outstr);
22             Console.WriteLine(st.ElapsedMilliseconds.ToString("f2"));
23             Console.WriteLine(st2.ElapsedMilliseconds.ToString("f2"));
24 
25             YurowAnalyzer.YurowAnalyzer y = new YurowAnalyzer.YurowAnalyzer(@"D:\labs\xxxx");
26             TokenStream t = y.TokenStream(nullnew StringReader(txt));
27 
28             Token token = t.Next();
29             while (token != null)
30             {
31                 Console.WriteLine(token.TermText() + "\t" + token.StartOffset() + "\t" + token.EndOffset());
32                 token = t.Next();
33             }
34             t.Close();
35         }

在Lucene.Net 索引或者搜索中直接使用YurowAnalyzer.YurowAnalyzer 分析器。


下载地址:
https://files.cnblogs.com/birdshover/YurowAnalyzer.rar


下面贴上些关键源码:
Participle类(分词类)
  1 
  2         public List<int> StartArr;
  3 
  4         public List<string> TextArray(string text)
  5         {
  6             List<string> hs = new List<string>();
  7             StartArr = new List<int>();
  8             int start = 0;
  9             for (int i = 0; i < text.Length; i++)
 10             {
 11                 char nowchar = text[i];
 12                 char nextchar = (i == text.Length - 1? '\0' : text[i + 1];
 13                 if (DataCatch.EnglishChar.Contains(nowchar))
 14                 {
 15                     if (start < 1)
 16                         start = i;
 17                     if (DataCatch.EnglishChar.Contains(nextchar))
 18                         i++;
 19                     else
 20                     {
 21                         hs.Add(text.Substring(start, i - start));
 22                         StartArr.Add(start);
 23                         start = 0;
 24                     }
 25                     continue;
 26                 }
 27 
 28                 if (DataCatch.Num.Contains(nowchar))
 29                 {
 30                     if (start < 1)
 31                         start = i;
 32                     if (DataCatch.Num.Contains(nextchar))
 33                     {
 34                         i++;
 35                     }
 36                     else
 37                     {
 38                         hs.Add(text.Substring(start, i - start));
 39                         StartArr.Add(start);
 40                         start = 0;
 41                     }
 42                     continue;
 43                 }
 44                 if (nowchar == ' ')
 45                 {
 46                     continue;
 47                 }
 48                 if (nextchar == ' ' || nextchar == '\0')
 49                 {
 50                     hs.Add(nowchar.ToString());
 51                     StartArr.Add(i);
 52                     i++;
 53                     continue;
 54                 }
 55                 if (DataCatch.GetDict().ContainsKey(nowchar) && DataCatch.GetDict()[nowchar].ContainsKey(nextchar))
 56                 {
 57                     HashSet<string> list = DataCatch.GetDict()[nowchar][nextchar];
 58                     if (list.Count == 0)
 59                     {
 60                         hs.Add(nowchar.ToString() + nextchar.ToString());
 61                         StartArr.Add(i);
 62                         i++;
 63                         continue;
 64                     }
 65                     int maxnum = 0;
 66                     string temp = string.Empty;
 67                     string outstr = string.Empty;
 68                     foreach (string item in list)
 69                     {
 70                         if (text.Length - i > item.Length + 1)
 71                         {
 72                             temp = text.Substring(i + 2, item.Length);
 73                             if (list.Contains(temp))
 74                             {
 75                                 if (maxnum > item.Length)
 76                                     continue;
 77                                 else
 78                                 {
 79                                     maxnum = item.Length;
 80                                     outstr = temp;
 81                                 }
 82                             }
 83                         }
 84                     }
 85                     if (!string.IsNullOrEmpty(outstr))
 86                     {
 87                         hs.Add(nowchar.ToString() + nextchar.ToString() + outstr);
 88                         StartArr.Add(i);
 89                         i = i + maxnum + 1;
 90                     }
 91                     else
 92                     {
 93                         hs.Add(nowchar.ToString() + nextchar.ToString());
 94                         StartArr.Add(i);
 95                         i++;
 96                     }
 97                 }
 98                 else
 99                 {
100                     hs.Add(nowchar.ToString());
101                     StartArr.Add(i);
102                 }
103             }
104             return hs;
105         }



DefaultDict类(加载分词具体实现)

/// <summary>
/// In-memory dictionary: first character of a word -> second character -> set of
/// continuations (the rest of each word after its first two characters).
/// A two-character word contributes an entry with no continuation.
/// </summary>
private Dictionary<char, Dictionary<char, HashSet<string>>> dictMemory = new Dictionary<char, Dictionary<char, HashSet<string>>>(DataCatch.InitPage);

/// <summary>
/// Reads the plain-text dictionary at <c>dictSourcePath</c>, one word per line, into
/// <c>dictMemory</c>. Lines shorter than two characters are ignored.
/// </summary>
/// <remarks>
/// The file is decoded with <c>Encoding.Default</c>, i.e. the machine's ANSI code page.
/// NOTE(review): this makes loading machine-dependent; confirm the dictionary's encoding.
/// </remarks>
protected virtual void DoFormat()
{
    // Both the stream and the reader are disposed even when reading fails part-way.
    using (Stream stream = new FileStream(dictSourcePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (StreamReader sr = new StreamReader(stream, Encoding.Default))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length < 2)
            {
                continue;
            }

            char charfirst = line[0];
            char charseconde = line[1];

            Dictionary<char, HashSet<string>> bySecond;
            if (!dictMemory.TryGetValue(charfirst, out bySecond))
            {
                bySecond = new Dictionary<char, HashSet<string>>();
                dictMemory.Add(charfirst, bySecond);
            }

            HashSet<string> tails;
            if (!bySecond.TryGetValue(charseconde, out tails))
            {
                tails = new HashSet<string>();
                bySecond.Add(charseconde, tails);
            }

            // Store what follows the first two characters. Substring(2) returns that
            // tail, whereas Remove(2) would return the two-character prefix instead.
            // HashSet.Add already ignores duplicates, so no Contains check is needed.
            if (line.Length > 2)
            {
                tails.Add(line.Substring(2));
            }
        }
    }
}

转换到Lucene接口

 1     public class YurowTokenizer : Tokenizer
 2     {
 3         private string text;
 4         private List<string> list;
 5         int current = 0;
 6         private string path;
 7         static Participle p;
 8         bool isfirstrun = true;
 9 
10         public YurowTokenizer(TextReader textreader, string path)
11         {
12             text = textreader.ReadToEnd();
13             this.path = path;
14             if (p == null)
15             {
16                 p = new Participle();
17                 p.Init(path);
18             }
19         }
20 
21         public override Token Next()
22         {
23             if (string.IsNullOrEmpty(text))
24                 return null;
25 
26             if (isfirstrun)
27             {
28                 list = p.TextArray(text);
29                 isfirstrun = false;
30             }
31             if (list.Count < 1 || current >= list.Count)
32                 return null;
33             int start = p.StartArr[current];
34             string currentstr = list[current];
35             Token token = new Token(currentstr, start, start + currentstr.Length);
36             current++;
37             return token;
38         }
39     }


有兴趣的朋友可以自己反编译查看源码。暂时不提供完整源码。

http://www.cnblogs.com/birdshover/ by yurow
posted @ 2008-03-25 01:32  Birdshover  阅读(8990)  评论(10)  编辑  收藏  举报