My C# Chinese Word Segmentation for Lucene.Net
After a day of work, I finally finished a Chinese word segmentation method that works under Lucene.Net. It turned out to be fairly involved, but I got it done, and it feels like quite an achievement, so I'm posting it here to share with everyone!
Having implemented the basic segmentation method, I wrapped it in a class that inherits from Lucene's Analyzer. The ChineseAnalyzer class itself needs little explanation:
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

namespace Lucene.Fanswo
{
    /// <summary>
    /// Analyzer for Chinese text, built on the dictionary-based ChineseTokenizer.
    /// </summary>
    public class ChineseAnalyzer : Analyzer
    {
        //private System.Collections.Hashtable stopSet;

        // Stop words: common English function words plus a few Chinese ones.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们" };

        /// <summary>Constructs a ChineseTokenizer filtered by a StandardFilter,
        /// a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
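The post itself doesn't show how the analyzer is wired into indexing, so here is a minimal sketch of feeding it to an IndexWriter. It assumes the Lucene.Net 1.9/2.0-era API that the rest of the code targets; the index path and field name are made up for the example.

using Lucene.Net.Documents;
using Lucene.Net.Index;

class IndexingSketch
{
    static void BuildIndex()
    {
        // Create a new index under ./index using the Chinese analyzer
        // (the final 'true' means create, overwriting any existing index).
        IndexWriter writer = new IndexWriter("index", new Lucene.Fanswo.ChineseAnalyzer(), true);

        Document doc = new Document();
        doc.Add(new Field("contents", "中华人民共和国在1949年建立", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);

        writer.Optimize();
        writer.Close();
    }
}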
The implementation of the ChineseTokenizer class:
Here the tokenizer matches characters forward against a dictionary and returns the token stream defined by Lucene:
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using System.Collections;
using System.Text.RegularExpressions;
using System.IO;

namespace Lucene.Fanswo
{
    class ChineseTokenizer : Tokenizer
    {
        private int offset = 0, bufferIndex = 0, dataLen = 0; // scan offset, current token start, total text length
        private int start; // current scan position

        /// <summary>
        /// The text being tokenized
        /// </summary>
        private string text;

        /// <summary>
        /// Time spent on segmentation (not currently assigned)
        /// </summary>
        public double TextSeg_Span = 0;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ChineseTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            text = input.ReadToEnd();
            dataLen = text.Length;
        }

        /// <summary>Performs segmentation; returns the next token in the stream,
        /// or null when the stream is exhausted.
        /// </summary>
        public override Token Next()
        {
            Token token = null;
            WordTree tree = new WordTree();
            // Load the dictionary (note: reloading it on every call to Next()
            // is the main efficiency problem mentioned at the end of the post)
            tree.LoadDict();
            // The dictionary, organized as a character tree (trie)
            Hashtable t_chartable = WordTree.chartable;
            string ReWord = "";
            string char_s;
            start = offset;
            bufferIndex = start;

            while (true)
            {
                // Stop once the scan position passes the end of the text
                if (start >= dataLen)
                {
                    break;
                }
                // Take one character
                char_s = text.Substring(start, 1);
                // Skip whitespace
                if (string.IsNullOrEmpty(char_s.Trim()))
                {
                    start++;
                    continue;
                }
                // The character is not in the current dictionary subtree
                if (!t_chartable.Contains(char_s))
                {
                    if (ReWord == "")
                    {
                        int j = start + 1;
                        switch (tree.GetCharType(char_s))
                        {
                            case 0: // Chinese character outside the dictionary: emit it alone
                                ReWord += char_s;
                                break;
                            case 1: // English word: consume consecutive letters
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 1)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start); // fixed: was j - offset, which over-reads after skipped whitespace
                                break;
                            case 2: // number: consume consecutive digits
                                j = start + 1;
                                while (j < dataLen)
                                {
                                    if (tree.GetCharType(text.Substring(j, 1)) != 2)
                                        break;
                                    j++;
                                }
                                ReWord += text.Substring(start, j - start); // fixed: was j - offset
                                break;
                            default: // any other character
                                ReWord += char_s;
                                break;
                        }
                        offset = j; // start position for the next token
                    }
                    else
                    {
                        // Fell off the trie after matching at least one character:
                        // return the accumulated prefix (there is no backtracking,
                        // so the prefix is not checked against complete entries)
                        offset = start; // start position for the next token
                    }
                    // Return the token
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
                // The character is in the dictionary: append it to the current word
                ReWord += char_s;
                // Descend into the subtree for this character
                t_chartable = (Hashtable)t_chartable[char_s];
                // Advance the scan position for the next iteration
                start++;
                if (start == dataLen)
                {
                    offset = dataLen;
                    return new Token(ReWord, bufferIndex, bufferIndex + ReWord.Length - 1);
                }
            }
            return token;
        }
    }
}
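The WordTree class (the dictionary loader, the character trie in WordTree.chartable, and GetCharType) is not listed in the post; it ships with the source download at the end. To make the tokenizer readable on its own, here is a minimal sketch of the interface it relies on. The dictionary file name and format ("dict.txt", one word per line) and the character-type ranges are my assumptions, not the downloaded implementation.

using System.Collections;
using System.IO;

namespace Lucene.Fanswo
{
    // Minimal sketch of the WordTree interface used by ChineseTokenizer.
    class WordTree
    {
        // Nested hashtables form a character trie: each key is a single
        // character, each value is the Hashtable of characters that may follow.
        public static Hashtable chartable = new Hashtable();

        // Load the dictionary into the trie (file name/format are assumptions).
        public void LoadDict()
        {
            if (chartable.Count > 0) return; // already loaded
            foreach (string word in File.ReadAllLines("dict.txt"))
            {
                Hashtable node = chartable;
                foreach (char c in word.Trim())
                {
                    string key = c.ToString();
                    if (!node.Contains(key))
                        node[key] = new Hashtable();
                    node = (Hashtable)node[key];
                }
            }
        }

        // Classify a single character: 0 = Chinese, 1 = English letter,
        // 2 = digit; anything else falls through to the tokenizer's default case.
        public int GetCharType(string char_s)
        {
            char c = char_s[0];
            if (c >= 0x4E00 && c <= 0x9FFF) return 0; // CJK unified ideographs
            if (char.IsLetter(c)) return 1;
            if (char.IsDigit(c)) return 2;
            return 3;
        }
    }
}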
The test code:
using System;
using System.Collections.Generic;
using System.Text;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using SimpleAnalyzer = Lucene.Net.Analysis.SimpleAnalyzer;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;

namespace MyLuceneTest
{
    class Program
    {
        [STAThread]
        public static void Main(System.String[] args)
        {
            try
            {
                Test("中华人民共和国在1949年建立,从此开始了新中国的伟大篇章。长春市长春节致词", true);
            }
            catch (System.Exception e)
            {
                System.Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message + e.ToString());
            }
        }

        internal static void Test(System.String text, bool verbose)
        {
            System.Console.Out.WriteLine(" Tokenizing string: " + text);
            Test(new System.IO.StringReader(text), verbose, text.Length);
        }

        internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            //Analyzer analyzer = new StandardAnalyzer();
            Analyzer analyzer = new Lucene.Fanswo.ChineseAnalyzer();
            TokenStream stream = analyzer.TokenStream(null, reader);
            System.DateTime start = System.DateTime.Now;
            int count = 0;
            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Token=" + t.ToString());
                }
                count++;
            }
            System.DateTime end = System.DateTime.Now;
            // Ticks are 100 ns; convert to milliseconds so the printed units are correct
            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;
            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
    }
}
The test results: (the screenshot of the output is in the original post)
Done!
The segmentation efficiency still needs algorithmic work, and Chinese punctuation is not handled yet; I will keep improving both.
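The biggest cost in the code above is that every call to Next() builds a WordTree and reloads the dictionary. One obvious improvement, sketched here with illustrative names, is to load the trie once per process and have Next() read the cached table:

using System.Collections;

namespace Lucene.Fanswo
{
    // Sketch of a one-time dictionary cache (names are illustrative).
    static class DictCache
    {
        private static readonly object sync = new object();
        private static bool loaded = false;

        public static Hashtable Chartable
        {
            get
            {
                if (!loaded)
                {
                    lock (sync)
                    {
                        if (!loaded)
                        {
                            new WordTree().LoadDict(); // fills WordTree.chartable
                            loaded = true;
                        }
                    }
                }
                return WordTree.chartable;
            }
        }
    }
}

Next() would then start with Hashtable t_chartable = DictCache.Chartable; instead of constructing and loading a WordTree for every token.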
I'm not much of a writer, so I'll let the code do the talking. Feedback is very welcome, everyone. Thanks!
Source download: https://files.cnblogs.com/harryguo/LucuneSearch.rar