我利用了吕震宇根据Free版ICTCLAS改编而成的.NET平台下的ICTCLAS,把ICTCLAS的分词功能为Lucene所用。以下是我写的程序,比较简单。大家看看、评评,有什么要改进的地方,望大家指出。
代码分为 Analyzer 类和 Tokenizer 类两部分。

Analyzer类:
1using System;
2using System.Collections.Generic;
3using System.Text;
4
5using Lucene.Net.Analysis;
6using Lucene.Net.Analysis.Standard;
7using System.IO;
8
namespace ICTCLASForLucene
{
    /// <summary>
    /// An Analyzer that tokenizes text with the ICTCLAS segmenter and then
    /// applies a StandardFilter, a LowerCaseFilter and a StopFilter built
    /// from the noise-word file under data\sNoise.txt.
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words removed by the StopFilter; loaded from the noise file.
        // NOTE(review): the fixed size 368 is presumably the line count of
        // sNoise.txt — confirm, and note the static array is (re)filled by
        // every instance constructor.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word ("noise") file: one word per line, UTF-8.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// Stops at the first empty line or at end of file.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // `using` closes the reader (and the underlying file handle) even
            // on error; the original code leaked the StreamReader.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
            {
                string noise = reader.ReadLine();
                int i = 0;
                // Bounds guard: a noise file longer than the array previously
                // threw IndexOutOfRangeException.
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i] = noise;
                    noise = reader.ReadLine();
                    i++;
                }
            }
        }

        /// <summary>Constructs an <c>ICTCLASTokenizer</c> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
45
2using System.Collections.Generic;
3using System.Text;
4
5using Lucene.Net.Analysis;
6using Lucene.Net.Analysis.Standard;
7using System.IO;
8
namespace ICTCLASForLucene
{
    /// <summary>
    /// An Analyzer that tokenizes text with the ICTCLAS segmenter and then
    /// applies a StandardFilter, a LowerCaseFilter and a StopFilter built
    /// from the noise-word file under data\sNoise.txt.
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        // Stop words removed by the StopFilter; loaded from the noise file.
        // NOTE(review): the fixed size 368 is presumably the line count of
        // sNoise.txt — confirm, and note the static array is (re)filled by
        // every instance constructor.
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[368];

        // Path of the stop-word ("noise") file: one word per line, UTF-8.
        public string NoisePath = Environment.CurrentDirectory + "\\data\\sNoise.txt";

        /// <summary>
        /// Loads the stop-word list from <see cref="NoisePath"/>.
        /// Stops at the first empty line or at end of file.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            // `using` closes the reader (and the underlying file handle) even
            // on error; the original code leaked the StreamReader.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.UTF8))
            {
                string noise = reader.ReadLine();
                int i = 0;
                // Bounds guard: a noise file longer than the array previously
                // threw IndexOutOfRangeException.
                while (!string.IsNullOrEmpty(noise) && i < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[i] = noise;
                    noise = reader.ReadLine();
                    i++;
                }
            }
        }

        /// <summary>Constructs an <c>ICTCLASTokenizer</c> filtered by a
        /// StandardFilter, a LowerCaseFilter and a StopFilter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
            return result;
        }
    }
}
45
Tokenizer类:
1using System;
2using System.Collections.Generic;
3using System.Text;
4
5using Lucene.Net.Analysis;
6using SharpICTCLAS;
7using System.IO;
8
namespace ICTCLASForLucene
{
    /// <summary>
    /// Tokenizer that reads the whole input, segments it once with
    /// SharpICTCLAS, then replays the resulting words as Lucene tokens.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // Segmentation mode passed to SharpICTCLAS.Segment.
        // NOTE(review): meaning of 2 is defined by SharpICTCLAS — confirm.
        int nKind = 2;
        // Segmentation output; result[0] is the best segmentation path.
        List<WordResult[]> result;
        // Running character offsets for the tokens handed to Lucene.
        int startIndex = 0;
        int endIndex = 0;
        // Starts at 1: SharpICTCLAS presumably prepends a begin sentinel
        // (and appends an end one, skipped via Length - 1) — confirm.
        int i = 1;
        /// <summary>
        /// The sentence to segment (full reader contents, CRLF stripped).
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n","");
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
            // NOTE(review): initializing the dictionary for every tokenizer
            // instance is very expensive; consider caching one shared
            // WordSegment — confirm it is safe to reuse.
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when exhausted.
        /// </summary>
        public override Token Next()
        {
            // Guard: empty input may produce no segmentation paths; the
            // original indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // The original used `while` here, but the body always returned on
            // the first iteration — a plain `if` states the actual intent.
            if (i < result[0].Length - 1)   // Length - 1 skips the end sentinel
            {
                string word = result[0][i].sWord;
                // NOTE(review): offsets are accumulated from word lengths
                // only, so they drift from positions in the original text when
                // the segmenter drops characters (visible in the sample
                // output) — confirm this is acceptable for highlighting.
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}
55
分词效果:
2using System.Collections.Generic;
3using System.Text;
4
5using Lucene.Net.Analysis;
6using SharpICTCLAS;
7using System.IO;
8
namespace ICTCLASForLucene
{
    /// <summary>
    /// Tokenizer that reads the whole input, segments it once with
    /// SharpICTCLAS, then replays the resulting words as Lucene tokens.
    /// </summary>
    class ICTCLASTokenizer : Tokenizer
    {
        // Segmentation mode passed to SharpICTCLAS.Segment.
        // NOTE(review): meaning of 2 is defined by SharpICTCLAS — confirm.
        int nKind = 2;
        // Segmentation output; result[0] is the best segmentation path.
        List<WordResult[]> result;
        // Running character offsets for the tokens handed to Lucene.
        int startIndex = 0;
        int endIndex = 0;
        // Starts at 1: SharpICTCLAS presumably prepends a begin sentinel
        // (and appends an end one, skipped via Length - 1) — confirm.
        int i = 1;
        /// <summary>
        /// The sentence to segment (full reader contents, CRLF stripped).
        /// </summary>
        private string sentence;

        /// <summary>Constructs a tokenizer for this Reader.</summary>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n","");
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
            // NOTE(review): initializing the dictionary for every tokenizer
            // instance is very expensive; consider caching one shared
            // WordSegment — confirm it is safe to reuse.
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token in the stream, or null when exhausted.
        /// </summary>
        public override Token Next()
        {
            // Guard: empty input may produce no segmentation paths; the
            // original indexed result[0] unconditionally.
            if (result == null || result.Count == 0)
                return null;

            // The original used `while` here, but the body always returned on
            // the first iteration — a plain `if` states the actual intent.
            if (i < result[0].Length - 1)   // Length - 1 skips the end sentinel
            {
                string word = result[0][i].sWord;
                // NOTE(review): offsets are accumulated from word lengths
                // only, so they drift from positions in the original text when
                // the segmenter drops characters (visible in the sample
                // output) — confirm this is acceptable for highlighting.
                endIndex = startIndex + word.Length - 1;
                Token token = new Token(word, startIndex, endIndex);
                startIndex = endIndex + 1;
                i++;
                return token;
            }
            return null;
        }
    }
}
55
需分词句子:***,***,中华人民共和国在1949年建立,从此开始了新中国的伟大篇章.长春市长春节发表致词汉字abc iphone 1265325.98921 fee1212@tom.com http://news.qq.com 100%
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500
分词结果:
(***,0,2)(***,4,6)(中华人民共和国,8,14)(1949年,16,20)(建立,21,22)(从此,24,25)(新,29,29)(中国,30,31)(伟大,33,34)(篇章,35,36)(长春市,38,40)(春节,42,43)(发表,44,45)(致词,46,47)(汉字,48,49)(abc,50,52)(iphone,53,58)(1265325.98921,59,71)(fee1212@tom,72,82)(com,84,86)(http://news,87,97)(qq,99,100)(com,102,104)(100%,105,108)
耗时00:00:00.0937500