使用肖波的KTDictSeg分词器为Lucene.net服务

最近在看Lucene.net,发现Lucene.net的中文分词资料不是很多。很早就在看肖波的KTDictSeg,觉得分词效果不错,但是没有Lucene接口。看他的blog,也是很长时间没有更新了;他在blog中提到将在下一个版本中提供对Lucene的支持,我这里期待中……同时blog中还提到一挥的修改版本,但是一挥的站打不开了,不知道什么原因。我刚接触这个时间不长,查了些资料,写了下面的代码,实现了KTDictSeg在Lucene.net中的调用,期待有更好的方法出现。
下面附上代码

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.KTDictSeg
{
    /// <summary>
    /// Lucene.Net analyzer that tokenizes text with <see cref="KTDictSegTokenizer"/>
    /// and then lower-cases every token.
    /// </summary>
    public class KTDictSegAnalyzer : Analyzer
    {
        /// <summary>
        /// Creates the analyzer. No configuration is taken here.
        /// </summary>
        public KTDictSegAnalyzer()
        {
        }

        /// <summary>
        /// Builds the token stream for a field.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
        /// <param name="reader">Reader supplying the text to segment.</param>
        /// <returns>A <see cref="KTDictSegTokenizer"/> wrapped in a <see cref="LowerCaseFilter"/>.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }
}

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Collections;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    /// <summary>
    /// Lucene.Net tokenizer backed by KTDictSeg's <c>CSimpleDictSeg</c>.
    /// The whole input is read and segmented in the constructor; <see cref="Next"/>
    /// then hands the resulting words out one at a time.
    /// </summary>
    public class KTDictSegTokenizer:Tokenizer
    {
        /// <summary>
        /// Segmenter shared by every tokenizer instance; created and loaded on first use.
        /// NOTE(review): initialization is not synchronized - confirm whether tokenizers
        /// are ever constructed from several threads at once.
        /// </summary>
        public static CSimpleDictSeg m_SimpleDictSeg;

        private ArrayList ioBuffer;     // Words returned by the segmenter, in order.
        private string sourceText;      // Full input text, kept so offsets can be located in it.
        private int offSet = 0;         // Position in sourceText just past the previous token.
        private int position = -1;      // Index into ioBuffer of the token returned last.

        /// <summary>
        /// Reads all text from <paramref name="input"/> and segments it.
        /// On first use the shared segmenter is created and its dictionary is loaded
        /// from the "Data" folder under the current working directory.
        /// </summary>
        /// <param name="input">Reader supplying the text to tokenize.</param>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public KTDictSegTokenizer(System.IO.TextReader input)
            : base(input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            if (m_SimpleDictSeg == null)
            {
                // Build and load into a local first: if LoadDict throws, the static
                // field stays null (so the next construction retries) and the
                // exception propagates with its original stack trace.
                CSimpleDictSeg segmenter = new CSimpleDictSeg();
                segmenter.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                segmenter.LoadDict();
                m_SimpleDictSeg = segmenter;
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;

            sourceText = input.ReadToEnd();
            ioBuffer = m_SimpleDictSeg.Segment(sourceText);
        }

        /// <summary>
        /// Returns the next segmented word as a Lucene <see cref="Token"/>.
        /// </summary>
        /// <remarks>
        /// Start/end offsets are found by searching the original text from the end of
        /// the previous token, so they stay aligned even when the segmenter drops
        /// stop words or whitespace. If the word cannot be found (for example because
        /// the segmenter altered it), the running offset is used instead.
        /// </remarks>
        /// <returns>The next token, or <c>null</c> when all words have been returned.</returns>
        public override Token Next()
        {
            position++;
            if (position >= ioBuffer.Count)
            {
                return null;
            }

            string term = ioBuffer[position].ToString();

            int start = -1;
            if (offSet <= sourceText.Length)
            {
                start = sourceText.IndexOf(term, offSet, StringComparison.Ordinal);
            }
            if (start < 0)
            {
                start = offSet;
            }

            int end = start + term.Length;
            offSet = end;
            return new Token(term, start, end);
        }
    }
}
以上代码借鉴了其他朋友的代码,自己重新组织了一下。使用这个分词器,比使用Lucene.net自带的StandardAnalyzer在速度上快了约6倍。
下面是制作索引的函数
/// <summary>
/// Builds the product index at <c>Index_Store_Path</c> using <c>KTDictSegAnalyzer</c>.
/// Each product becomes one document with a tokenized "prodname" field and an
/// un-tokenized "profunction" field; both values are stored.
/// </summary>
private void mackIndex()
{
    // To compare against Lucene.Net's stock analyzer, use new StandardAnalyzer() here instead.
    Analyzer analyzer = new KTDictSegAnalyzer();

    FSDirectory fsDir = FSDirectory.GetDirectory(Index_Store_Path, true);
    IndexWriter fswriter = new IndexWriter(fsDir, analyzer, true);
    try
    {
        // Fetch the data source.
        ProductDao productDao = new ProductDao();
        IList<Product> products = productDao.GetProduct();

        // Turn every product into a document.
        foreach (Product product in products)
        {
            // A missing description is indexed as the literal text "null";
            // a local is used so the Product object itself is left untouched.
            string description = product.Proshuoming;
            if (description == null)
            {
                description = "null";
            }

            Document doc = new Document();
            doc.Add(new Field("prodname", product.Proname, Field.Store.YES, Field.Index.TOKENIZED));
            doc.Add(new Field("profunction", description, Field.Store.YES, Field.Index.UN_TOKENIZED));
            fswriter.AddDocument(doc);
        }
    }
    finally
    {
        // Close the writer even if reading or indexing a product fails.
        fswriter.Close();
    }
}

posted @ 2008-06-22 19:19 冰封的心阅读(517) 评论(0) 编辑收藏举报

刷新页面返回顶部

冰封的心

冰封的心

使用肖波的KTDictSeg分词器 为Lucene.net服务

公告

使用肖波的KTDictSeg分词器为Lucene.net服务