Lucene.NET开发——索引文件的建立

索引建立类

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.IO;

using System.Windows.Forms;

 

using Lucene.Net.Documents;

using Lucene.Net.Analysis;

using Lucene.Net.Analysis.Standard;

using Lucene.Net.Index;

using Lucene.Net.Store;

 

namespace NewBeeEngine

{

   /// <summary>

   /// 使用方法:

   /// 1.创建Indexer,参数为加密密钥

   /// 2.调用FileAdd,将文件添加到索引

   /// 3.调用CloseIndexer,写入索引文件,关闭索引

   /// </summary>

  

   public class Indexer

   {

      private Document doc = null;

      private IndexWriter writer = null;

      List<string> fileTypeList = null;

 

      public Indexer(string key)

      {

        Password.password = key;

 

        fileTypeList = new List<string>();

        fileTypeList.Add("txt");

        fileTypeList.Add("xml");

        fileTypeList.Add("htm");

        fileTypeList.Add("html");

        fileTypeList.Add("doc");

        fileTypeList.Add("docx");

        fileTypeList.Add("pdf");

 

        //初始化:AnalyzerIndexWriter

        Analyzer a = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

        //Analyzer a = new SimpleAnalyzer();

        DirectoryInfo createDirInfo = new DirectoryInfo(Application.StartupPath + "\\" + "index");

        writer = new IndexWriter(FSDirectory.Open(createDirInfo), a,

              true, IndexWriter.MaxFieldLength.LIMITED);

      }

 

      //对单个文件进行索引

      public void FileAdd(FileInfo fileInfo, string tag, Engine engine)

      {

        doc = new Document();

 

        //filename

        string fileName = fileInfo.Name;

        Field field = new Field("filename", fileName, Field.Store.YES, Field.Index.NOT_ANALYZED);

        doc.Add(field);

 

        //filedate

        string fileDate = DateTime.Now.ToString();

        field = new Field("filedate", fileDate, Field.Store.YES, Field.Index.NO);

        doc.Add(field);

 

        //searchdate供检索使用####

        string timePickerDate = DateTime.Now.ToString();

        string[] pickerDateString = timePickerDate.Split(' ');

        string searchDate = pickerDateString[0];

        field = new Field("searchdate", searchDate, Field.Store.YES, Field.Index.NOT_ANALYZED);

        doc.Add(field);

 

        //filesize

        string fileSize = fileInfo.Length.ToString();

        field = new Field("filesize", fileSize, Field.Store.YES, Field.Index.NO);

        doc.Add(field);

 

        //filetag

        string fileTag = tag;

        field = new Field("filetag", fileTag, Field.Store.YES, Field.Index.NO);

        doc.Add(field);

 

        ///content & digest

        string content = null;

        string digest = null;

        try

        {

           if (fileInfo.Extension == ".txt" ||

              fileInfo.Extension == ".html" ||

              fileInfo.Extension == ".htm" ||

              fileInfo.Extension == ".xml")

           {

              byte[] buffer = new byte[fileInfo.Length];

              FileStream fs = fileInfo.OpenRead();

              fs.Read(buffer, 0, buffer.Length);

              //text

              content = Encoding.Default.GetString(buffer);

              content = RegularExpressionsOfHTML(content);

              field = new Field("content", content, Field.Store.NO, Field.Index.ANALYZED);

              doc.Add(field);

              //digest

              digest = Encoding.Default.GetString(buffer);

              digest = RegularExpressionsOfHTML(digest);

              digest = RemoveEnterAndBlank(digest);

              if (digest.Length > 300)

              {

                 digest = digest.Substring(0, 300);

              }

              field = new Field("digest", digest, Field.Store.COMPRESS, Field.Index.NO);

              doc.Add(field);

              //关闭文件流

              fs.Close();

           }

           //如果是word文件

           else if (fileInfo.Extension == ".doc" || fileInfo.Extension == ".docx")

           {

              //text

              content = FileReader.ReadFile(fileInfo);

              field = new Field("content", content, Field.Store.NO, Field.Index.ANALYZED);

              doc.Add(field);

              //digest

              if (content.Length > 300)

              {

                 digest = content.Substring(0, 300);

              }

              digest = RemoveEnterAndBlank(digest);

              field = new Field("digest", digest, Field.Store.COMPRESS, Field.Index.NO);

              doc.Add(field);

           }

           //如果是pdf文件

           else if (fileInfo.Extension == ".pdf")

           {

              //text

              content = FileReader.ReadFile(fileInfo);

              field = new Field("content", content, Field.Store.NO, Field.Index.ANALYZED);

              doc.Add(field);

              //digest

              if (content.Length > 300)

              {

                 digest = content.Substring(0, 300);

              }

              digest = RemoveEnterAndBlank(digest);

              field = new Field("digest", digest, Field.Store.COMPRESS, Field.Index.NO);

              doc.Add(field);

           }

           writer.AddDocument(doc);

        }

        catch (Exception e)

        {

           MessageBox.Show(e.StackTrace);

        }

      }

 

      public void CloseIndexer()

      {

        //关闭索引

        writer.Optimize();     //创建cfs文件并写入信息

        writer.Close();

      }

      //去除html标记

      public static string RegularExpressionsOfHTML(string TempContent)

      {

        TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent, "<[^>]*>", ""); //匹配一个

        return TempContent;

      }

      //去除换行

      public static string RemoveEnterAndBlank(string srcString)

      {

        string removedString = System.Text.RegularExpressions.Regex.Replace(srcString, @"\s+", " ");

        return removedString;

      }

   }

}

分析器Analyzer类型:

| 分析器类型 | 分析方法 |
| --- | --- |
| WhitespaceAnalyzer | 空格分割 |
| SimpleAnalyzer | 空格及各种符号分割 |
| StopAnalyzer | 空格及各种符号分割,并去掉停止词;停止词包括 is、are、in、on 等无实际意义的词 |
| StandardAnalyzer | 混合分割,包括去掉停止词,支持汉语 |

参考用法:http://www.blogjava.net/dreamstone/archive/2007/06/20/125372.html

域参数Field.Index, Field.Store使用方法:

| Field.Index | Field.Store | 说明 |
| --- | --- | --- |
| TOKENIZED(分词) | YES | 被分词索引且存储 |
| TOKENIZED | NO | 被分词索引但不存储 |
| NO | YES | 不能被搜索,它只是被搜索内容的附属物,如 URL |
| UN_TOKENIZED | YES/NO | 不被分词,作为一个整体被搜索,搜一部分是搜不出来的 |
| NO | NO | 没有这种用法 |

 

posted @ 2011-08-09 10:37  Erebus_NET  阅读(374)  评论(0编辑  收藏  举报