Lucene.NET开发——索引文件的建立
索引建立类
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Windows.Forms;
using Lucene.Net.Documents;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
namespace NewBeeEngine
{
/// <summary>
/// 使用方法:
/// 1.创建Indexer,参数为加密密钥
/// 2.调用FileAdd,将文件添加到索引
/// 3.调用CloseIndexer,写入索引文件,关闭索引
/// </summary>
/// <summary>
/// Builds a Lucene full-text index over local files.
/// Usage:
///   1. Create an Indexer, passing the encryption key.
///   2. Call FileAdd for every file to add to the index.
///   3. Call CloseIndexer to optimize, flush and close the index.
/// </summary>
public class Indexer
{
    // Maximum number of characters stored in the "digest" (snippet) field.
    private const int DigestLength = 300;

    private IndexWriter writer = null;

    /// <summary>
    /// Opens (recreating from scratch) the index directory under the
    /// application start-up path and prepares the IndexWriter.
    /// </summary>
    /// <param name="key">Encryption key handed to the Password helper
    /// (semantics defined elsewhere in the project).</param>
    public Indexer(string key)
    {
        Password.password = key;
        // StandardAnalyzer: mixed tokenization, removes English stop words,
        // and supports Chinese text.
        Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
        DirectoryInfo indexDir =
            new DirectoryInfo(Path.Combine(Application.StartupPath, "index"));
        // 'true' => create a brand-new index, overwriting any existing one.
        writer = new IndexWriter(FSDirectory.Open(indexDir), analyzer,
            true, IndexWriter.MaxFieldLength.LIMITED);
    }

    /// <summary>
    /// Adds a single file to the index. Text-like files
    /// (.txt/.xml/.htm/.html) are read directly and stripped of markup;
    /// .doc/.docx/.pdf are extracted via the project FileReader. Files with
    /// other extensions are indexed with metadata only (no content field).
    /// </summary>
    /// <param name="fileInfo">The file to index.</param>
    /// <param name="tag">User-supplied tag stored with the document.</param>
    /// <param name="engine">Unused here; kept for caller compatibility.</param>
    public void FileAdd(FileInfo fileInfo, string tag, Engine engine)
    {
        Document doc = new Document();
        // File name: stored and indexed as a single token (exact-match lookup).
        doc.Add(new Field("filename", fileInfo.Name, Field.Store.YES,
            Field.Index.NOT_ANALYZED));
        // Indexing timestamp: stored only, not searchable.
        doc.Add(new Field("filedate", DateTime.Now.ToString(), Field.Store.YES,
            Field.Index.NO));
        // Date-only portion of the timestamp, used for date-restricted search.
        string searchDate = DateTime.Now.ToString().Split(' ')[0];
        doc.Add(new Field("searchdate", searchDate, Field.Store.YES,
            Field.Index.NOT_ANALYZED));
        // File size in bytes: stored only.
        doc.Add(new Field("filesize", fileInfo.Length.ToString(), Field.Store.YES,
            Field.Index.NO));
        // User tag: stored only.
        doc.Add(new Field("filetag", tag, Field.Store.YES, Field.Index.NO));
        try
        {
            string content = null;
            // Extension matching is case-insensitive so ".TXT" etc. is
            // indexed too (the original compared case-sensitively).
            if (HasExtension(fileInfo, ".txt", ".html", ".htm", ".xml"))
            {
                // ReadAllText closes the stream even on failure (the
                // original leaked the FileStream when an exception was
                // thrown mid-branch) and never does a short read.
                // Encoding.Default preserves the original's behavior for
                // legacy non-UTF-8 files.
                content = RegularExpressionsOfHTML(
                    File.ReadAllText(fileInfo.FullName, Encoding.Default));
            }
            else if (HasExtension(fileInfo, ".doc", ".docx", ".pdf"))
            {
                content = FileReader.ReadFile(fileInfo);
            }
            if (content != null)
            {
                AddContentAndDigest(doc, content);
            }
            writer.AddDocument(doc);
        }
        catch (Exception e)
        {
            // e.ToString() includes the message as well as the stack trace;
            // the original showed only the stack trace, hiding the cause.
            MessageBox.Show(e.ToString());
        }
    }

    // True when the file's extension equals any candidate, ignoring case.
    private static bool HasExtension(FileInfo fileInfo, params string[] candidates)
    {
        foreach (string candidate in candidates)
        {
            if (string.Equals(fileInfo.Extension, candidate,
                StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    // Indexes the full content (analyzed, not stored) and stores a
    // compressed, whitespace-collapsed digest of at most DigestLength chars.
    private void AddContentAndDigest(Document doc, string content)
    {
        doc.Add(new Field("content", content, Field.Store.NO,
            Field.Index.ANALYZED));
        // BUG FIX: the original left digest null for .doc/.docx/.pdf content
        // shorter than 300 characters, so RemoveEnterAndBlank(null) threw
        // ArgumentNullException from Regex.Replace. Collapse first, then
        // truncate (the order the text branch already used).
        string digest = RemoveEnterAndBlank(content);
        if (digest.Length > DigestLength)
        {
            digest = digest.Substring(0, DigestLength);
        }
        doc.Add(new Field("digest", digest, Field.Store.COMPRESS,
            Field.Index.NO));
    }

    /// <summary>
    /// Optimizes the index (merges segments into the compound .cfs file)
    /// and closes the writer, flushing everything to disk.
    /// </summary>
    public void CloseIndexer()
    {
        writer.Optimize();
        writer.Close();
    }

    /// <summary>Strips every HTML/XML tag ("&lt;...&gt;") from the text.</summary>
    public static string RegularExpressionsOfHTML(string TempContent)
    {
        return System.Text.RegularExpressions.Regex.Replace(TempContent, "<[^>]*>", "");
    }

    /// <summary>Collapses each run of whitespace (spaces, tabs, newlines) into a single space.</summary>
    public static string RemoveEnterAndBlank(string srcString)
    {
        return System.Text.RegularExpressions.Regex.Replace(srcString, @"\s+", " ");
    }
}
}
分析器Analyzer类型:

| 分析器类型 | 分析方法 |
| --- | --- |
| WhitespaceAnalyzer | 空格分割 |
| SimpleAnalyzer | 空格及各种符号分割 |
| StopAnalyzer | 空格及各种符号分割,去掉停止词,停止词包括 is,are,in,on 等无实际意义的词 |
| StandardAnalyzer | 混合分割,包括了去掉停止词,支持汉语 |

参考用法:http://www.blogjava.net/dreamstone/archive/2007/06/20/125372.html
域参数Field.Index, Field.Store使用方法:

| Field.Index | Field.Store | 说明 |
| --- | --- | --- |
| TOKENIZED(分词) | YES | 被分词索引且存储 |
| TOKENIZED | NO | 被分词索引但不存储 |
| NO | YES | 这是不能被搜索的,它只是被搜索内容的附属物。如URL等 |
| UN_TOKENIZED | YES/NO | 不被分词,它作为一个整体被搜索,搜一部分是搜不出来的 |
| NO | NO | 没有这种用法 |