LuceneNet的使用

先建立索引，再查询，速度很快。

索引花费的时间比较多，但还可以接受，200M的文本需要1分多钟，40G的文本需要4个小时多。

刚开始我用 2.9版本的，是选择将文本也保存在索引中，占据的空间比原先的文本2倍多。

而且发现如果里面的邮箱地址不带@后面无法查询出来，也许是所用的分词的关系，也不知道怎样才可以自定义分隔符。

后来改用了4.8版本，索引的空间只比原先的文本大一点点，而且不带@的关键字也可以查询出来。

但还是有个问题，查询出来的内容中文是乱码，也无法用中文查询。

无论是 NLuke 还是 Luke ，都没法成功打开索引文件。

IndexWriter 构造函数使用了另一个重载，即第三个参数为 bool，如果为 true 表示不存在就创建、存在就覆盖，为 false 表示不存在就出错、存在就追加。这个不方便，因为我们需要的是：不存在就创建、存在就追加，怎样才能实现这个功能呢？省略掉，就实现这个功能了。

// LINQPad entry point: runs one sample keyword search against the index
// directory and dumps the keyword followed by the search results.
void Main()
{
	var indexDirectory = @"D:\data\DB\txt\index\";
	var sourceDirectory = @"D:\data\DB\txt\search\tianya\";

	// Open question: how can the analyzer be made to split tokens on '@'?
	var keyword = "zhaoshu0997";

	var helper = new Utils.FullSearch.FileHelper(indexDirectory);

	// Uncomment to index the files under sourceDirectory before searching.
	//helper.BuildIndex(sourceDirectory);

	var results = helper.Search(keyword);

	Console.WriteLine(keyword);
	results.Dump();
}

namespace Utils.FullSearch
{
	/// <summary>
	/// Outcome of a search: the total number of matches plus the hits that were actually fetched.
	/// </summary>
	public class SearchResults
	{
		/// <summary>Total number of matching documents reported by the search.</summary>
		public int TotalHits { get; set; }

		/// <summary>
		/// Hits fetched for this result. Starts as an empty list (never null by default),
		/// so a freshly constructed instance can be enumerated safely.
		/// </summary>
		public List<Hit> SearchContents { get; set; } = new List<Hit>();
	}
	
	/// <summary>
	/// A single search hit: a relevance score and the text content that goes with it.
	/// </summary>
	public class Hit
	{
		/// <summary>Score assigned to this hit.</summary>
		public float Score { get; set; }

		/// <summary>Text content of this hit; null when none was set.</summary>
		public string Content { get; set; }
	}
	
	/// <summary>
	/// Builds and queries a Lucene.NET 4.8 full-text index stored in a file-system directory.
	/// Each line of each indexed file becomes one document with a single stored text field.
	/// </summary>
	public class FileHelper
	{
		private const LuceneVersion MatchLuceneVersion = LuceneVersion.LUCENE_48;
		private const string FieldName = "content";
		private const int ResultsPerPage = 10;

		private readonly StandardAnalyzer _analyzer;
		private readonly QueryParser _queryParser;
		private readonly string _indexPath;

		/// <summary>
		/// Creates a helper bound to the index stored under <paramref name="indexPath"/>.
		/// </summary>
		/// <param name="indexPath">Directory that holds (or will hold) the index files.</param>
		/// <exception cref="ArgumentException"><paramref name="indexPath"/> is null or blank.</exception>
		public FileHelper(string indexPath)
		{
			if (string.IsNullOrWhiteSpace(indexPath))
			{
				throw new ArgumentException("Index path must not be null or empty.", nameof(indexPath));
			}

			_analyzer = new StandardAnalyzer(MatchLuceneVersion);
			_queryParser = new QueryParser(MatchLuceneVersion, FieldName, _analyzer);
			_indexPath = indexPath;
		}

		/// <summary>
		/// Indexes every line of every file found under <paramref name="dir"/> (recursively),
		/// adding the documents to the index and committing once at the end.
		/// </summary>
		/// <param name="dir">Root directory whose files are indexed.</param>
		/// <param name="encoding">
		/// Encoding used to read the files; defaults to UTF-8 when omitted.
		/// NOTE(review): garbled non-ASCII text in search results is presumably caused by
		/// source files that are not UTF-8 — pass the files' real encoding; confirm against the data.
		/// </param>
		/// <remarks>
		/// If anything fails before the commit, the pending changes are rolled back and the
		/// exception is rethrown; the writer and directory are always released.
		/// </remarks>
		public void BuildIndex(string dir, Encoding encoding = null)
		{
			Encoding fileEncoding = encoding ?? Encoding.UTF8;
			var watch = Stopwatch.StartNew();
			List<string> fpaths = FindFile(dir);

			using (FSDirectory directory = FSDirectory.Open(_indexPath))
			using (var writer = new IndexWriter(directory, new IndexWriterConfig(MatchLuceneVersion, _analyzer)))
			{
				try
				{
					foreach (string fpath in fpaths)
					{
						// Stream the file line by line instead of loading it whole,
						// so very large inputs do not have to fit in memory.
						foreach (string line in File.ReadLines(fpath, fileEncoding))
						{
							writer.AddDocument(new Document
							{
								new TextField(FieldName, line, Field.Store.YES)
							});
						}

						// The stopwatch is never reset, so this is the cumulative time so far.
						($"index time for {fpath}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
					}

					writer.Commit();
				}
				catch
				{
					// Disposing the writer would commit a half-built batch; discard it instead.
					writer.Rollback();
					throw;
				}
			}

			watch.Stop();
			($"index time for {dir}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
		}

		/// <summary>
		/// Returns the full paths of all files under <paramref name="sSourcePath"/>,
		/// including files in nested subdirectories.
		/// </summary>
		/// <param name="sSourcePath">Root directory to scan.</param>
		/// <returns>A list of full file paths; empty when the directory tree contains no files.</returns>
		public static List<string> FindFile(string sSourcePath)
		{
			var files = new List<string>();
			// DirectoryInfo is used rather than the static Directory class because the name
			// Directory would clash with Lucene's own Directory type.
			var root = new DirectoryInfo(sSourcePath);
			foreach (FileInfo file in root.EnumerateFiles("*.*", SearchOption.AllDirectories))
			{
				files.Add(file.FullName);
			}
			return files;
		}

		/// <summary>
		/// Parses <paramref name="queryString"/> and returns up to <c>ResultsPerPage</c> best matches.
		/// </summary>
		/// <param name="queryString">Query text handed to the query parser.</param>
		/// <returns>
		/// The total hit count and the fetched hits. An empty result (zero hits, empty list)
		/// is returned when the query is null/blank or when no index exists yet.
		/// </returns>
		/// <remarks>
		/// The index is opened read-only and closed again before returning, so searching never
		/// takes the index write lock. Exceptions raised by the query parser propagate to the caller.
		/// </remarks>
		public SearchResults Search(string queryString)
		{
			var watch = Stopwatch.StartNew();
			var searchResults = new SearchResults { TotalHits = 0, SearchContents = new List<Hit>() };

			if (string.IsNullOrWhiteSpace(queryString))
			{
				return searchResults;
			}

			Query query = _queryParser.Parse(queryString);

			using (FSDirectory directory = FSDirectory.Open(_indexPath))
			{
				// Nothing has been indexed yet: report "no hits" without writing anything to disk.
				if (!DirectoryReader.IndexExists(directory))
				{
					return searchResults;
				}

				using (DirectoryReader reader = DirectoryReader.Open(directory))
				{
					var searcher = new IndexSearcher(reader);
					TopDocs topDocs = searcher.Search(query, ResultsPerPage);
					searchResults.TotalHits = topDocs.TotalHits;

					foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
					{
						Document document = searcher.Doc(scoreDoc.Doc);
						searchResults.SearchContents.Add(new Hit
						{
							Score = scoreDoc.Score,
							Content = document.GetField(FieldName)?.GetStringValue()
						});
					}
				}
			}

			($"search time for {queryString}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
			return searchResults;
		}
	}
}

posted on 2020-03-29 20:51 白马酒凉阅读(474) 评论(2) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

LuceneNet的使用

导航

公告