Lucene.Net 全文检索(二):Lucene.Net 的封装
查询
/// <summary>
/// Query-side wrapper around the Lucene.Net index opened from <c>StaticConstant.IndexPath</c>.
/// Query strings are parsed against the "title" field with the PanGu analyzer and every hit
/// is mapped back to a <see cref="Commodity"/>.
/// </summary>
public class LuceneQuery : ILuceneQuery
{
    #region Identity
    private Logger logger = new Logger(typeof(LuceneQuery));
    #endregion Identity

    #region QueryIndex
    /// <summary>
    /// Maximum number of hits requested from Lucene in one search call.
    /// </summary>
    private const int MaxHits = 10000;

    /// <summary>
    /// Runs <paramref name="queryString"/> against the index and returns the matching commodities.
    /// </summary>
    /// <param name="queryString">Lucene query expression; unqualified terms search the "title" field.</param>
    /// <returns>At most <see cref="MaxHits"/> commodities, in the order Lucene returned them.</returns>
    public List<Commodity> QueryIndex(string queryString)
    {
        IndexSearcher searcher = null;
        try
        {
            List<Commodity> ciList = new List<Commodity>();
            Directory dir = FSDirectory.Open(StaticConstant.IndexPath);
            searcher = new IndexSearcher(dir);
            Analyzer analyzer = new PanGuAnalyzer();

            // Search conditions are configured here.
            QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(queryString);
            Console.WriteLine(query.ToString()); // show the parsed search expression

            TopDocs docs = searcher.Search(query, (Filter)null, MaxHits);
            foreach (ScoreDoc sd in docs.ScoreDocs)
            {
                Document doc = searcher.Doc(sd.Doc);
                ciList.Add(DocumentToCommodityInfo(doc));
            }

            return ciList;
        }
        finally
        {
            if (searcher != null)
            {
                searcher.Dispose();
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="queryString"/> against the index and returns one page of commodities,
    /// optionally restricted to a price range and sorted by price.
    /// </summary>
    /// <param name="queryString">Lucene query expression; unqualified terms search the "title" field.</param>
    /// <param name="pageIndex">1-based page number; values below 1 are treated as 1.</param>
    /// <param name="pageSize">Number of items per page.</param>
    /// <param name="totalCount">
    /// Receives the total hit count reported by Lucene. Only the first <see cref="MaxHits"/> hits
    /// are retrievable, so pages beyond that window come back empty.
    /// </param>
    /// <param name="priceFilter">
    /// Optional price range written as two comma-separated numbers wrapped in brackets, e.g. "[10,20]".
    /// A leading "[" / trailing "]" makes that bound inclusive; any other bracket makes it exclusive.
    /// Null or blank disables the filter.
    /// </param>
    /// <param name="priceOrderBy">
    /// Optional price ordering. A value ending in "desc" (case-insensitive) sorts descending; any other
    /// non-blank value sorts ascending. Null or blank keeps the default ordering of <c>new Sort()</c>.
    /// </param>
    /// <returns>The commodities on the requested page (possibly empty).</returns>
    /// <exception cref="ArgumentException">Thrown with message "Wrong priceFilter" when the filter cannot be parsed.</exception>
    public List<Commodity> QueryIndexPage(string queryString, int pageIndex, int pageSize, out int totalCount, string priceFilter, string priceOrderBy)
    {
        totalCount = 0;
        IndexSearcher searcher = null;
        try
        {
            List<Commodity> ciList = new List<Commodity>();
            FSDirectory dir = FSDirectory.Open(StaticConstant.IndexPath);
            searcher = new IndexSearcher(dir);
            Analyzer analyzer = new PanGuAnalyzer();

            // Search conditions are configured here.
            QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(queryString);

            pageIndex = Math.Max(1, pageIndex); // pages are numbered from 1
            int startIndex = (pageIndex - 1) * pageSize;
            int endIndex = pageIndex * pageSize;

            NumericRangeFilter<float> numPriceFilter = BuildPriceFilter(priceFilter);

            Sort sort = new Sort();
            if (!string.IsNullOrWhiteSpace(priceOrderBy))
            {
                // SortField's third argument is "reverse": true flips the natural (ascending) order,
                // so it must be true only for a descending request.
                bool descending = priceOrderBy.EndsWith("desc", StringComparison.OrdinalIgnoreCase);
                sort.SetSort(new SortField("price", SortField.FLOAT, descending));
            }

            TopDocs docs = searcher.Search(query, numPriceFilter, MaxHits, sort);
            totalCount = docs.TotalHits;

            // TotalHits can exceed the number of ScoreDocs actually returned (capped at MaxHits),
            // so the loop is bounded by the array length rather than by totalCount.
            // For debugging, PrintScores(docs, startIndex, endIndex, searcher) can be called here.
            ScoreDoc[] hits = docs.ScoreDocs;
            for (int i = startIndex; i < endIndex && i < hits.Length; i++)
            {
                Document doc = searcher.Doc(hits[i].Doc);
                ciList.Add(DocumentToCommodityInfo(doc));
            }

            return ciList;
        }
        finally
        {
            if (searcher != null)
            {
                searcher.Dispose();
            }
        }
    }

    /// <summary>
    /// Converts the textual price range into a Lucene numeric range filter on the "price" field.
    /// </summary>
    /// <param name="priceFilter">Range text such as "[10,20]" or "{10,20}"; null or blank yields no filter.</param>
    /// <returns>The filter, or null when <paramref name="priceFilter"/> is null or blank.</returns>
    private static NumericRangeFilter<float> BuildPriceFilter(string priceFilter)
    {
        if (string.IsNullOrWhiteSpace(priceFilter))
        {
            return null;
        }

        bool isContainStart = priceFilter.StartsWith("[", StringComparison.Ordinal);
        bool isContainEnd = priceFilter.EndsWith("]", StringComparison.Ordinal);
        string[] bounds = priceFilter.Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Split(',');

        float start = 0;
        float end = 0;
        // The length check keeps a filter without a comma from failing with an index error
        // instead of the intended "Wrong priceFilter" message.
        if (bounds.Length < 2 || !float.TryParse(bounds[0], out start) || !float.TryParse(bounds[1], out end))
        {
            throw new ArgumentException("Wrong priceFilter");
        }

        return NumericRangeFilter.NewFloatRange("price", start, end, isContainStart, isContainEnd);
    }

    /// <summary>
    /// Logs the "productid" and score of every hit in the window [startIndex, endIndex).
    /// </summary>
    /// <param name="docs">Search result whose scores are logged.</param>
    /// <param name="startIndex">First hit position to log (inclusive).</param>
    /// <param name="endIndex">Last hit position to log (exclusive).</param>
    /// <param name="searcher">Searcher used to load the stored documents.</param>
    private void PrintScores(TopDocs docs, int startIndex, int endIndex, Searcher searcher)
    {
        ScoreDoc[] scoreDocs = docs.ScoreDocs;
        for (int i = startIndex; i < endIndex && i < scoreDocs.Length; i++)
        {
            int docId = scoreDocs[i].Doc;
            Document doc = searcher.Doc(docId);
            logger.Info(string.Format("{0}的分值为{1}", doc.Get("productid"), scoreDocs[i].Score));
        }
    }
    #endregion QueryIndex

    #region private
    /// <summary>
    /// Maps the stored fields of an index document to a <see cref="Commodity"/>.
    /// The numeric fields are parsed with <c>int.Parse</c> / <c>long.Parse</c> / <c>decimal.Parse</c>,
    /// so a missing or malformed value throws.
    /// </summary>
    /// <param name="doc">Document loaded from the index.</param>
    /// <returns>The commodity described by the document.</returns>
    private Commodity DocumentToCommodityInfo(Document doc)
    {
        return new Commodity()
        {
            Id = int.Parse(doc.Get("id")),
            Title = doc.Get("title"),
            ProductId = long.Parse(doc.Get("productid")),
            CategoryId = int.Parse(doc.Get("categoryid")),
            // The field name must match the one written by the indexer ("imageurl").
            ImageUrl = doc.Get("imageurl"),
            Price = decimal.Parse(doc.Get("price")),
            Url = doc.Get("url")
        };
    }
    #endregion private
}
批量/单个索引的增删改
/// <summary>
/// Write-side wrapper around the Lucene.Net index rooted at <c>StaticConstant.IndexPath</c>:
/// bulk build, merge of child indexes, and single/batch insert, delete and update.
/// Author's design notes (translated): for multi-threaded indexing, write to several index
/// folders and merge them afterwards; for latency, use an asynchronous queue.
/// </summary>
public class LuceneBulid : ILuceneBulid
{
    #region Identity
    private Logger logger = new Logger(typeof(LuceneBulid));
    #endregion Identity

    #region Bulk BuildIndex and index merge
    /// <summary>
    /// Adds all items of <paramref name="ciList"/> to the index in one writer session.
    /// All items go to the same index directory.
    /// </summary>
    /// <param name="ciList">Commodities to index; null or empty is a no-op.</param>
    /// <param name="pathSuffix">
    /// Sub-folder appended to the root index path (e.g. <c>sa\1</c>); blank means the root folder itself.
    /// </param>
    /// <param name="isCreate">
    /// False (default) appends to the existing index; true recreates the index, discarding existing content.
    /// </param>
    public void BuildIndex(List<Commodity> ciList, string pathSuffix = "", bool isCreate = false)
    {
        if (ciList == null || ciList.Count == 0)
        {
            return;
        }

        IndexWriter writer = null;
        try
        {
            string rootIndexPath = StaticConstant.IndexPath;
            string indexPath = string.IsNullOrWhiteSpace(pathSuffix)
                ? rootIndexPath
                : string.Format("{0}\\{1}", rootIndexPath, pathSuffix);
            DirectoryInfo dirInfo = Directory.CreateDirectory(indexPath);
            LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);

            writer = new IndexWriter(directory, new PanGuAnalyzer(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
            writer.SetMaxBufferedDocs(100); // documents buffered in memory before a new segment is written
            writer.MergeFactor = 100;       // how often segments are merged
            writer.UseCompoundFile = true;  // compound files reduce the number of index files

            foreach (Commodity ci in ciList)
            {
                CreateCIIndex(writer, ci);
            }
        }
        finally
        {
            if (writer != null)
            {
                // No Optimize() here: optimisation is left to MergeIndex.
                writer.Close();
            }
        }
    }

    /// <summary>
    /// Recreates the root index and merges the given child indexes into it, then optimises it.
    /// Writes "MergeIndex Start" / "MergeIndex End" to the console.
    /// </summary>
    /// <param name="childDirs">Names of the child folders under the root index path; null or empty is a no-op.</param>
    public void MergeIndex(string[] childDirs)
    {
        Console.WriteLine("MergeIndex Start");
        IndexWriter writer = null;
        try
        {
            if (childDirs == null || childDirs.Length == 0)
            {
                return;
            }

            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            string rootPath = StaticConstant.IndexPath;
            DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
            LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);

            // create == true: the existing root index is discarded.
            writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            LuceneIO.Directory[] dirNo = childDirs
                .Select(dir => LuceneIO.FSDirectory.Open(Directory.CreateDirectory(string.Format("{0}\\{1}", rootPath, dir))))
                .ToArray();
            writer.MergeFactor = 100;      // how often segments are merged
            writer.UseCompoundFile = true; // compound files reduce the number of index files
            writer.AddIndexesNoOptimize(dirNo);

            // Optimise only after a successful merge; doing it in 'finally' would run it after a
            // failure as well and could hide the original exception.
            writer.Optimize();
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
            }
            Console.WriteLine("MergeIndex End");
        }
    }

    // Field option reference:
    //   Field.Store.YES      - store the original (un-analyzed) field value
    //   Field.Store.NO       - do not store the value; storing is independent of indexing
    //   Field.Store.COMPRESS - store compressed (long text / binary) at a performance cost
    //   Field.Index.ANALYZED               - tokenize and index
    //   Field.Index.ANALYZED_NO_NORMS      - tokenize and index, without norms
    //   Field.Index.NOT_ANALYZED           - index the value as a single token
    //   Field.Index.NOT_ANALYZED_NO_NORMS  - index as a single token, without norms
    //   Field.TermVector.YES / NO          - store / do not store the per-document term vector
    //   Field.TermVector.WITH_POSITIONS / WITH_OFFSETS / WITH_POSITIONS_OFFSETS
    //                                      - additionally store positions and/or offsets
    #endregion Bulk BuildIndex and index merge

    #region Single and batch insert, delete, update
    /// <summary>
    /// Adds one commodity to the root index.
    /// </summary>
    /// <param name="ci">Commodity to index; null is a no-op.</param>
    public void InsertIndex(Commodity ci)
    {
        if (ci == null)
        {
            return;
        }

        IndexWriter writer = null;
        try
        {
            writer = OpenRootWriter();
            writer.MergeFactor = 100;      // how often segments are merged
            writer.UseCompoundFile = true; // compound files reduce the number of index files
            CreateCIIndex(writer, ci);
        }
        catch (Exception ex)
        {
            logger.Error("InsertIndex异常", ex);
            throw; // rethrow without resetting the stack trace
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
            }
        }
    }

    /// <summary>
    /// Adds several commodities to the root index (incremental build).
    /// </summary>
    /// <param name="ciList">Commodities to index; null or empty is a no-op.</param>
    public void InsertIndexMuti(List<Commodity> ciList)
    {
        BuildIndex(ciList, "", false);
    }

    /// <summary>
    /// Deletes the index documents of several commodities, matched by their "productid" term.
    /// </summary>
    /// <param name="ciList">Commodities whose documents are removed; null or empty is a no-op.</param>
    public void DeleteIndexMuti(List<Commodity> ciList)
    {
        if (ciList == null || ciList.Count == 0)
        {
            return;
        }

        IndexReader reader = null;
        try
        {
            reader = OpenRootReaderForDelete();
            foreach (Commodity ci in ciList)
            {
                reader.DeleteDocuments(new Term("productid", ci.ProductId.ToString()));
            }
        }
        catch (Exception ex)
        {
            logger.Error("DeleteIndex异常", ex);
            throw; // rethrow without resetting the stack trace
        }
        finally
        {
            if (reader != null)
            {
                reader.Dispose();
            }
        }
    }

    /// <summary>
    /// Deletes the index documents of a single commodity, matched by its "productid" term.
    /// </summary>
    /// <param name="ci">Commodity whose documents are removed; null is a no-op.</param>
    public void DeleteIndex(Commodity ci)
    {
        if (ci == null)
        {
            return;
        }

        IndexReader reader = null;
        try
        {
            reader = OpenRootReaderForDelete();
            reader.DeleteDocuments(new Term("productid", ci.ProductId.ToString()));
        }
        catch (Exception ex)
        {
            logger.Error("DeleteIndex异常", ex);
            throw; // rethrow without resetting the stack trace
        }
        finally
        {
            if (reader != null)
            {
                reader.Dispose();
            }
        }
    }

    /// <summary>
    /// Replaces the index document of one commodity (matched by "productid") with a fresh one.
    /// </summary>
    /// <param name="ci">Commodity to re-index; null is a no-op.</param>
    public void UpdateIndex(Commodity ci)
    {
        if (ci == null)
        {
            return;
        }

        IndexWriter writer = null;
        try
        {
            writer = OpenRootWriter();
            writer.MergeFactor = 100;      // how often segments are merged
            writer.UseCompoundFile = true; // compound files reduce the number of index files
            writer.UpdateDocument(new Term("productid", ci.ProductId.ToString()), ParseCItoDoc(ci));
        }
        catch (Exception ex)
        {
            logger.Error("UpdateIndex异常", ex);
            throw; // rethrow without resetting the stack trace
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
            }
        }
    }

    /// <summary>
    /// Replaces the index documents of several commodities (matched by "productid") in one writer session.
    /// </summary>
    /// <param name="ciList">Commodities to re-index; null or empty is a no-op.</param>
    public void UpdateIndexMuti(List<Commodity> ciList)
    {
        if (ciList == null || ciList.Count == 0)
        {
            return;
        }

        IndexWriter writer = null;
        try
        {
            writer = OpenRootWriter();
            writer.MergeFactor = 50;       // how often segments are merged
            writer.UseCompoundFile = true; // compound files reduce the number of index files
            foreach (Commodity ci in ciList)
            {
                writer.UpdateDocument(new Term("productid", ci.ProductId.ToString()), ParseCItoDoc(ci));
            }
        }
        catch (Exception ex)
        {
            logger.Error("UpdateIndexMuti异常", ex);
            throw; // rethrow without resetting the stack trace
        }
        finally
        {
            if (writer != null)
            {
                writer.Close();
            }
        }
    }
    #endregion Single and batch insert, delete, update

    #region PrivateMethod
    /// <summary>
    /// Opens an <see cref="IndexWriter"/> on the root index folder using the per-field analyzer.
    /// The index is created when the folder contains no files, otherwise it is opened for append.
    /// </summary>
    /// <returns>The opened writer; the caller is responsible for closing it.</returns>
    private IndexWriter OpenRootWriter()
    {
        string rootIndexPath = StaticConstant.IndexPath;
        DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
        bool isCreate = dirInfo.GetFiles().Length == 0; // no files yet => create a new index
        LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
        return new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
    }

    /// <summary>
    /// Opens a writable (readOnly == false) <see cref="IndexReader"/> on the root index folder,
    /// as required for deleting documents through the reader.
    /// </summary>
    /// <returns>The opened reader; the caller is responsible for disposing it.</returns>
    private IndexReader OpenRootReaderForDelete()
    {
        string rootIndexPath = StaticConstant.IndexPath;
        DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
        LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
        return IndexReader.Open(directory, false);
    }

    /// <summary>
    /// Builds the analyzer used for writing: StandardAnalyzer by default and for "categoryid",
    /// PanGuAnalyzer for "title".
    /// </summary>
    /// <returns>The per-field analyzer wrapper.</returns>
    private PerFieldAnalyzerWrapper CreateAnalyzerWrapper()
    {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer);
        analyzerWrapper.AddAnalyzer("title", new PanGuAnalyzer());
        analyzerWrapper.AddAnalyzer("categoryid", new StandardAnalyzer(Version.LUCENE_30));
        return analyzerWrapper;
    }

    /// <summary>
    /// Adds the document for one commodity through the given writer, logging and rethrowing any failure.
    /// </summary>
    /// <param name="writer">Open index writer.</param>
    /// <param name="ci">Commodity to add.</param>
    private void CreateCIIndex(IndexWriter writer, Commodity ci)
    {
        try
        {
            writer.AddDocument(ParseCItoDoc(ci));
        }
        catch (Exception ex)
        {
            logger.Error("CreateCIIndex异常", ex);
            throw; // rethrow without resetting the stack trace
        }
    }

    /// <summary>
    /// Converts a commodity into an index document. Every field is stored; only "title" is analyzed,
    /// and "price" is written as a numeric float field so it can be range-filtered and sorted.
    /// </summary>
    /// <param name="ci">Commodity to convert.</param>
    /// <returns>The document to index.</returns>
    private Document ParseCItoDoc(Commodity ci)
    {
        Document doc = new Document();
        doc.Add(new Field("id", ci.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Field rejects null values, so missing strings are indexed as empty instead of
        // aborting the whole write.
        doc.Add(new Field("title", ci.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED)); // PanGu tokenization
        doc.Add(new Field("productid", ci.ProductId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("categoryid", ci.CategoryId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("imageurl", ci.ImageUrl ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("url", ci.Url ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new NumericField("price", Field.Store.YES, true).SetFloatValue((float)ci.Price));
        return doc;
    }
    #endregion PrivateMethod
}
分词器封装
/// <summary>
/// Splits a search keyword into the terms produced by the PanGu analyzer for the "title" field.
/// </summary>
public class LuceneAnalyze : ILuceneAnalyze
{
    private Logger logger = new Logger(typeof(LuceneAnalyze));

    #region AnalyzerKey
    /// <summary>
    /// Tokenizes <paramref name="keyword"/> so the caller can combine the resulting words
    /// (e.g. with OR) into a broader query.
    /// </summary>
    /// <param name="keyword">Raw user keyword.</param>
    /// <returns>
    /// The analyzed words; an empty array for a null or blank keyword; the keyword itself when the
    /// parsed query is neither a term, phrase nor boolean query.
    /// </returns>
    public string[] AnalyzerKey(string keyword)
    {
        // A blank keyword has nothing to analyze and cannot be parsed as a query.
        if (string.IsNullOrWhiteSpace(keyword))
        {
            return new string[0];
        }

        Analyzer analyzer = new PanGuAnalyzer();
        QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
        Query query = parser.Parse(this.CleanKeyword(keyword));

        List<string> analyzerWords = new List<string>();
        if (TryCollectTerms(query, analyzerWords))
        {
            return analyzerWords.ToArray();
        }

        BooleanQuery booleanQuery = query as BooleanQuery;
        if (booleanQuery != null)
        {
            // Only term and phrase clauses contribute words; other clause types are skipped.
            foreach (BooleanClause clause in booleanQuery.GetClauses())
            {
                TryCollectTerms(clause.Query, analyzerWords);
            }
            return analyzerWords.ToArray();
        }

        // The literal braces are doubled so string.Format does not treat them as a placeholder.
        logger.Debug(string.Format("AnalyzerKey在解析keyword={0}的结果为new string[] {{ keyword }} ", keyword));
        return new string[] { keyword };
    }

    /// <summary>
    /// Appends the term text(s) of a term or phrase query to <paramref name="words"/>.
    /// </summary>
    /// <param name="query">Query to inspect.</param>
    /// <param name="words">List receiving the term texts.</param>
    /// <returns>True when the query was a term or phrase query; otherwise false.</returns>
    private static bool TryCollectTerms(Query query, List<string> words)
    {
        TermQuery termQuery = query as TermQuery;
        if (termQuery != null)
        {
            words.Add(termQuery.Term.Text);
            return true;
        }

        PhraseQuery phraseQuery = query as PhraseQuery;
        if (phraseQuery != null)
        {
            words.AddRange(phraseQuery.GetTerms().Select(t => t.Text));
            return true;
        }

        return false;
    }

    /// <summary>
    /// Lower-cases upper-case AND / OR words in the keyword (leading, trailing, in between, or standing
    /// alone) so they are not upper-case operator keywords, then escapes query-syntax characters.
    /// </summary>
    /// <param name="keyword">Raw user keyword.</param>
    /// <returns>The cleaned and escaped keyword; an empty string when <paramref name="keyword"/> is null.</returns>
    private string CleanKeyword(string keyword)
    {
        if (!string.IsNullOrWhiteSpace(keyword))
        {
            bool isClean = false;
            while (!isClean)
            {
                keyword = keyword.Trim();
                if (keyword == "AND" || keyword == "OR")
                {
                    // A keyword consisting only of an operator word.
                    keyword = keyword.ToLowerInvariant();
                }
                else if (keyword.EndsWith(" AND", System.StringComparison.Ordinal))
                {
                    keyword = string.Format("{0}and", keyword.Remove(keyword.Length - 3, 3));
                }
                else if (keyword.EndsWith(" OR", System.StringComparison.Ordinal))
                {
                    keyword = string.Format("{0}or", keyword.Remove(keyword.Length - 2, 2));
                }
                else if (keyword.StartsWith("AND ", System.StringComparison.Ordinal))
                {
                    keyword = string.Format("and{0}", keyword.Substring(3));
                }
                else if (keyword.StartsWith("OR ", System.StringComparison.Ordinal))
                {
                    keyword = string.Format("or{0}", keyword.Substring(2));
                }
                else if (keyword.Contains(" OR "))
                {
                    keyword = keyword.Replace(" OR ", " or ");
                }
                else if (keyword.Contains(" AND "))
                {
                    keyword = keyword.Replace(" AND ", " and ");
                }
                else
                {
                    isClean = true;
                }
            }
        }

        return QueryParser.Escape(keyword ?? string.Empty);
    }
    #endregion AnalyzerKey
}
付费内容,请联系本人QQ:1002453261
本文来自博客园,作者:明志德道,转载请注明原文链接:https://www.cnblogs.com/for-easy-fast/p/14319028.html