Lucene.Net 代码演示
Analysis--分词器,负责把字符串拆分成原子,包含了标准分词,直接空格拆分 项目中用的是盘古中文分词, Document--数据结构,定义存储数据的格式 Index--索引的读写类 QueryParser--查询解析器,负责解析查询语句 Search---负责各种查询类,命令解析后得到就是查询类 Store---索引存储类,负责文件夹等等 Util---常见工具类库
lucene是全文搜索必备的,是大型系统必备的
Search: TermQuery--单元查询 new Term("title","张三") title:张三 BooleanQuery---new Term("title","张三") and new Term("title","李四") title:张三 + title:李四 new Term("title","张三") or new Term("title","李四") title:张三 title:李四 WildcardQuery---通配符 new Term("title","张?") title:张? new Term("title","张") title:张 PrefixQuery---前缀查询 以xx开头 title:张* PhraseQuery---间隔距离 包含没有 包含提莫 而且二者距离不能超过5 title: "没有 提莫"~5 没有蘑菇的提莫 没有蘑菇的蘑菇的蘑菇的提莫 FuzzyQuery---近似查询,ibhone----iphone title:ibhone~ RangeQuery---范围查询 [1,100] {1,100}
Lucene.Net一进一出,建立索引需要获取数据源,分词-保存到硬盘 索引查找, 自然会有些延迟,以前淘宝上架宝贝,第二天才能搜索的 索引更新策略:1 数据更新---丢一个队列---一个processor通过队列完成更新 2 每一周全部索引一遍 lucene索引存的是原子--docid1,docid2,docid3 不store可以大量节约空间;查找时原子匹配多个id;
1 索引增删改查和分词处理 2 京东数据多线程建立索引 3 索引查询接口封装
Lucene--对lucene相关操作的封装
LuceneAnalyze--负责完成查询关键字解析,尽可能拆分成原子数组 如果只有一个词,prefix查询 苹果* 如果是多个词,换成或者关系, 都是为了更多的命中结果(贪婪搜索) 做个关键词清理
LuceneBulid--- BuildIndex--MergeIndex 多线程写不同子路径,完成后合并 增加/删除索引 更新索引-只能先删除再更新 LuceneQuery---QueryIndexPage 支持关键字,支持范围过滤 支持排序 Processor---Lucene多线程建立索引 IndexBuilder 入口,启动多线程创建+完成后的Merge IndexBuilderPerThread 每个线程是如何完成索引建立的
DataService--CommodityLucene对外提供的搜索封装 CommodityRepository-SqlHelper,完成数据库数据查询
Utility--通用帮助类 CfgFiles--配置文件 Model--实体类
初始化索引
/// <summary>
/// Initializes the test index: loads all commodities, writes 10 documents per
/// commodity (the loop counter k is folded into price/time/content to create
/// distinguishable demo rows), then optimizes (merges) the index.
/// </summary>
public static void InitIndex()
{
    List<Commodity> commodityList = GetList();
    // BUGFIX: the FSDirectory was never disposed; wrap it in using so the
    // directory handle is released together with the writer.
    using (FSDirectory directory = FSDirectory.Open(StaticConstant.TestIndexPath)) // index folder
    using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED)) // create=true: rebuild from scratch
    {
        foreach (Commodity commdity in commodityList)
        {
            for (int k = 0; k < 10; k++)
            {
                Document doc = new Document(); // one record
                // Field args: column name, value, whether the value is stored, whether it is analyzed (tokenized)
                doc.Add(new Field("id", commdity.Id.ToString(), Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("title", commdity.Title, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("url", commdity.Url, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("imageurl", commdity.ImageUrl, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("content", "this is lucene working,powerful tool " + k, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new NumericField("price", Field.Store.YES, true).SetDoubleValue((double)(commdity.Price + k)));
                //doc.Add(new NumericField("time", Field.Store.YES, true).SetLongValue(DateTime.Now.ToFileTimeUtc()));
                // time is stored as an int like 20190314 (yyyyMMdd) plus k; supports NumericRangeFilter on dates.
                doc.Add(new NumericField("time", Field.Store.YES, true).SetIntValue(int.Parse(DateTime.Now.ToString("yyyyMMdd")) + k));
                writer.AddDocument(doc); // persist the document
            }
        }
        writer.Optimize(); // optimize = merge segments
    }
}
查找器IndexSearcher
FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath);
IndexSearcher searcher = new IndexSearcher(dir); // searcher over the test index
{
    // Exact term match on an analyzed token of the "title" field.
    TermQuery titleQuery = new TermQuery(new Term("title", "图书馆"));
    TopDocs hits = searcher.Search(titleQuery, null, 10000); // collect at most 10000 hits
    foreach (ScoreDoc scored in hits.ScoreDocs)
    {
        Document found = searcher.Doc(scored.Doc); // fetch the stored fields
        Console.WriteLine("***************************************");
        Console.WriteLine(string.Format("id={0}", found.Get("id")));
        Console.WriteLine(string.Format("title={0}", found.Get("title")));
        Console.WriteLine(string.Format("time={0}", found.Get("time")));
        Console.WriteLine(string.Format("price={0}", found.Get("price")));
        Console.WriteLine(string.Format("content={0}", found.Get("content")));
    }
    Console.WriteLine("1一共命中了{0}个", hits.TotalHits);
}
解析器QueryParser
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer()); // parses user text against "title" with PanGu tokens
{
    //string keyword = "高中政治人教新课标选修生活中的法律常识";
    string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
    {
        // Case 1: parsed query only — relevance order, no filter.
        Query parsedQuery = parser.Parse(keyword);
        TopDocs hits = searcher.Search(parsedQuery, null, 10000);
        int shown = 0;
        foreach (ScoreDoc scored in hits.ScoreDocs)
        {
            if (shown++ < 1000) // print the first 1000 at most
            {
                Document found = searcher.Doc(scored.Doc);
                Console.WriteLine("***************************************");
                Console.WriteLine(string.Format("id={0}", found.Get("id")));
                Console.WriteLine(string.Format("title={0}", found.Get("title")));
                Console.WriteLine(string.Format("time={0}", found.Get("time")));
                Console.WriteLine(string.Format("price={0}", found.Get("price")));
            }
        }
        Console.WriteLine($"一共命中{hits.TotalHits}");
    }
    {
        // Case 2: same query, restricted to time in [20190101, 20191231] and explicitly sorted.
        Query parsedQuery = parser.Parse(keyword);
        NumericRangeFilter<int> timeFilter = NumericRangeFilter.NewIntRange("time", 20190101, 20191231, true, true); // inclusive bounds
        SortField sortPrice = new SortField("price", SortField.DOUBLE, false); // reverse=false: ascending by price
        SortField sortTime = new SortField("time", SortField.INT, true);      // reverse=true: descending by time
        Sort sort = new Sort(sortTime, sortPrice); // time first, price breaks ties
        TopDocs hits = searcher.Search(parsedQuery, timeFilter, 10000, sort);
        int shown = 0;
        foreach (ScoreDoc scored in hits.ScoreDocs)
        {
            if (shown++ < 1000)
            {
                Document found = searcher.Doc(scored.Doc);
                Console.WriteLine("***************************************");
                Console.WriteLine(string.Format("id={0}", found.Get("id")));
                Console.WriteLine(string.Format("title={0}", found.Get("title")));
                Console.WriteLine(string.Format("time={0}", found.Get("time")));
                Console.WriteLine(string.Format("price={0}", found.Get("price")));
            }
        }
        Console.WriteLine("3一共命中了{0}个", hits.TotalHits);
    }
}
/// <summary>
/// Builds the full index: one worker task per table (30 tables, one thread per
/// table), then a continuation merges the per-thread sub-indexes into the root
/// index folder once every worker has finished.
/// </summary>
public class IndexBuilder
{
    private static Logger logger = new Logger(typeof(IndexBuilder));

    // Sub-folder suffixes ("001".."030") produced by the workers; consumed by MergeIndex.
    private static List<string> PathSuffixList = new List<string>();

    // Shared cancellation source: any worker that fails cancels the whole build.
    private static CancellationTokenSource CTS = null;

    public static void Build()
    {
        try
        {
            logger.Debug(string.Format("{0} BuildIndex开始", DateTime.Now));
            List<Task> taskList = new List<Task>();
            TaskFactory taskFactory = new TaskFactory();
            CTS = new CancellationTokenSource();
            // BUGFIX: PathSuffixList is static and was never reset, so a second
            // Build() call would merge every suffix twice. Clear it per build.
            PathSuffixList.Clear();
            // 30 tables / 30 threads: one thread per table gives an even split.
            // (With fewer threads than tables you'd want round-robin assignment,
            // otherwise the first threads get more work than the last ones.)
            for (int i = 1; i < 31; i++)
            {
                IndexBuilderPerThread thread = new IndexBuilderPerThread(i, i.ToString("000"), CTS);
                PathSuffixList.Add(i.ToString("000"));
                taskList.Add(taskFactory.StartNew(thread.Process)); // each task builds its own sub-index
            }
            // Merge runs only after every worker task has completed.
            taskList.Add(taskFactory.ContinueWhenAll(taskList.ToArray(), MergeIndex));
            Task.WaitAll(taskList.ToArray());
            logger.Debug(string.Format("BuildIndex{0}", CTS.IsCancellationRequested ? "失败" : "成功"));
        }
        catch (Exception ex)
        {
            logger.Error("BuildIndex出现异常", ex);
        }
        finally
        {
            logger.Debug(string.Format("{0} BuildIndex结束", DateTime.Now));
        }
    }

    /// <summary>
    /// Continuation: merges all per-thread sub-indexes into the root index,
    /// unless the build was already cancelled by a failed worker.
    /// </summary>
    private static void MergeIndex(Task[] tasks)
    {
        try
        {
            if (CTS.IsCancellationRequested) return;
            ILuceneBulid builder = new LuceneBulid();
            builder.MergeIndex(PathSuffixList.ToArray());
        }
        catch (Exception ex)
        {
            CTS.Cancel(); // mark the whole build as failed
            logger.Error("MergeIndex出现异常", ex);
        }
    }
}
IndexBuilderPerThread类
public class IndexBuilderPerThread
{
    private Logger logger = new Logger(typeof(IndexBuilderPerThread));
    private int CurrentThreadNum = 0;
    private string PathSuffix = "";
    private CancellationTokenSource CTS = null;

    /// <summary>
    /// All state is injected at construction time, so no loop variables are
    /// captured and shared between worker threads.
    /// </summary>
    /// <param name="threadNum">table/thread number this worker is responsible for</param>
    /// <param name="pathSuffix">sub-folder suffix for this worker's index</param>
    /// <param name="cts">shared cancellation source for the whole build</param>
    public IndexBuilderPerThread(int threadNum, string pathSuffix, CancellationTokenSource cts)
    {
        this.CurrentThreadNum = threadNum;
        this.PathSuffix = pathSuffix;
        this.CTS = cts;
    }

    /// <summary>
    /// Pages through this thread's table 1000 rows at a time, feeding each page
    /// into the sub-index, until the data runs out or the build is cancelled.
    /// On any failure, cancels the shared token so the other workers stop too.
    /// </summary>
    public void Process()
    {
        try
        {
            logger.Debug(string.Format("ThreadNum={0}开始创建", CurrentThreadNum));
            var repository = new CommodityRepository();
            ILuceneBulid indexBuilder = new LuceneBulid();
            bool createNew = true; // first page creates the sub-index, later pages append
            int page = 1;
            while (!CTS.IsCancellationRequested)
            {
                var batch = repository.QueryList(CurrentThreadNum, page, 1000);
                if (batch == null || batch.Count == 0)
                {
                    break; // table exhausted
                }
                indexBuilder.BuildIndex(batch, PathSuffix, createNew);
                logger.Debug(string.Format("ThreadNum={0}完成{1}条的创建", CurrentThreadNum, 1000 * page++));
                createNew = false;
            }
        }
        catch (Exception ex)
        {
            CTS.Cancel();
            logger.Error(string.Format("ThreadNum={0}出现异常", CurrentThreadNum), ex);
        }
        finally
        {
            logger.Debug(string.Format("ThreadNum={0}完成创建", CurrentThreadNum));
        }
    }
}
/// <summary>
/// Merges the per-thread child indexes into the parent (root) index directory.
/// The previous root index is discarded (writer opened with create=true).
/// </summary>
/// <param name="childDirs">Names of the child index sub-folders to merge.</param>
public void MergeIndex(string[] childDirs)
{
    Console.WriteLine("MergeIndex Start");
    IndexWriter writer = null;
    try
    {
        if (childDirs == null || childDirs.Length == 0) return;
        Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        string rootPath = StaticConstant.IndexPath;
        DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
        LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
        writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); // create=true: wipe the old root index
        // Open every child sub-index; Path.Combine replaces the hard-coded "\\" separator.
        LuceneIO.Directory[] childIndexes = childDirs
            .Select(dir => LuceneIO.FSDirectory.Open(Directory.CreateDirectory(Path.Combine(rootPath, dir))))
            .ToArray();
        writer.MergeFactor = 100;      // how many segments accumulate before merging (default 10)
        writer.UseCompoundFile = true; // compound files reduce the number of index files
        writer.AddIndexesNoOptimize(childIndexes);
    }
    finally
    {
        if (writer != null)
        {
            try
            {
                writer.Optimize(); // final merge of all segments
            }
            finally
            {
                // BUGFIX: Close() used to be skipped when Optimize() threw,
                // leaking the write lock on the index directory.
                writer.Close();
            }
        }
        Console.WriteLine("MergeIndex End");
    }
}
用lucene进行商品查询
/// <summary>
/// Search facade over the lucene index for commodity queries: combines a
/// PanGu-tokenized keyword clause with a category clause, both required (+).
/// </summary>
public class CommodityLucene
{
    private static Logger logger = new Logger(typeof(CommodityLucene));

    #region QueryCommodity

    /// <summary>
    /// Queries commodities through lucene.
    /// </summary>
    /// <param name="pageIndex">1-based page number</param>
    /// <param name="pageSize">rows per page</param>
    /// <param name="totalCount">total hit count</param>
    /// <param name="keyword">search keyword</param>
    /// <param name="categoryIdList">category id list</param>
    /// <param name="priceFilter">[13,50] = 13..50 inclusive; {13,50} = 13..50 exclusive</param>
    /// <param name="priceOrderBy">"price desc" or "price asc"</param>
    /// <returns>matching commodities; null when there is nothing to search or on error</returns>
    public static List<Commodity> QueryCommodity(int pageIndex, int pageSize, out int totalCount, string keyword, List<int> categoryIdList, string priceFilter, string priceOrderBy)
    {
        totalCount = 0;
        try
        {
            if (string.IsNullOrWhiteSpace(keyword) && (categoryIdList == null || categoryIdList.Count == 0))
                return null;
            ILuceneQuery luceneQuery = new LuceneQuery();
            // "+" makes each present clause mandatory: keyword AND categories.
            string queryString = string.Format(" {0} {1}",
                string.IsNullOrWhiteSpace(keyword) ? "" : string.Format(" +{0}", AnalyzerKeyword(keyword)),
                categoryIdList == null || categoryIdList.Count == 0 ? "" : string.Format(" +{0}", AnalyzerCategory(categoryIdList)));
            return luceneQuery.QueryIndexPage(queryString, pageIndex, pageSize, out totalCount, priceFilter, priceOrderBy);
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("QueryCommodity参数为{0}出现异常", keyword), ex);
            return null;
        }
    }

    #endregion QueryCommodity

    /// <summary>
    /// Tokenizes the keyword with PanGu and turns it into a query fragment:
    /// a single token becomes a prefix query ("title:word*"); multiple tokens
    /// become space-separated term clauses (OR semantics, greedier matching).
    /// </summary>
    /// <param name="keyword">raw user keyword</param>
    /// <returns>lucene query fragment over the "title" field</returns>
    private static string AnalyzerKeyword(string keyword)
    {
        StringBuilder queryStringBuilder = new StringBuilder();
        ILuceneAnalyze analyzer = new LuceneAnalyze();
        string[] words = analyzer.AnalyzerKey(keyword);
        if (words.Length == 1)
        {
            queryStringBuilder.AppendFormat("{0}:{1}* ", "title", words[0]);
        }
        else
        {
            // CLEANUP: removed an unused local StringBuilder that was declared
            // here but never read or appended to.
            foreach (string word in words)
            {
                queryStringBuilder.AppendFormat("{0}:{1} ", "title", word);
            }
        }
        string result = queryStringBuilder.ToString().TrimEnd();
        logger.Info(string.Format("AnalyzerKeyword 将 keyword={0}转换为{1}", keyword, result));
        return result;
    }

    /// <summary>
    /// Builds one "categoryid:id" clause per category, joined by spaces (OR semantics).
    /// </summary>
    private static string AnalyzerCategory(List<int> categoryIdList)
    {
        return string.Join(" ", categoryIdList.Select(c => string.Format("{0}:{1}", "categoryid", c)));
    }
}
付费内容,请联系本人QQ:1002453261
本文来自博客园,作者:明志德道,转载请注明原文链接:https://www.cnblogs.com/for-easy-fast/articles/12520029.html
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
· 在鹅厂做java开发是什么体验
· 百万级群聊的设计实践
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
· 永远不要相信用户的输入:从 SQL 注入攻防看输入验证的重要性
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析