Lucene.Net 代码演示

lucene.net:全文检索的工具包,不是应用,只是个类库,完成了全文检索的功能 就是把数据拆分---存起来---查询时---拆分---匹配---结果

Analysis--分词器,负责把字符串拆分成原子,包含了标准分词,直接空格拆分 项目中用的是盘古中文分词, Document--数据结构,定义存储数据的格式 Index--索引的读写类 QueryParser--查询解析器,负责解析查询语句 Search---负责各种查询类,命令解析后得到就是查询类 Store---索引存储类,负责文件夹等等 Util---常见工具类库

lucene是全文搜索必备的,是大型系统必备的

Search: TermQuery--单元查询 new Term("title","张三") title:张三 BooleanQuery---new Term("title","张三") and new Term("title","李四") +title:张三 +title:李四 new Term("title","张三") or new Term("title","李四") title:张三 title:李四 WildcardQuery---通配符 new Term("title","张?") title:张? new Term("title","张*") title:张* PrefixQuery---前缀查询 以xx开头 title:张* PhraseQuery---间隔距离 包含没有 包含提莫 而且二者距离不能超过5 title: "没有 提莫"~5 没有蘑菇的提莫 没有蘑菇的蘑菇的蘑菇的提莫 FuzzyQuery---近似查询,ibhone----iphone title:ibhone~ RangeQuery---范围查询 [1,100] {1,100}

Lucene.Net一进一出,建立索引需要获取数据源,分词-保存到硬盘 索引查找, 自然会有些延迟,以前淘宝上架宝贝,第二天才能搜索的 索引更新策略:1 数据更新---丢一个队列---一个processor通过队列完成更新 2 每一周全部索引一遍 lucene索引存的是原子--docid1,docid2,docid3 不store可以大量节约空间;查找时原子匹配多个id;

1 索引增删改查和分词处理 2 京东数据多线程建立索引 3 索引查询接口封装

Lucene--lucene相关操作的封装

LuceneAnalyze--负责完成查询关键字解析,尽可能拆分成原子数组 如果只有一个词,prefix查询 苹果* 如果是多个词,换成或者关系, 都是为了更多的命中结果(贪婪搜索) 做个关键词清理

LuceneBulid--- BuildIndex--MergeIndex 多线程写不同子路径,完成后合并 增加/删除索引 更新索引-只能先删除再更新 LuceneQuery---QueryIndexPage 支持关键字,支持范围过滤 支持排序 Processor---Lucene多线程建立索引 IndexBuilder 入口,启动多线程创建+完成后的Merge IndexBuilderPerThread 每个线程是如何完成索引建立的

DataService--CommodityLucene对外提供的搜索封装 CommodityRepository-SqlHelper,完成数据库数据查询

Utility--通用帮助类 CfgFiles--配置文件 Model--实体类

初始化索引

复制代码
  /// <summary>
        /// Rebuilds the test index at <c>StaticConstant.TestIndexPath</c> from the commodities
        /// returned by <c>GetList()</c>. Each commodity is written ten times (k = 0..9) with a
        /// slightly different content, price and time value so that the demo index has
        /// enough data to exercise range filters and sorting.
        /// </summary>
        public static void InitIndex()
        {
            List<Commodity> commodityList = GetList();

            // Base value for the sortable "time" field, formatted as yyyyMMdd. Computed once and
            // with the invariant culture so the digits do not depend on the current culture's calendar.
            // Note that today + k is only a sortable integer; it is not guaranteed to be a valid calendar date.
            int today = int.Parse(
                DateTime.Now.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture),
                System.Globalization.CultureInfo.InvariantCulture);

            // The directory is disposed explicitly instead of relying on the writer to release it.
            using (FSDirectory directory = FSDirectory.Open(StaticConstant.TestIndexPath))
            // The third argument (true) asks the writer to create the index rather than append to it.
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED))
            {
                foreach (Commodity commdity in commodityList)
                {
                    for (int k = 0; k < 10; k++)
                    {
                        // One Document is one record in the index.
                        Document doc = new Document();

                        // Field arguments: name, value, whether the value is stored, whether it is analyzed (tokenized).
                        // NOTE(review): "id", "url" and "imageurl" use Field.Store.NO, so their values
                        // presumably cannot be read back from search hits - confirm this is intended.
                        doc.Add(new Field("id", commdity.Id.ToString(), Field.Store.NO, Field.Index.NOT_ANALYZED));
                        doc.Add(new Field("title", commdity.Title, Field.Store.YES, Field.Index.ANALYZED));
                        doc.Add(new Field("url", commdity.Url, Field.Store.NO, Field.Index.NOT_ANALYZED));
                        doc.Add(new Field("imageurl", commdity.ImageUrl, Field.Store.NO, Field.Index.NOT_ANALYZED));
                        doc.Add(new Field("content", "this is lucene working,powerful tool " + k, Field.Store.YES, Field.Index.ANALYZED));

                        // Numeric fields are what the range filter and the typed sorts operate on.
                        doc.Add(new NumericField("price", Field.Store.YES, true).SetDoubleValue((double)(commdity.Price + k)));
                        doc.Add(new NumericField("time", Field.Store.YES, true).SetIntValue(today + k));

                        writer.AddDocument(doc);
                    }
                }

                // Optimize merges the index segments.
                writer.Optimize();
            }
        }
复制代码

查找器IndexSearcher

复制代码
FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath);
            // The searcher is declared outside the braces so that later snippets can keep using it.
            IndexSearcher searcher = new IndexSearcher(dir);
            {
                // Exact single-term lookup on the "title" field.
                TermQuery titleQuery = new TermQuery(new Term("title", "图书馆"));

                // No filter; ask for at most 10000 hits.
                TopDocs hits = searcher.Search(titleQuery, null, 10000);
                ScoreDoc[] scoreDocs = hits.ScoreDocs;
                for (int index = 0; index < scoreDocs.Length; index++)
                {
                    // Load the document behind this hit and print its fields.
                    Document hitDoc = searcher.Doc(scoreDocs[index].Doc);
                    Console.WriteLine("***************************************");
                    Console.WriteLine("id={0}", hitDoc.Get("id"));
                    Console.WriteLine("title={0}", hitDoc.Get("title"));
                    Console.WriteLine("time={0}", hitDoc.Get("time"));
                    Console.WriteLine("price={0}", hitDoc.Get("price"));
                    Console.WriteLine("content={0}", hitDoc.Get("content"));
                }
                Console.WriteLine("1一共命中了{0}个", hits.TotalHits);
            }
复制代码

解析器QueryParser

复制代码
 // Parser that turns a query string into a Query; "title" is the default field and
 // the PanGu analyzer tokenizes the query text.
 QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer());
            {
                //string keyword = "高中政治人教新课标选修生活中的法律常识";
                string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
                {
                    // Case 1: plain keyword search, default ordering.
                    Query query = parser.Parse(keyword);
                    TopDocs docs = searcher.Search(query, null, 10000);

                    // Only the first 1000 hits are printed, so stop there instead of walking every hit.
                    ScoreDoc[] hits = docs.ScoreDocs;
                    int shown = Math.Min(hits.Length, 1000);
                    for (int i = 0; i < shown; i++)
                    {
                        Document doc = searcher.Doc(hits[i].Doc);
                        Console.WriteLine("***************************************");
                        Console.WriteLine(string.Format("id={0}", doc.Get("id")));
                        Console.WriteLine(string.Format("title={0}", doc.Get("title")));
                        Console.WriteLine(string.Format("time={0}", doc.Get("time")));
                        Console.WriteLine(string.Format("price={0}", doc.Get("price")));
                    }
                    Console.WriteLine($"一共命中{docs.TotalHits}");
                }
                {
                    // Case 2: same keyword, restricted by a numeric range filter and explicitly sorted.
                    Query query = parser.Parse(keyword);

                    // Keep only documents whose "time" lies in [20190101, 20191231]; both bounds inclusive.
                    NumericRangeFilter<int> timeFilter = NumericRangeFilter.NewIntRange("time", 20190101, 20191231, true, true);

                    // The third SortField argument is "reverse": false = ascending, true = descending.
                    SortField sortPrice = new SortField("price", SortField.DOUBLE, false); // price ascending
                    SortField sortTime = new SortField("time", SortField.INT, true);       // time descending

                    // Order of the arguments is the sort priority: time first, then price as tie-breaker.
                    Sort sort = new Sort(sortTime, sortPrice);

                    TopDocs docs = searcher.Search(query, timeFilter, 10000, sort);

                    // Only the first 1000 hits are printed, so stop there instead of walking every hit.
                    ScoreDoc[] hits = docs.ScoreDocs;
                    int shown = Math.Min(hits.Length, 1000);
                    for (int i = 0; i < shown; i++)
                    {
                        Document doc = searcher.Doc(hits[i].Doc);
                        Console.WriteLine("***************************************");
                        Console.WriteLine(string.Format("id={0}", doc.Get("id")));
                        Console.WriteLine(string.Format("title={0}", doc.Get("title")));
                        Console.WriteLine(string.Format("time={0}", doc.Get("time")));
                        Console.WriteLine(string.Format("price={0}", doc.Get("price")));
                    }
                    Console.WriteLine("3一共命中了{0}个", docs.TotalHits);
                }
            }
复制代码

建立索引

复制代码
    /// <summary>
    /// Entry point for building the index: starts one worker task per table, each writing
    /// to its own sub-path, and merges the sub-indexes once every worker has finished.
    /// </summary>
    public class IndexBuilder
    {
        /// <summary>Number of tables to index; one worker task is started per table.</summary>
        private const int TableCount = 30;

        private static Logger logger = new Logger(typeof(IndexBuilder));
        private static List<string> PathSuffixList = new List<string>();
        private static CancellationTokenSource CTS = null;

        /// <summary>
        /// Builds all sub-indexes in parallel and then merges them. Blocks until the workers
        /// and the merge continuation have completed. Exceptions are logged, not rethrown.
        /// </summary>
        public static void Build()
        {
            try
            {
                logger.Debug(string.Format("{0} BuildIndex开始",DateTime.Now));

                List<Task> taskList = new List<Task>();
                TaskFactory taskFactory = new TaskFactory();
                CTS = new CancellationTokenSource();

                // The suffix list is static; reset it so that a second Build() call does not
                // hand MergeIndex every sub-index twice.
                PathSuffixList.Clear();

                // 30 tables with 30 workers: one table per worker, evenly balanced.
                // 30 tables with 18 workers (workers 1-12 take two tables, 13-18 take one) would be
                // unbalanced: the first 12 workers do twice the work of the rest.
                // Exercise: how could the worker count be configurable while still spreading work evenly?
                for (int i = 1; i <= TableCount; i++)
                {
                    // The zero-padded number is both the worker's sub-path suffix and a merge input.
                    string suffix = i.ToString("000");
                    IndexBuilderPerThread thread = new IndexBuilderPerThread(i, suffix, CTS);
                    PathSuffixList.Add(suffix);
                    taskList.Add(taskFactory.StartNew(thread.Process));
                }

                // The merge runs only after every worker task has completed.
                taskList.Add(taskFactory.ContinueWhenAll(taskList.ToArray(), MergeIndex));
                Task.WaitAll(taskList.ToArray());
                logger.Debug(string.Format("BuildIndex{0}", CTS.IsCancellationRequested ? "失败" : "成功"));
            }
            catch (Exception ex)
            {
                logger.Error("BuildIndex出现异常", ex);
            }
            finally
            {
                logger.Debug(string.Format("{0} BuildIndex结束", DateTime.Now));
            }
        }

        /// <summary>
        /// Continuation executed after all workers finish: merges the sub-indexes unless
        /// cancellation was requested. A failure here requests cancellation so that Build()
        /// reports the run as failed.
        /// </summary>
        /// <param name="tasks">The completed worker tasks (not inspected).</param>
        private static void MergeIndex(Task[] tasks)
        {
            try
            {
                if (CTS.IsCancellationRequested) return;
                ILuceneBulid builder = new LuceneBulid();
                builder.MergeIndex(PathSuffixList.ToArray());
            }
            catch (Exception ex)
            {
                CTS.Cancel();
                logger.Error("MergeIndex出现异常", ex);
            }
        }
    }
复制代码

 

IndexBuilderPerThread类

复制代码
  /// <summary>
    /// The work done by a single index-building worker: reads commodities page by page
    /// and writes them to the worker's own sub-index.
    /// </summary>
    public class IndexBuilderPerThread
    {
        /// <summary>Number of rows requested from the repository per page.</summary>
        private const int PageSize = 1000;

        private Logger logger = new Logger(typeof(IndexBuilderPerThread));
        private int CurrentThreadNum = 0;
        private string PathSuffix = "";
        private CancellationTokenSource CTS = null;

        /// <summary>
        /// All per-worker state is passed in at construction time so that the worker does not
        /// depend on loop variables captured by the caller.
        /// </summary>
        /// <param name="threadNum">Worker number; also passed to the repository query as its first argument.</param>
        /// <param name="pathSuffix">Sub-path suffix this worker writes its index to.</param>
        /// <param name="cts">Cancellation source shared by all workers.</param>
        public IndexBuilderPerThread(int threadNum, string pathSuffix, CancellationTokenSource cts)
        {
            this.CurrentThreadNum = threadNum;
            this.PathSuffix = pathSuffix;
            this.CTS = cts;
        }

        /// <summary>
        /// Pages through the repository until it returns no more rows or cancellation is requested,
        /// indexing each page. Any exception requests cancellation on the shared source and is logged.
        /// </summary>
        public void Process()
        {
            try
            {
                logger.Debug(string.Format("ThreadNum={0}开始创建", CurrentThreadNum));
                CommodityRepository commodityRepository = new CommodityRepository();
                ILuceneBulid builder = new LuceneBulid();
                bool isFirst = true;   // passed to BuildIndex; true only for the first page
                int pageIndex = 1;
                int indexedCount = 0;  // rows actually indexed so far, used for accurate progress logging
                while (!CTS.IsCancellationRequested)
                {
                    List<Commodity> commodityList = commodityRepository.QueryList(CurrentThreadNum, pageIndex, PageSize);
                    if (commodityList == null || commodityList.Count == 0)
                    {
                        break;
                    }

                    // For a quick test run, break here once pageIndex reaches 11 to cap the run at 10 pages.

                    builder.BuildIndex(commodityList, PathSuffix, isFirst);

                    // Count the rows really returned: the final page may hold fewer than PageSize rows.
                    indexedCount += commodityList.Count;
                    logger.Debug(string.Format("ThreadNum={0}完成{1}条的创建", CurrentThreadNum, indexedCount));
                    isFirst = false;
                    pageIndex++;
                }
            }
            catch (Exception ex)
            {
                // Signal the other workers and the merge step that the build has failed.
                CTS.Cancel();
                logger.Error(string.Format("ThreadNum={0}出现异常", CurrentThreadNum), ex);
            }
            finally
            {
                logger.Debug(string.Format("ThreadNum={0}完成创建", CurrentThreadNum));
            }
        }
    }
复制代码
复制代码
       /// <summary>
        /// Merges the sub-indexes found under the root index path into a freshly created
        /// index in the root directory itself, then optimizes the result.
        /// </summary>
        /// <param name="childDirs">Names of the sub-folders (relative to the root index path) to merge. Nothing is done when null or empty.</param>
        public void MergeIndex(string[] childDirs)
        {
            Console.WriteLine("MergeIndex Start");
            IndexWriter writer = null;
            LuceneIO.Directory directory = null;
            LuceneIO.Directory[] dirNo = null;
            try
            {
                if (childDirs == null || childDirs.Length == 0) return;
                Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                string rootPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
                directory = LuceneIO.FSDirectory.Open(dirInfo);

                // create = true: any index already present in the root directory is replaced.
                writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

                // Open the children one by one into a pre-sized array so that, if one of them fails
                // to open, the ones already opened are still released in the finally block.
                dirNo = new LuceneIO.Directory[childDirs.Length];
                for (int i = 0; i < childDirs.Length; i++)
                {
                    dirNo[i] = LuceneIO.FSDirectory.Open(Directory.CreateDirectory(Path.Combine(rootPath, childDirs[i])));
                }

                writer.MergeFactor = 100;      // controls how often segments are merged (default is 10)
                writer.UseCompoundFile = true; // compound file format keeps the number of index files down
                writer.AddIndexesNoOptimize(dirNo);

                // Optimize only on the success path: running it from finally after a failure
                // could throw again and hide the original exception.
                writer.Optimize();
            }
            finally
            {
                try
                {
                    if (writer != null)
                    {
                        writer.Dispose();
                    }
                }
                finally
                {
                    // Release the directory handles; the writer is closed first because it still uses them.
                    if (dirNo != null)
                    {
                        foreach (LuceneIO.Directory child in dirNo)
                        {
                            if (child != null)
                            {
                                child.Dispose();
                            }
                        }
                    }
                    if (directory != null)
                    {
                        directory.Dispose();
                    }
                    Console.WriteLine("MergeIndex End");
                }
            }
        }
复制代码

用lucene进行商品查询

复制代码
    /// <summary>
    /// Search facade for commodities: turns a keyword and category list into a Lucene
    /// query string and delegates the paged search to <c>LuceneQuery</c>.
    /// </summary>
    public class CommodityLucene
    {
        private static Logger logger = new Logger(typeof(CommodityLucene));

        #region QueryCommodity
        /// <summary>
        /// Searches commodities through Lucene.
        /// </summary>
        /// <param name="pageIndex">Page number, starting at 1.</param>
        /// <param name="pageSize">Number of items per page.</param>
        /// <param name="totalCount">Receives the total number of hits; 0 when no search is run or it fails.</param>
        /// <param name="keyword">Search keyword; may be empty when categories are given.</param>
        /// <param name="categoryIdList">Category ids; a commodity matches when it belongs to any of them.</param>
        /// <param name="priceFilter">Price range: "[13,50]" includes both bounds, "{13,50}" excludes them.</param>
        /// <param name="priceOrderBy">"price desc" or "price asc".</param>
        /// <returns>The requested page of commodities, or null when there is nothing to search for or the search fails.</returns>
        public static List<Commodity> QueryCommodity(int pageIndex, int pageSize, out int totalCount, string keyword, List<int> categoryIdList, string priceFilter, string priceOrderBy)
        {
            totalCount = 0;
            try
            {
                bool hasKeyword = !string.IsNullOrWhiteSpace(keyword);
                bool hasCategory = categoryIdList != null && categoryIdList.Count > 0;
                if (!hasKeyword && !hasCategory) return null;

                string keywordClause = hasKeyword ? AnalyzerKeyword(keyword) : "";
                string categoryClause = hasCategory ? AnalyzerCategory(categoryIdList) : "";

                // Each group is wrapped in parentheses before being marked required with '+'.
                // Without the parentheses '+' binds only to the first term of the group, which
                // would make the first word / first category mandatory instead of "any of them".
                StringBuilder queryStringBuilder = new StringBuilder();
                if (keywordClause.Length > 0)
                {
                    queryStringBuilder.AppendFormat(" +({0})", keywordClause);
                }
                if (categoryClause.Length > 0)
                {
                    queryStringBuilder.AppendFormat(" +({0})", categoryClause);
                }

                // The keyword can analyze to zero terms; with no category either there is nothing to query.
                if (queryStringBuilder.Length == 0) return null;

                ILuceneQuery luceneQuery = new LuceneQuery();
                return luceneQuery.QueryIndexPage(queryStringBuilder.ToString(), pageIndex, pageSize, out totalCount, priceFilter, priceOrderBy);
            }
            catch (Exception ex)
            {
                logger.Error(string.Format("QueryCommodity参数为{0}出现异常", keyword), ex);
                return null;
            }
        }
        #endregion QueryCommodity

        /// <summary>
        /// Splits the keyword into terms with the project's analyzer and turns them into
        /// query clauses on the "title" field: a single term becomes a prefix query
        /// (<c>title:term*</c>); several terms become space-separated optional clauses so that
        /// as many documents as possible are hit.
        /// </summary>
        /// <param name="keyword">Raw keyword entered by the user.</param>
        /// <returns>The space-separated clauses, or an empty string when the analyzer yields no terms.</returns>
        private static string AnalyzerKeyword(string keyword)
        {
            StringBuilder queryStringBuilder = new StringBuilder();
            ILuceneAnalyze analyzer = new LuceneAnalyze();
            string[] words = analyzer.AnalyzerKey(keyword);
            // NOTE(review): the terms are inserted into the query string unescaped; this presumably
            // relies on AnalyzerKey stripping query-syntax characters - confirm.
            if (words != null)
            {
                if (words.Length == 1)
                {
                    queryStringBuilder.AppendFormat("{0}:{1}* ", "title", words[0]);
                }
                else
                {
                    foreach (string word in words)
                    {
                        queryStringBuilder.AppendFormat("{0}:{1} ", "title", word);
                    }
                }
            }
            string result = queryStringBuilder.ToString().TrimEnd();
            logger.Info(string.Format("AnalyzerKeyword 将 keyword={0}转换为{1}", keyword, result));
            return result;
        }

        /// <summary>
        /// Turns the category ids into space-separated <c>categoryid:N</c> clauses.
        /// </summary>
        /// <param name="categoryIdList">Category ids; must not be null.</param>
        /// <returns>The space-separated clauses.</returns>
        private static string AnalyzerCategory(List<int> categoryIdList)
        {
            return string.Join(" ", categoryIdList.Select(c => string.Format("{0}:{1}", "categoryid", c)));
        }
    }
复制代码

 


 

 

posted @   明志德道  阅读(127)  评论(0编辑  收藏  举报
编辑推荐:
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· 一个费力不讨好的项目,让我损失了近一半的绩效!
阅读排行:
· 在鹅厂做java开发是什么体验
· 百万级群聊的设计实践
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
· 永远不要相信用户的输入:从 SQL 注入攻防看输入验证的重要性
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
点击右上角即可分享
微信分享提示