(精华)2020年8月18日 C#基础知识点 搜索引擎Lucene的使用

(精华)2020年8月18日 C#基础知识点 搜索引擎Lucene的使用

lucene的基本说明

/// lucene.net:全文检索的工具包,不是应用,只是个类库,完成了全文检索的功能
/// 就是把数据拆分—存起来—查询时—拆分—匹配—结果
///
/// Analysis–分词器,负责把字符串拆分成原子,包含了标准分词,直接空格拆分
/// 项目中用的是盘古中文分词,
/// Document–数据结构,定义存储数据的格式
/// Index–索引的读写类
/// QueryParser–查询解析器,负责解析查询语句
/// Search—负责各种查询类,命令解析后得到就是查询类
/// Store—索引存储类,负责文件夹等等
/// Util—常见工具类库
///
/// lucene是全文搜索必备的,是大型系统必备的
///
/// Search:
/// TermQuery–单元查询 new Term(“title”,“张三”) title:张三
/// BooleanQuery—new Term(“title”,“张三”) and new Term(“title”,“李四”) title:张三 + title:李四
/// new Term(“title”,“张三”) or new Term(“title”,“李四”) title:张三 title:李四
/// WildcardQuery—通配符 new Term(“title”,“张?”) title:张?
/// new Term(“title”,“张*”) title:张*
/// PrefixQuery—前缀查询 以xx开头 title:张*
/// PhraseQuery—间隔距离 包含没有 包含提莫 而且二者距离不能超过5
/// title: “没有 提莫”~5
/// 没有蘑菇的提莫 没有蘑菇的蘑菇的蘑菇的提莫
/// FuzzyQuery—近似查询,ibhone----iphone title:ibhone~
/// RangeQuery—范围查询 [1,100] {1,100}
///
/// Lucene.Net一进一出,建立索引需要获取数据源,分词-保存到硬盘
/// 索引查找,
/// 自然会有些延迟,以前淘宝上架宝贝,第二天才能搜索的
/// 索引更新策略:1 数据更新—丢一个队列—一个processor通过队列完成更新
/// 2 每一周全部索引一遍
///
/// lucene索引存的是原子–docid1,docid2,docid3
/// 不store可以大量节约空间;查找时原子匹配多个id;

第一步:lucene初始化索引

/// <summary>
/// 初始化索引
/// </summary>
public static void InitIndex()
{<!-- -->
    List<Commodity> commodityList = GetList();//数据源

    FSDirectory directory = FSDirectory.Open(StaticConstant.TestIndexPath);//文件夹
    using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED))//索引写入器
    {<!-- -->
        foreach (Commodity commdity in commodityList)
        {<!-- -->
            for (int k = 0; k < 10; k++)
            {<!-- -->
                Document doc = new Document();//一条数据
                doc.Add(new Field("id", commdity.Id.ToString(), Field.Store.NO, Field.Index.NOT_ANALYZED));//一个字段  列名  值   是否保存值  是否分词
                doc.Add(new Field("title", commdity.Title, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("url", commdity.Url, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("imageurl", commdity.ImageUrl, Field.Store.NO, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("content", "this is lucene working,powerful tool " + k, Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new NumericField("price", Field.Store.YES, true).SetDoubleValue((double)(commdity.Price + k)));
                //doc.Add(new NumericField("time", Field.Store.YES, true).SetLongValue(DateTime.Now.ToFileTimeUtc()));
                doc.Add(new NumericField("time", Field.Store.YES, true).SetIntValue(int.Parse(DateTime.Now.ToString("yyyyMMdd")) + k));
                writer.AddDocument(doc);//写进去
            }
        }
        writer.Optimize();//优化  就是合并
    }
}

基础的查询

FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath);
IndexSearcher searcher = new IndexSearcher(dir);//查找器
TermQuery query = new TermQuery(new Term("title", "图书馆"));//包含
TopDocs docs = searcher.Search(query, null, 10000);//找到的数据
foreach (ScoreDoc sd in docs.ScoreDocs)
{<!-- -->
    Document doc = searcher.Doc(sd.Doc);
    Console.WriteLine("***************************************");
    Console.WriteLine(string.Format("id={0}", doc.Get("id")));
    Console.WriteLine(string.Format("title={0}", doc.Get("title")));
    Console.WriteLine(string.Format("time={0}", doc.Get("time")));
    Console.WriteLine(string.Format("price={0}", doc.Get("price")));
    Console.WriteLine(string.Format("content={0}", doc.Get("content")));
}
Console.WriteLine("1一共命中了{0}个", docs.TotalHits);

关键字查询

FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath);
IndexSearcher searcher = new IndexSearcher(dir);//查找器
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer());//解析器
string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
Query query = parser.Parse(keyword);
TopDocs docs = searcher.Search(query, null, 10000);//找到的数据
int i = 0;
foreach (ScoreDoc sd in docs.ScoreDocs)
{<!-- -->
    if (i++ < 1000)
    {<!-- -->
        Document doc = searcher.Doc(sd.Doc);
        Console.WriteLine("***************************************");
        Console.WriteLine(string.Format("id={0}", doc.Get("id")));
        Console.WriteLine(string.Format("title={0}", doc.Get("title")));
        Console.WriteLine(string.Format("time={0}", doc.Get("time")));
        Console.WriteLine(string.Format("price={0}", doc.Get("price")));
    }
}
Console.WriteLine($"一共命中{docs.TotalHits}");

多条件查询,除了关键字,时间,排序

FSDirectory dir = FSDirectory.Open(StaticConstant.TestIndexPath);
IndexSearcher searcher = new IndexSearcher(dir);//查找器
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", new PanGuAnalyzer());//解析器
string keyword = "高中政治 人 教 新课 标 选修 生活 中的 法律常识";
Query query = parser.Parse(keyword);
NumericRangeFilter<int> timeFilter = NumericRangeFilter.NewIntRange("time", 20190101, 20191231, true, true);//过滤
SortField sortPrice = new SortField("price", SortField.DOUBLE, false);//降序
SortField sortTime = new SortField("time", SortField.INT, true);//升序
Sort sort = new Sort(sortTime, sortPrice);//排序 哪个前哪个后

TopDocs docs = searcher.Search(query, timeFilter, 10000, sort);//找到的数据
int i = 0;
foreach (ScoreDoc sd in docs.ScoreDocs)
{<!-- -->
    if (i++ < 1000)
    {<!-- -->
        Document doc = searcher.Doc(sd.Doc);
        Console.WriteLine("***************************************");
        Console.WriteLine(string.Format("id={0}", doc.Get("id")));
        Console.WriteLine(string.Format("title={0}", doc.Get("title")));
        Console.WriteLine(string.Format("time={0}", doc.Get("time")));
        Console.WriteLine(string.Format("price={0}", doc.Get("price")));
    }
}
Console.WriteLine("3一共命中了{0}个", docs.TotalHits);
/// 1 索引增删改查和分词处理
/// 2 京东数据多线程建立索引
/// 3 索引查询接口封装
///
/// Lucene–封装的lucene相关操作封装
///
/// LuceneAnalyze–负责完成查询关键字解析,尽可能拆分成原子数组
/// 如果只有一个词,prefix查询 苹果*
/// 如果是多个词,换成或者关系,
/// 都是为了更多的命中结果(贪婪搜索)
/// 做个关键词清理
///
/// LuceneBulid— BuildIndex–MergeIndex 多线程写不同子路径,完成后合并
/// 增加/删除索引 更新索引-只能先删除再更新
///
/// LuceneQuery—QueryIndexPage 支持关键字,支持范围过滤 支持排序
///
/// Processor—Lucene多线程建立索引
/// IndexBuilder 入口,启动多线程创建+完成后的Merge
/// IndexBuilderPerThread 每个线程是如何完成索引建立的
///
/// DataService–CommodityLucene对外提供的搜索封装
/// CommodityRepository-SqlHelper,完成数据库数据查询

批量索引建立

IndexBuilder.Build();
int total = 0;
string pricefilter = "[50,2000]";
string priceorderby = "price desc";
List<Commodity> commoditylist = CommodityLucene.QueryCommodity(1, 30, out total, "书", null, pricefilter, priceorderby);

foreach (Commodity commodity in commoditylist)
{<!-- -->
    Console.WriteLine("title={0},price={1}", commodity.Title, commodity.Price);
}
/// <summary>
    /// 索引建立
    /// </summary>
    public class IndexBuilder
    {<!-- -->
        private static Logger logger = new Logger(typeof(IndexBuilder));
        private static List<string> PathSuffixList = new List<string>();
        private static CancellationTokenSource CTS = null;

        public static void Build()
        {<!-- -->
            try
            {<!-- -->
                logger.Debug(string.Format("{0} BuildIndex开始",DateTime.Now));

                List<Task> taskList = new List<Task>();
                TaskFactory taskFactory = new TaskFactory();
                CTS = new CancellationTokenSource(); //线程取消
                //30个表  30个线程  不用折腾,一线程一表  平均分配
                //如果我不开启30 个线程,10 个线程?
                //自己去想想,怎么样可以做,随便配置线程数量,但是可以均匀分配任务?
                for (int i = 1; i < 31; i++)
                {<!-- -->
                    IndexBuilderPerThread thread = new IndexBuilderPerThread(i, i.ToString("000"), CTS);
                    PathSuffixList.Add(i.ToString("000"));
                    taskList.Add(taskFactory.StartNew(thread.Process));//开启一个线程   里面创建索引
                }
                taskList.Add(taskFactory.ContinueWhenAll(taskList.ToArray(), MergeIndex));
                Task.WaitAll(taskList.ToArray());  //为了展示出多线程的异常
                logger.Debug(string.Format("BuildIndex{0}", CTS.IsCancellationRequested ? "失败" : "成功"));
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("BuildIndex出现异常", ex);
            }
            finally
            {<!-- -->
                logger.Debug(string.Format("{0} BuildIndex结束", DateTime.Now));
            }
        }

        private static void MergeIndex(Task[] tasks)
        {<!-- -->
            try
            {<!-- -->
                if (CTS.IsCancellationRequested) return;
                ILuceneBulid builder = new LuceneBulid();
                builder.MergeIndex(PathSuffixList.ToArray());
            }
            catch (Exception ex)
            {<!-- -->
                CTS.Cancel();
                logger.Error("MergeIndex出现异常", ex);
            }
        }
    }
public class IndexBuilderPerThread
    {<!-- -->
        private Logger logger = new Logger(typeof(IndexBuilderPerThread));
        private int CurrentThreadNum = 0;
        private string PathSuffix = "";
        private CancellationTokenSource CTS = null;
        public IndexBuilderPerThread(int threadNum, string pathSuffix, CancellationTokenSource cts)
        {<!-- -->
            this.CurrentThreadNum = threadNum;
            this.PathSuffix = pathSuffix;
            this.CTS = cts;
        }

        public void Process()
        {<!-- -->
            try
            {<!-- -->
                logger.Debug(string.Format("ThreadNum={0}开始创建", CurrentThreadNum));
                CommodityRepository commodityRepository = new CommodityRepository();
                ILuceneBulid builder = new LuceneBulid();
                bool isFirst = true;
                int pageIndex = 1;
                while (!CTS.IsCancellationRequested)
                {<!-- -->
                    List<CourseEntity> commodityList = commodityRepository.QueryList(CurrentThreadNum, pageIndex, 1000);
                    if (commodityList == null || commodityList.Count == 0)
                    {<!-- -->
                        break;
                    }
                    //else if (pageIndex == 11)
                    //{<!-- -->
                    //    break;//为了测试  只做10000条数据
                    //}
                    else
                    {<!-- -->
                        builder.BuildIndex(commodityList, PathSuffix, isFirst);
                        logger.Debug(string.Format("ThreadNum={0}完成{1}条的创建", CurrentThreadNum, 1000 * pageIndex++));
                        isFirst = false;
                    }
                }
            }
            catch (Exception ex)
            {<!-- -->
                CTS.Cancel();
                logger.Error(string.Format("ThreadNum={0}出现异常", CurrentThreadNum), ex);
            }
            finally
            {<!-- -->
                logger.Debug(string.Format("ThreadNum={0}完成创建", CurrentThreadNum));
            }
        }
    }

备注:相关类

/// <summary>
/// 空格分词器
/// </summary>
public class BlankAnalyzer : Analyzer
{<!-- -->
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {<!-- -->
        return new BlankTokenizer(reader);
    }
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {<!-- -->
        Tokenizer tokenizer = (Tokenizer)this.PreviousTokenStream;
        if (tokenizer == null)
        {<!-- -->
            tokenizer = new BlankTokenizer(reader);
            this.PreviousTokenStream = tokenizer;
        }
        else
        {<!-- -->
            tokenizer.Reset(reader);
        }
        return tokenizer;
    }
}
public class BlankTokenizer : CharTokenizer
    {<!-- -->
        public BlankTokenizer(TextReader in_Renamed)
            : base(in_Renamed)
        {<!-- -->
        }
        public BlankTokenizer(AttributeSource source, TextReader in_Renamed)
            : base(source, in_Renamed)
        {<!-- -->
        }
        public BlankTokenizer(AttributeSource.AttributeFactory factory, TextReader in_Renamed)
            : base(factory, in_Renamed)
        {<!-- -->
        }
        protected override bool IsTokenChar(char c)
        {<!-- -->
            return c != ' ';
        }
    }
    /// <summary>
    /// 逗号分词器
    /// </summary>
    public class CommaAnalyzer : Analyzer
    {<!-- -->
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {<!-- -->
            return new CommaTokenizer(reader);
        }
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {<!-- -->
            Tokenizer tokenizer = (Tokenizer)this.PreviousTokenStream;
            if (tokenizer == null)
            {<!-- -->
                tokenizer = new CommaTokenizer(reader);
                this.PreviousTokenStream = tokenizer;
            }
            else
            {<!-- -->
                tokenizer.Reset(reader);
            }
            return tokenizer;
        }
    }
    public class CommaTokenizer : CharTokenizer
    {<!-- -->
        public CommaTokenizer(TextReader in_Renamed)
            : base(in_Renamed)
        {<!-- -->
        }
        public CommaTokenizer(AttributeSource source, TextReader in_Renamed)
            : base(source, in_Renamed)
        {<!-- -->
        }
        public CommaTokenizer(AttributeSource.AttributeFactory factory, TextReader in_Renamed)
            : base(factory, in_Renamed)
        {<!-- -->
        }
        protected override bool IsTokenChar(char c)
        {<!-- -->
            return c != ',';
        }
    }
public interface ILuceneAnalyze
    {<!-- -->
        /// <summary>
        /// 根据查询的field将keyword分词
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        string[] AnalyzerKey(string keyword);
    }
    public interface ILuceneBulid
    {<!-- -->
        /// <summary>
        /// 批量创建索引
        /// </summary>
        /// <param name="ciList"></param>
        /// <param name="pathSuffix">索引目录后缀,加在电商的路径后面,为空则为根目录.如sa\1</param>
        /// <param name="isCreate">默认为false 增量索引  true的时候删除原有索引</param>
        void BuildIndex(List<CourseEntity> ciList, string pathSuffix = "", bool isCreate = false);

        /// <summary>
        /// 将索引合并到上级目录
        /// </summary>
        /// <param name="sourceDir">子文件夹名</param>
        void MergeIndex(string[] sourceDirs);

        /// <summary>
        /// 新增一条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        void InsertIndex(CourseEntity ci);

        /// <summary>
        /// 批量新增数据的索引
        /// </summary>
        /// <param name="ciList">sourceflag统一的</param>
        void InsertIndexMuti(List<CourseEntity> ciList);

        /// <summary>
        /// 删除一条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        void DeleteIndex(CourseEntity ci);

        /// <summary>
        /// 批量删除数据的索引
        /// </summary>
        /// <param name="ciList">sourceflag统一的</param>
        void DeleteIndexMuti(List<CourseEntity> ciList);

        /// <summary>
        /// 更新一条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        void UpdateIndex(CourseEntity ci);

        /// <summary>
        /// 批量更新数据的索引
        /// </summary>
        /// <param name="ciList">sourceflag统一的</param>
        void UpdateIndexMuti(List<CourseEntity> ciList);
    }
    public interface ILuceneQuery
    {<!-- -->
        /// <summary>
        /// 获取课程信息数据
        /// </summary>
        /// <param name="queryString"></param>
        /// <returns></returns>
        List<CourseEntity> QueryIndex(string queryString);

        /// <summary>
        /// 分页获取商品信息数据
        /// </summary>
        /// <param name="queryString"></param>
        /// <param name="pageIndex">第一页为1</param>
        /// <param name="pageSize"></param>
        /// <param name="totalCount"></param>
        /// <returns></returns>
        List<CourseEntity> QueryIndexPage(string queryString, int pageIndex, int pageSize, out int totalCount, string priceFilter, string priceOrderBy);
    }
public class LuceneAnalyze : ILuceneAnalyze
    {<!-- -->
        private Logger logger = new Logger(typeof(LuceneAnalyze));

        #region AnalyzerKey
        /// <summary>
        /// 将搜索的keyword分词
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public string[] AnalyzerKey(string keyword)
        {<!-- -->
            Analyzer analyzer = new PanGuAnalyzer();
            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(this.CleanKeyword(keyword));
            if (query is TermQuery)
            {<!-- -->
                Term term = ((TermQuery)query).Term;
                return new string[] {<!-- --> term.Text };
            }
            else if (query is PhraseQuery)
            {<!-- -->
                Term[] term = ((PhraseQuery)query).GetTerms();
                return term.Select(t => t.Text).ToArray();
            }
            else if (query is BooleanQuery)
            {<!-- -->
                BooleanClause[] clauses = ((BooleanQuery)query).GetClauses();
                List<string> analyzerWords = new List<string>();
                foreach (BooleanClause clause in clauses)
                {<!-- -->
                    Query childQuery = clause.Query;
                    if (childQuery is TermQuery)
                    {<!-- -->
                        Term term = ((TermQuery)childQuery).Term;
                        analyzerWords.Add(term.Text);
                    }
                    else if (childQuery is PhraseQuery)
                    {<!-- -->
                        Term[] term = ((PhraseQuery)childQuery).GetTerms();
                        analyzerWords.AddRange(term.Select(t => t.Text));
                    }
                }
                return analyzerWords.ToArray();
            }
            else
            {<!-- -->
                logger.Debug(string.Format("AnalyzerKey在解析keyword={0}的结果为new string[] { keyword } ", keyword));
                return new string[] {<!-- --> keyword };
            }
        }

        /// <summary>
        /// 清理头尾and or 关键字
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        private string CleanKeyword(string keyword)
        {<!-- -->
            if (string.IsNullOrWhiteSpace(keyword))
            {<!-- --> }
            else
            {<!-- -->
                bool isClean = false;
                while (!isClean)
                {<!-- -->
                    keyword = keyword.Trim();
                    if (keyword.EndsWith(" AND"))
                    {<!-- -->
                        keyword = string.Format("{0}and", keyword.Remove(keyword.Length - 3, 3));
                    }
                    else if (keyword.EndsWith(" OR"))
                    {<!-- -->
                        keyword = string.Format("{0}or", keyword.Remove(keyword.Length - 2, 2));
                    }
                    else if (keyword.StartsWith("AND "))
                    {<!-- -->
                        keyword = string.Format("and{0}", keyword.Substring(3));
                    }
                    else if (keyword.StartsWith("OR "))
                    {<!-- -->
                        keyword = string.Format("or{0}", keyword.Substring(2));
                    }
                    else if (keyword.Contains(" OR "))
                    {<!-- -->
                        keyword = keyword.Replace(" OR ", " or ");
                    }
                    else if (keyword.Contains(" AND "))
                    {<!-- -->
                        keyword = keyword.Replace(" AND ", " and ");
                    }
                    else
                        isClean = true;
                }

            }
            return QueryParser.Escape(keyword);
        }
        #endregion AnalyzerKey
    }
/// <summary>
    /// 多线程的问题 :多文件写,然后合并
    /// 延时:异步队列
    /// 
    /// </summary>
    public class LuceneBulid : ILuceneBulid
    {<!-- -->
        #region Identity
        private Logger logger = new Logger(typeof(LuceneBulid));
        #endregion Identity

        #region 批量BuildIndex 索引合并
        /// <summary>
        /// 批量创建索引(要求是统一的sourceflag,即目录是一致的)
        /// </summary>
        /// <param name="ciList">sourceflag统一的</param>
        /// <param name="pathSuffix">索引目录后缀,加在电商的路径后面,为空则为根目录.如sa\1</param>
        /// <param name="isCreate">默认为false 增量索引  true的时候删除原有索引</param>
        public void BuildIndex(List<CourseEntity> ciList, string pathSuffix = "", bool isCreate = false)
        {<!-- -->
            IndexWriter writer = null;
            try
            {<!-- -->
                if (ciList == null || ciList.Count == 0)
                {<!-- -->
                    return;
                }

                string rootIndexPath = StaticConstant.IndexPath;
                string indexPath = string.IsNullOrWhiteSpace(pathSuffix) ? rootIndexPath : string.Format("{0}\\{1}", rootIndexPath, pathSuffix);

                DirectoryInfo dirInfo = Directory.CreateDirectory(indexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, new PanGuAnalyzer(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                //writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.SetMaxBufferedDocs(100);//控制写入一个新的segent前内存中保存的doc的数量 默认10  
                writer.MergeFactor = 100;//控制多个segment合并的频率,默认10
                writer.UseCompoundFile = true;//创建复合文件 减少索引文件数量

                ciList.ForEach(c => CreateCIIndex(writer, c));
            }
            finally
            {<!-- -->
                if (writer != null)
                {<!-- -->
                    //writer.Optimize(); 创建索引的时候不做合并  merge的时候处理
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// 将索引合并到上级目录
        /// </summary>
        /// <param name="sourceDir">子文件夹名</param>
        public void MergeIndex(string[] childDirs)
        {<!-- -->
            Console.WriteLine("MergeIndex Start");
            IndexWriter writer = null;
            try
            {<!-- -->
                if (childDirs == null || childDirs.Length == 0) return;
                Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                string rootPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//删除原有的
                LuceneIO.Directory[] dirNo = childDirs.Select(dir => LuceneIO.FSDirectory.Open(Directory.CreateDirectory(string.Format("{0}\\{1}", rootPath, dir)))).ToArray();
                writer.MergeFactor = 100;//控制多个segment合并的频率,默认10
                writer.UseCompoundFile = true;//创建符合文件 减少索引文件数量
                writer.AddIndexesNoOptimize(dirNo);
            }
            finally
            {<!-- -->
                if (writer != null)
                {<!-- -->
                    writer.Optimize();
                    writer.Close();
                }
                Console.WriteLine("MergeIndex End");
            }
        }


        //总结来说:之前开发系统的时候,做的是数据的增删改,这里改成对索引的增删改

        //Field.Store.YES:存储字段值(未分词前的字段值)        
        //Field.Store.NO:不存储,存储与索引没有关系         
        //Field.Store.COMPRESS:压缩存储,用于长文本或二进制,但性能受损         
        //Field.Index.ANALYZED:分词建索引         
        //Field.Index.ANALYZED_NO_NORMS:分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间         
        //Field.Index.NOT_ANALYZED:不分词且索引         
        //Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引,Field的值去一个byte保存         
        //TermVector表示文档的条目(由一个Document和Field定位)和它们在当前文档中所出现的次数         
        //Field.TermVector.YES:为每个文档(Document)存储该字段的TermVector         
        //Field.TermVector.NO:不存储TermVector         
        // Field.TermVector.WITH_POSITIONS:存储位置        
        //Field.TermVector.WITH_OFFSETS:存储偏移量         
        //Field.TermVector.WITH_POSITIONS_OFFSETS:存储位置和偏移量
        #endregion 批量BuildIndex 索引合并

        #region 单个/批量索引增删改
        /// <summary>
        /// 新增一条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        public void InsertIndex(CourseEntity ci)
        {<!-- -->
            IndexWriter writer = null;
            try
            {<!-- -->
                if (ci == null) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;//下面没有文件则为新建索引 
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 100;//控制多个segment合并的频率,默认10
                writer.UseCompoundFile = true;//创建符合文件 减少索引文件数量
                CreateCIIndex(writer, ci);
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("InsertIndex异常", ex);
                throw ex;
            }
            finally
            {<!-- -->
                if (writer != null)
                {<!-- -->
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// 批量新增数据的索引
        /// </summary>
        /// <param name="ciList"></param>
        public void InsertIndexMuti(List<CourseEntity> ciList)
        {<!-- -->
            BuildIndex(ciList, "", false);
        }

        /// <summary>
        /// 批量删除数据的索引
        /// </summary>
        /// <param name="ciList"></param>
        public void DeleteIndexMuti(List<CourseEntity> ciList)
        {<!-- -->
            IndexReader reader = null;
            try
            {<!-- -->
                if (ciList == null || ciList.Count == 0) return;
                Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                reader = IndexReader.Open(directory, false);
                foreach (CourseEntity ci in ciList)
                {<!-- -->
                    reader.DeleteDocuments(new Term("productid", ci.CourseId.ToString()));
                }
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("DeleteIndex异常", ex);
                throw ex;
            }
            finally
            {<!-- -->
                if (reader != null)
                {<!-- -->
                    reader.Dispose();
                }
            }
        }

        /// <summary>
        /// 删除多条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        public void DeleteIndex(CourseEntity ci)
        {<!-- -->
            IndexReader reader = null;
            try
            {<!-- -->
                if (ci == null) return;
                Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                reader = IndexReader.Open(directory, false);
                reader.DeleteDocuments(new Term("productid", ci.CourseId.ToString()));
            }
            catch (Exception ex)
            {<!-- -->

                logger.Error("DeleteIndex异常", ex);
                throw ex;
            }
            finally
            {<!-- -->
                if (reader != null)
                {<!-- -->
                    reader.Dispose();
                }
            }
        }

        / <summary>
        / 更新一条数据的索引
        / </summary>
        //public void UpdateIndex(Commodity ci)
        //{<!-- -->
        //    DeleteIndex(ci);
        //    InsertIndex(ci);
        //}

        /// <summary>
        /// 更新一条数据的索引
        /// </summary>
        /// <param name="ci"></param>
        public void UpdateIndex(CourseEntity ci)
        {<!-- -->
            IndexWriter writer = null;
            try
            {<!-- -->
                if (ci == null) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;//下面没有文件则为新建索引 
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 100;//控制多个segment合并的频率,默认10
                writer.UseCompoundFile = true;//创建符合文件 减少索引文件数量
                writer.UpdateDocument(new Term("productid", ci.CourseId.ToString()), ParseCItoDoc(ci));
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("InsertIndex异常", ex);
                throw ex;
            }
            finally
            {<!-- -->
                if (writer != null)
                {<!-- -->
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// 批量更新数据的索引
        /// </summary>
        /// <param name="ciList">sourceflag统一的</param>
        public void UpdateIndexMuti(List<CourseEntity> ciList)
        {<!-- -->
            IndexWriter writer = null;
            try
            {<!-- -->
                if (ciList == null || ciList.Count == 0) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;//下面没有文件则为新建索引 
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 50;//控制多个segment合并的频率,默认10
                writer.UseCompoundFile = true;//创建符合文件 减少索引文件数量
                foreach (CourseEntity ci in ciList)
                {<!-- -->
                    writer.UpdateDocument(new Term("productid", ci.CourseId.ToString()), ParseCItoDoc(ci));
                }
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("InsertIndex异常", ex);
                throw ex;
            }
            finally
            {<!-- -->
                if (writer != null)
                {<!-- -->
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }
        #endregion 单个索引增删改

        #region PrivateMethod
        /// <summary>
        /// 创建分析器
        /// </summary>
        /// <returns></returns>
        private PerFieldAnalyzerWrapper CreateAnalyzerWrapper()
        {<!-- -->
            Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

            PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer);
            analyzerWrapper.AddAnalyzer("title", new PanGuAnalyzer());
            analyzerWrapper.AddAnalyzer("categoryid", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
            return analyzerWrapper;
        }

        /// <summary>
        /// 创建索引
        /// </summary>
        /// <param name="analyzer"></param>
        /// <param name="title"></param>
        /// <param name="content"></param>
        private void CreateCIIndex(IndexWriter writer, CourseEntity ci)
        {<!-- -->
            try
            {<!-- -->
                writer.AddDocument(ParseCItoDoc(ci));
            }
            catch (Exception ex)
            {<!-- -->
                logger.Error("CreateCIIndex异常", ex);
                throw ex;
            }
        }

        /// <summary>
        /// 将Commodity转换成doc
        /// </summary>
        /// <param name="ci"></param>
        /// <returns></returns>
        private Document ParseCItoDoc(CourseEntity ci)
        {<!-- -->
            Document doc = new Document();

            doc.Add(new Field("id", ci.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("title", ci.Title, Field.Store.YES, Field.Index.ANALYZED));//盘古分词
            doc.Add(new Field("courseId", ci.CourseId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("categoryid", ci.CategoryId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("imageurl", ci.ImageUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("url", ci.Url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new NumericField("price", Field.Store.YES, true).SetFloatValue((float)ci.Price));
            return doc;
        }

        #endregion PrivateMethod
    }
public class LuceneQuery : ILuceneQuery
{
    #region Identity
    private Logger logger = new Logger(typeof(LuceneQuery));
    #endregion Identity

    #region QueryIndex
    /// <summary>
    /// Runs a query against the course index and returns every match (capped at 10000).
    /// </summary>
    /// <param name="queryString">Lucene query expression; unqualified terms search "title".</param>
    /// <returns>All matching course entities.</returns>
    public List<CourseEntity> QueryIndex(string queryString)
    {
        IndexSearcher searcher = null;
        try
        {
            List<CourseEntity> ciList = new List<CourseEntity>();
            Directory dir = FSDirectory.Open(StaticConstant.IndexPath);
            searcher = new IndexSearcher(dir);
            Analyzer analyzer = new PanGuAnalyzer();

            // Parse the query string; "title" is the default field for bare terms.
            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(queryString);
            Console.WriteLine(query.ToString()); //显示搜索表达式
            TopDocs docs = searcher.Search(query, (Filter)null, 10000);

            foreach (ScoreDoc sd in docs.ScoreDocs)
            {
                Document doc = searcher.Doc(sd.Doc);
                ciList.Add(DocumentToCommodityInfo(doc));
            }

            return ciList;
        }
        finally
        {
            if (searcher != null)
            {
                searcher.Dispose();
            }
        }
    }

    /// <summary>
    /// Pages through index matches with an optional price range filter and price ordering.
    /// </summary>
    /// <param name="queryString">Lucene query expression; unqualified terms search "title".</param>
    /// <param name="pageIndex">1-based page index (values below 1 are clamped to 1).</param>
    /// <param name="pageSize">Rows per page.</param>
    /// <param name="totalCount">Total number of hits for the query.</param>
    /// <param name="priceFilter">"[a,b]" inclusive or "{a,b}" exclusive price range; null/blank for none.</param>
    /// <param name="priceOrderBy">"price asc" / "price desc"; null/blank for relevance order.</param>
    /// <returns>The requested page of course entities.</returns>
    public List<CourseEntity> QueryIndexPage(string queryString, int pageIndex, int pageSize, out int totalCount, string priceFilter, string priceOrderBy)
    {
        totalCount = 0;
        IndexSearcher searcher = null;
        try
        {
            List<CourseEntity> ciList = new List<CourseEntity>();
            FSDirectory dir = FSDirectory.Open(StaticConstant.IndexPath);
            searcher = new IndexSearcher(dir);
            Analyzer analyzer = new PanGuAnalyzer();

            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(queryString);

            pageIndex = Math.Max(1, pageIndex); // page numbers start at 1
            int startIndex = (pageIndex - 1) * pageSize;
            int endIndex = pageIndex * pageSize;

            NumericRangeFilter<float> numPriceFilter = null;
            if (!string.IsNullOrWhiteSpace(priceFilter))
            {
                // "[a,b]" includes its bounds, "{a,b}" excludes them.
                bool isContainStart = priceFilter.StartsWith("[");
                bool isContainEnd = priceFilter.EndsWith("]");
                string[] floatArray = priceFilter.Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Split(',');
                float start = 0;
                float end = 0;
                // Also guard against a filter with fewer than two components,
                // which previously threw IndexOutOfRangeException instead.
                if (floatArray.Length < 2 || !float.TryParse(floatArray[0], out start) || !float.TryParse(floatArray[1], out end))
                {
                    throw new Exception("Wrong priceFilter");
                }
                numPriceFilter = NumericRangeFilter.NewFloatRange("price", start, end, isContainStart, isContainEnd);
            }

            Sort sort = new Sort();
            if (!string.IsNullOrWhiteSpace(priceOrderBy))
            {
                // SortField's third argument is "reverse": true means DESCENDING.
                // The old code passed EndsWith("asc"), which inverted the order.
                bool reverse = priceOrderBy.EndsWith("desc", StringComparison.CurrentCultureIgnoreCase);
                SortField sortField = new SortField("price", SortField.FLOAT, reverse);
                sort.SetSort(sortField);
            }

            TopDocs docs = searcher.Search(query, numPriceFilter, 500, sort);

            totalCount = docs.TotalHits;
            //PrintScores(docs, startIndex, endIndex, searcher);
            // TotalHits may exceed the 500 collected docs, so bound the loop by
            // ScoreDocs.Length too — indexing past it threw on deep pages.
            for (int i = startIndex; i < endIndex && i < totalCount && i < docs.ScoreDocs.Length; i++)
            {
                Document doc = searcher.Doc(docs.ScoreDocs[i].Doc);
                ciList.Add(DocumentToCommodityInfo(doc));
            }

            return ciList;
        }
        finally
        {
            if (searcher != null)
            {
                searcher.Dispose();
            }
        }
    }

    /// <summary>
    /// Diagnostic helper: logs the relevance score of each doc in the page window.
    /// Parameter widened from MultiSearcher to Searcher so the (currently commented)
    /// call site, which passes an IndexSearcher, compiles; MultiSearcher still works.
    /// </summary>
    private void PrintScores(TopDocs docs, int startIndex, int endIndex, Searcher searcher)
    {
        ScoreDoc[] scoreDocs = docs.ScoreDocs;
        for (int i = startIndex; i < endIndex && i < scoreDocs.Length; i++)
        {
            int docId = scoreDocs[i].Doc;
            Document doc = searcher.Doc(docId);
            // Use the field name actually written to the index ("courseId");
            // "productid" is never stored and always came back null.
            logger.Info(string.Format("{0}的分值为{1}", doc.Get("courseId"), scoreDocs[i].Score));
        }
    }

    #endregion QueryIndex

    #region private
    /// <summary>
    /// Rehydrates a CourseEntity from a stored Lucene document.
    /// </summary>
    private CourseEntity DocumentToCommodityInfo(Document doc)
    {
        return new CourseEntity()
        {
            Id = int.Parse(doc.Get("id")),
            Title = doc.Get("title"),
            // Field names must match those written by the indexer:
            // the old "productid"/"iamgeurl" keys do not exist in the index,
            // so long.Parse(null) threw and ImageUrl was always null.
            CourseId = long.Parse(doc.Get("courseId")),
            CategoryId = int.Parse(doc.Get("categoryid")),
            ImageUrl = doc.Get("imageurl"),
            Price = decimal.Parse(doc.Get("price")),
            Url = doc.Get("url")
        };
    }

    #endregion private
}
public class CommodityLucene
{
    private static Logger logger = new Logger(typeof(CommodityLucene));

    #region QueryCommodity
    /// <summary>
    /// 用lucene进行商品查询 — combines an analyzed keyword match on the title
    /// with an optional category filter, price range and price ordering.
    /// </summary>
    /// <param name="pageIndex">1-based page index.</param>
    /// <param name="pageSize">Rows per page.</param>
    /// <param name="totalCount">Total hit count for the query.</param>
    /// <param name="keyword">Free-text keyword; PanGu-tokenized before querying.</param>
    /// <param name="categoryIdList">Category ids; a match on any one of them is required.</param>
    /// <param name="priceFilter">[13,50]  13,50且包含13到50   {13,50}  13,50且不包含13到50</param>
    /// <param name="priceOrderBy">price desc   price asc</param>
    /// <returns>The page of matches, or null when there is nothing to search by or the query fails.</returns>
    public static List<CourseEntity> QueryCommodity(int pageIndex, int pageSize, out int totalCount, string keyword, List<int> categoryIdList, string priceFilter, string priceOrderBy)
    {
        totalCount = 0;
        try
        {
            bool hasKeyword = !string.IsNullOrWhiteSpace(keyword);
            bool hasCategories = categoryIdList != null && categoryIdList.Count > 0;
            if (!hasKeyword && !hasCategories) return null; // nothing to search by

            ILuceneQuery luceneQuery = new LuceneQuery();
            // Each group is wrapped in parentheses before the "+" (MUST) operator:
            // "+title:a title:b" would only require the first clause, whereas
            // "+(title:a title:b)" requires at least one clause from the group,
            // giving AND semantics between the keyword group and the category group.
            string queryString = string.Format(" {0} {1}",
                                                hasKeyword ? string.Format(" +({0})", AnalyzerKeyword(keyword)) : "",
                                                hasCategories ? string.Format(" +({0})", AnalyzerCategory(categoryIdList)) : "");

            return luceneQuery.QueryIndexPage(queryString, pageIndex, pageSize, out totalCount, priceFilter, priceOrderBy);
        }
        catch (Exception ex)
        {
            logger.Error(string.Format("QueryCommodity参数为{0}出现异常", keyword), ex);
            return null; // best-effort: a search failure degrades to "no results"
        }
    }
    #endregion QueryCommodity

    /// <summary>
    /// 为keyword做盘古分词 — turns a free-text keyword into title clauses.
    /// A single token gets a trailing wildcard (prefix match); multiple tokens
    /// become plain OR-ed term clauses.
    /// </summary>
    /// <param name="keyword">Raw user keyword.</param>
    /// <returns>Space-separated "title:token" clauses.</returns>
    private static string AnalyzerKeyword(string keyword)
    {
        StringBuilder queryStringBuilder = new StringBuilder();
        ILuceneAnalyze analyzer = new LuceneAnalyze();
        string[] words = analyzer.AnalyzerKey(keyword);
        if (words.Length == 1)
        {
            queryStringBuilder.AppendFormat("{0}:{1}* ", "title", words[0]);
        }
        else
        {
            // (Removed an unused StringBuilder that the old code allocated here.)
            foreach (string word in words)
            {
                queryStringBuilder.AppendFormat("{0}:{1} ", "title", word);
            }
        }
        string result = queryStringBuilder.ToString().TrimEnd();
        logger.Info(string.Format("AnalyzerKeyword 将 keyword={0}转换为{1}", keyword, result));
        return result;
    }

    /// <summary>
    /// 为类别做custom分词 — one "categoryid:N" clause per requested category.
    /// </summary>
    private static string AnalyzerCategory(List<int> categoryIdList)
    {
        return string.Join(" ", categoryIdList.Select(c => string.Format("{0}:{1}", "categoryid", c)));
    }
}
}
/// <summary>
    /// 数据库查询
    /// </summary>
    public class CommodityRepository //: IRepository<Commodity>
    {
        private Logger logger = new Logger(typeof(CommodityRepository));

        /// <summary>
        /// Persists the list, grouping entities by their shard table so each
        /// table receives one batched INSERT round-trip.
        /// </summary>
        /// <param name="commodityList">Entities to save; null/empty is a no-op.</param>
        public void SaveList(List<CourseEntity> commodityList)
        {
            if (commodityList == null || commodityList.Count == 0) return;
            IEnumerable<IGrouping<string, CourseEntity>> group = commodityList.GroupBy<CourseEntity, string>(c => GetTableName(c));

            foreach (var data in group)
            {
                SqlHelper.InsertList<CourseEntity>(data.ToList(), data.Key);
            }
        }

        /// <summary>
        /// Maps a course to one of 30 shard tables (Tencent_Subject_001..030) by CourseId.
        /// </summary>
        private string GetTableName(CourseEntity commodity)
        {
            return string.Format("Tencent_Subject_{0}", (commodity.CourseId % 30 + 1).ToString("000"));
        }

        /// <summary>
        /// 分页获取商品数据 — paged read from one shard table.
        /// </summary>
        /// <param name="tableNum">Shard number (rendered as three digits).</param>
        /// <param name="pageIndex">从1开始 (1-based page index).</param>
        /// <param name="pageSize">Rows per page.</param>
        /// <returns>The requested page of rows.</returns>
        public List<CourseEntity> QueryList(int tableNum, int pageIndex, int pageSize)
        {
            // ORDER BY makes TOP deterministic; without it SQL Server may return an
            // arbitrary subset, so consecutive pages could overlap or skip rows.
            // NOTE(review): "id > offset" keyset paging assumes ids are contiguous
            // starting at 1 — confirm against the table's identity seed/gaps.
            string sql = string.Format("SELECT top {2} * FROM Tencent_Subject_{0} WHERE id>{1} ORDER BY id;", tableNum.ToString("000"), pageSize * Math.Max(0, pageIndex - 1), pageSize);
            return SqlHelper.QueryList<CourseEntity>(sql);
        }
    }
 public class SqlHelper
 {
     private static Logger logger = new Logger(typeof(SqlHelper));
     // NOTE(review): a missing "TencentConn" connection string makes this throw a
     // TypeInitializationException on first use of the class.
     private static string ConnStr = ConfigurationManager.ConnectionStrings["TencentConn"].ConnectionString;

     /// <summary>
     /// Executes a non-query statement on a fresh, properly-disposed connection.
     /// </summary>
     /// <param name="sql">SQL batch to run. Must already be safe — see GetInsertSql.</param>
     public static void ExecuteNonQuery(string sql)
     {
         using (SqlConnection sqlConn = new SqlConnection(ConnStr))
         using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
         {
             sqlConn.Open();
             cmd.ExecuteNonQuery();
         }
     }

     /// <summary>
     /// Runs a SELECT and materializes every row into a T via reflection.
     /// Column names must match T's property names.
     /// </summary>
     public static List<T> QueryList<T>(string sql) where T : new()
     {
         using (SqlConnection sqlConn = new SqlConnection(ConnStr))
         using (SqlCommand cmd = new SqlCommand(sql, sqlConn))
         {
             sqlConn.Open();
             cmd.CommandTimeout = 120; // large shard scans can exceed the 30s default
             using (SqlDataReader reader = cmd.ExecuteReader())
             {
                 return TransList<T>(reader);
             }
         }
     }

     /// <summary>Inserts a single model into the given table.</summary>
     public static void Insert<T>(T model, string tableName) where T : new()
     {
         ExecuteNonQuery(GetInsertSql<T>(model, tableName));
     }

     /// <summary>Inserts a list as one concatenated batch (single round-trip).</summary>
     public static void InsertList<T>(List<T> list, string tableName) where T : new()
     {
         string sql = string.Join(" ", list.Select(t => GetInsertSql<T>(t, tableName)));
         ExecuteNonQuery(sql);
     }

     #region Private
     /// <summary>
     /// Builds an INSERT covering every public property except "id".
     /// WARNING: values are inlined into the SQL text. Single quotes are now
     /// escaped so a value like "O'Neil" cannot break the statement or inject
     /// SQL, but parameterized commands remain the safe long-term fix.
     /// </summary>
     private static string GetInsertSql<T>(T model, string tableName)
     {
         StringBuilder sbFields = new StringBuilder();
         StringBuilder sbValues = new StringBuilder();

         foreach (PropertyInfo p in model.GetType().GetProperties())
         {
             if (p.Name.Equals("id", StringComparison.OrdinalIgnoreCase)) continue;
             sbFields.AppendFormat("[{0}],", p.Name);
             object value = p.GetValue(model);
             // Null becomes an empty string (matches the old behavior); embedded
             // quotes are doubled per T-SQL escaping rules.
             string text = value == null ? "" : value.ToString().Replace("'", "''");
             sbValues.AppendFormat("'{0}',", text);
         }
         return string.Format("INSERT INTO {0} ({1}) VALUES ({2});", tableName, sbFields.ToString().TrimEnd(','), sbValues.ToString().TrimEnd(','));
     }

     /// <summary>Maps every row of the reader to a new T.</summary>
     private static List<T> TransList<T>(SqlDataReader reader) where T : new()
     {
         List<T> tList = new List<T>();
         // Reflection hoisted out of the row loop — properties don't change per row.
         PropertyInfo[] properties = typeof(T).GetProperties();
         while (reader.Read())
         {
             T t = new T();
             foreach (PropertyInfo p in properties)
             {
                 SetPropertyValue(t, p, reader[p.Name]);
             }
             tList.Add(t);
         }
         return tList;
     }

     /// <summary>
     /// Maps the reader to a single T. Preserves the original semantics: every
     /// row is applied to the same instance, so the LAST row's values win.
     /// </summary>
     private static T TransModel<T>(SqlDataReader reader) where T : new()
     {
         T t = new T();
         PropertyInfo[] properties = typeof(T).GetProperties();
         while (reader.Read())
         {
             foreach (PropertyInfo p in properties)
             {
                 SetPropertyValue(t, p, reader[p.Name]);
             }
         }
         return t;
     }

     /// <summary>
     /// Assigns a DB value to a property: DBNull leaves the property at its
     /// default (Convert.ChangeType used to throw on it), and Nullable&lt;T&gt;
     /// targets are unwrapped first (ChangeType cannot convert to Nullable).
     /// </summary>
     private static void SetPropertyValue(object target, PropertyInfo p, object value)
     {
         if (value == null || value == DBNull.Value) return;
         Type targetType = Nullable.GetUnderlyingType(p.PropertyType) ?? p.PropertyType;
         p.SetValue(target, Convert.ChangeType(value, targetType));
     }
     #endregion Private
 }
posted @ 2020-12-30 11:49  不要摸我的腰  阅读(363)  评论(0编辑  收藏  举报