- 1.总述 IndexSearcher全过程源码分析--->生成weight树
- IndexSearcher ---> search(createWeight(query), filter, n, sort)
- |--createWeight(query) |实际为生成Weight树
- |--return query.weight(this);
-
- 2.创建weight树总过程
- query.weight(this)
- |--Query query = searcher.rewrite(this); |重新解析Query,将Query生成为单个可供直接查询的Query
- |--Weight weight = query.createWeight(searcher);
- |--float sum = weight.sumOfSquaredWeights(); |计算sum分值
- |--float norm = getSimilarity(searcher).queryNorm(sum); |获取标准因子
- |--weight.normalize(norm); |标准化
- |--return weight; |返回weight权值树
-
- 3.重写Query对象,生成Query树
- IndexSearcher.rewrite(Query original) |重写Query对象,主要实现拆分
- |--for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query; rewrittenQuery = query.rewrite(reader)) |重写直至不能再拆分
- |--query = rewrittenQuery;
- |--eg1:BooleanQuery.rewrite(reader) 实现
- |--for (int i = 0 ; i < clauses.size(); i++)
- |--Query query = c.getQuery().rewrite(reader); |重写query对象,重复写的过程,最后都生成BooleanQuery对象
- |--clone.clauses.set(i, new BooleanClause(query, c.getOccur())); |合成新的BooleanQuery对象
- |--eg2:MultiTermQuery.rewrite(reader) 实现
- |--rewriteMethod.rewrite(reader, this);
- |--ConstantScoreFilterRewrite.rewrite(reader) |将所有Term当成一个Term处理
- |--Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query));
- |--result.setBoost(query.getBoost());
- |--ScoringBooleanQueryRewrite.rewrite(reader) |将Term逐一分离出来生成BooleanQuery,风险在于Term数超过BooleanQuery.getMaxClauseCount()(默认1024)时会抛出TooManyClauses异常
- |--ConstantScoreBooleanQueryRewrite
- |--result.add(tq, BooleanClause.Occur.SHOULD); |逐一分离Term,生成Boolean查询
-
- |--ConstantScoreAutoRewrite.rewrite(reader) |结合上述二者,自动选择,以term < 350 为界,进行选择
- |--FilteredTermEnum enumerator = query.getEnum(reader); |根据需要改变的query生成枚举器
- |--Term t = enumerator.term(); |此时含IO操作?
- |--Iterator it = pendingTerms.iterator();
- |--BooleanQuery bq = new BooleanQuery(true);
- |--while(it.hasNext()) |逐个循环,生成term
- |--TermQuery tq = new TermQuery((Term) it.next());
- |--bq.add(tq, BooleanClause.Occur.SHOULD);
- |--Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
- |--result.setBoost(query.getBoost()); |设置分值
- |--query.incTotalNumberOfTerms(pendingTerms.size()); |增加Term总数
- |--return query; |返回最终生成的Query树
-
- 4.不同的Query查询,重写后生成新的Query
- |--eg2.1:PrefixQuery.getEnum(reader)
- |--return new PrefixTermEnum(reader, prefix); |返回FilterTermEnum的子类
- |--setEnum(reader.terms(new Term(prefix.field(), prefix.text())));
- |--if (term != null && termCompare(term)) |比较前缀
- |--currentTerm = term;
- |--else next()
- |--if (actualEnum.next()) |取下一个term,判断其是否以prefix为前缀
- |-- Term term = actualEnum.term();
- |-- if (termCompare(term)) {
- |--currentTerm = term;
- |--eg2.2:FuzzyQuery.getEnum(reader)
- |--return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
- |--this.text = searchTerm.text().substring(realPrefixLength); |获取前缀及text文本内容
- |--this.prefix = searchTerm.text().substring(0, realPrefixLength);
- |--initializeMaxDistances(); |计算初始最大距离
- |--setEnum(reader.terms(new Term(searchTerm.field(), prefix))); |将枚举器定位到prefix处,并通过termCompare对首个term计算相似度
- |--termCompare(Term term)
- |--final String target = term.text().substring(prefix.length());
- |--this.similarity = similarity(target);
- |--return (similarity > minimumSimilarity);
-
-
- 5.weight.sumOfSquaredWeights() |--计算权重平方和
- |--BooleanWeight |计算后出现二种情况,Boolean及单个weight树,以BooleanWeight为准进行分析
- |--for (int i = 0 ; i < weights.size(); i++) |逐个对子Weight进行计算
- |--float s = w.sumOfSquaredWeights()
- |--sum += s;
- |--sum *= getBoost() * getBoost();
-
- |--TermWeight |以TermWeight为例
- |--queryWeight = idf * getBoost();
- |--return queryWeight * queryWeight; |求平方
-
-
- 6.float norm = getSimilarity(searcher).queryNorm(sum); |计算标准因子,默认为DefaultSimilarity
- |-- return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
-
-
- 7.weight.normalize(norm); |标准化norm因子,以BooleanWeight为例
- |--norm *= getBoost();
- |-- for (Iterator iter = weights.iterator(); iter.hasNext();)
- |--w.normalize(norm); |逐个标准化
-
- |--TermWeight.normalize(norm) |以TermWeight为例
- |--queryWeight *= queryNorm;
- |--value = queryWeight * idf
- |--实际值value = (idf * getBoost()) * queryNorm * idf; |即 idf^2 * boost * queryNorm
posted @
2012-03-15 09:18
桃花雪
阅读(
594)
评论()
编辑
收藏
举报
点击右上角即可分享
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律