lucene3.6.0的文档评估机制
lucene的评分机制:早期版本中经Hits归一化后,所有hits的分数<=1.0;在3.6.0中Hits类已被移除,TopDocs返回的是未归一化的原始分数,可能大于1.0(例如下文示例输出中的4.0172114)。
每个document(d)的分数:
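$$\sum_{t \in q} \mathrm{tf}(t \text{ in } d) \cdot \mathrm{idf}(t) \cdot \mathrm{boost}(t.\mathrm{field} \text{ in } d) \cdot \mathrm{lengthNorm}(t.\mathrm{field} \text{ in } d)$$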
查询的得分:
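$$\mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \mathrm{queryNorm}(q) \cdot \sum_{t \in q} \mathrm{tf}(t \text{ in } d) \cdot \mathrm{idf}(t) \cdot \mathrm{boost}(t.\mathrm{field} \text{ in } d) \cdot \mathrm{lengthNorm}(t.\mathrm{field} \text{ in } d)$$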
tf(t in d):搜索项t在文档d中出现的频率(DefaultSimilarity中取出现次数的平方根)
idf(t):搜索项t的逆文档频率(inverse document frequency);包含t的文档越少,该值越大(DefaultSimilarity中为log(numDocs/(docFreq+1))+1)
boost(t.field in d):域的加权因子,在插入文档中设置
lengthNorm(t.field in d):域的长度标准化值,由该域中项的个数numTerms计算得到(DefaultSimilarity中为1/sqrt(numTerms)),域越短该值越大。通常在索引时计算该值并存储到索引文件中。
coord(q,d):协调因子(coordination factor),该因子的值基于文档中包含查询项的个数(DefaultSimilarity中为overlap/maxOverlap)
queryNorm(q):每个查询的标准化值,由各查询项权重的平方和sumOfSquaredWeights计算得到(DefaultSimilarity中为1/sqrt(sumOfSquaredWeights))
query对象的加权因子,查询时如果是多个子句,则可以通过加权某一个查询子句来加权某一个query对象。
DefaultSimilarity.java实现了默认的计分规则:
/** Expert: Default scoring implementation. */
public class DefaultSimilarity extends Similarity { /** Implemented as * <code>state.getBoost()*lengthNorm(numTerms)</code>, where * <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link * #setDiscountOverlaps} is false, else it's {@link * FieldInvertState#getLength()} - {@link * FieldInvertState#getNumOverlap()}. * * @lucene.experimental */ @Override public float computeNorm(String field, FieldInvertState state) { final int numTerms; if (discountOverlaps) numTerms = state.getLength() - state.getNumOverlap(); else numTerms = state.getLength(); return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))); } /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */ @Override public float queryNorm(float sumOfSquaredWeights) { return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); } /** Implemented as <code>sqrt(freq)</code>. */ @Override public float tf(float freq) { return (float)Math.sqrt(freq); } /** Implemented as <code>1 / (distance + 1)</code>. */ @Override public float sloppyFreq(int distance) { return 1.0f / (distance + 1); } /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ @Override public float idf(int docFreq, int numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } /** Implemented as <code>overlap / maxOverlap</code>. */ @Override public float coord(int overlap, int maxOverlap) { return overlap / (float)maxOverlap; } // Default true protected boolean discountOverlaps = true; /** Determines whether overlap tokens (Tokens with * 0 position increment) are ignored when computing * norm. By default this is true, meaning overlap * tokens do not count when computing norms. * * @lucene.experimental * * @see #computeNorm */ public void setDiscountOverlaps(boolean v) { discountOverlaps = v; } /** @see #setDiscountOverlaps */ public boolean getDiscountOverlaps() { return discountOverlaps; } }
IndexSearcher.java的explain方法返回的Explanation对象包含了所有评分因子中各个因子的详细信息。
测试程序和数据参考http://zhwj184.iteye.com/admin/blogs/1522709
import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class DocSearch { private static IndexSearcher isearcher = null; public static void search(String key) throws IOException, ParseException{ Directory directory = FSDirectory.open(new File("E:\\output\\lucence\\index")); // Now search the index: IndexReader ireader = IndexReader.open(directory); // read-only=true isearcher = new IndexSearcher(ireader); // Parse a simple query that searches for "text": Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,"context", analyzer); Query query = parser.parse(key); ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.getValues("id")[0] + "\t" + hitDoc.getValues("context")[0] + "\t" + hits[i].score); Explanation explan = isearcher.explain(query, hits[i].doc); System.out.println(explan); } } public static void main(String[] args) throws IOException, ParseException { search("旧水泥袋"); isearcher.close(); } }
输出结果:
只截取第一篇文档的评分信息
4801857 采购旧编织袋、旧水泥袋 4.0172114
4.0172114 = (MATCH) sum of:
  1.4140004 = (MATCH) weight(context:旧 in 1682), product of:
    0.54585564 = queryWeight(context:旧), product of:
      5.861472 = idf(docFreq=13, maxDocs=1809)
      0.09312603 = queryNorm
    2.5904293 = (MATCH) fieldWeight(context:旧 in 1682), product of:
      1.4142135 = tf(termFreq(context:旧)=2)
      5.861472 = idf(docFreq=13, maxDocs=1809)
      0.3125 = fieldNorm(field=context, doc=1682)
  0.60229266 = (MATCH) weight(context:水 in 1682), product of:
    0.42365694 = queryWeight(context:水), product of:
      4.549286 = idf(docFreq=51, maxDocs=1809)
      0.09312603 = queryNorm
    1.4216518 = (MATCH) fieldWeight(context:水 in 1682), product of:
      1.0 = tf(termFreq(context:水)=1)
      4.549286 = idf(docFreq=51, maxDocs=1809)
      0.3125 = fieldNorm(field=context, doc=1682)
  1.1562659 = (MATCH) weight(context:泥 in 1682), product of:
    0.58700174 = queryWeight(context:泥), product of:
      6.3033047 = idf(docFreq=8, maxDocs=1809)
      0.09312603 = queryNorm
    1.9697827 = (MATCH) fieldWeight(context:泥 in 1682), product of:
      1.0 = tf(termFreq(context:泥)=1)
      6.3033047 = idf(docFreq=8, maxDocs=1809)
      0.3125 = fieldNorm(field=context, doc=1682)
  0.84465253 = (MATCH) weight(context:袋 in 1682), product of:
    0.42188305 = queryWeight(context:袋), product of:
      4.5302377 = idf(docFreq=52, maxDocs=1809)
      0.09312603 = queryNorm
    2.0021012 = (MATCH) fieldWeight(context:袋 in 1682), product of:
      1.4142135 = tf(termFreq(context:袋)=2)
      4.5302377 = idf(docFreq=52, maxDocs=1809)
      0.3125 = fieldNorm(field=context, doc=1682)