Lucene.Net 按类别统计搜索结果数
今天群里有个朋友问"如何按类别统计搜索结果数?是不是要循环一个个类别去查询出总数啊?"
以Lucene.Net现在的API,只能这样做。当然这样做一般会带来性能问题,所以更好的解决方案就是改动库文件了。
注意:本文内容仅适用于Lucene.Net,以2.1版为例,其它版本可能会有出入,Java版本差别更大一些。
改动库先要有个思路。Lucene.Net的查询结果是一个Hits,而它有一个方法length可以得到结果总数。这个结果是一个精确值。这个值实际上是在TopDocCollector类的Collect方法中计算出来的。要改精算为估算,或者像本文这样按类别统计,都是在这里添加算法就可以了。
/// <summary>
/// Records one matching document: counts it and, when it qualifies,
/// inserts it into the hit queue.
/// </summary>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; a hit is ignored unless its score is greater than zero.</param>
public override void Collect(int doc, float score)
{
    // Negated "greater than" so that every score failing (score > 0.0f)
    // is skipped in exactly the same way as before.
    if (!(score > 0.0f))
    {
        return;
    }

    totalHits++;

    bool queueHasRoom = hq.Size() < numHits;
    if (queueHasRoom || score >= minScore)
    {
        hq.Insert(new ScoreDoc(doc, score));
        ScoreDoc head = (ScoreDoc) hq.Top();
        minScore = head.score; // maintain minScore
    }
}
这个方法中已经有了Document的id号,只要有办法拿到Document就能得到类别了。能拿到Document的类,IndexSearcher和IndexReader都可以。这里用IndexReader比较合算,因为IndexSearcher本身就包含IndexReader的。
// NOTE(review): this span has no method signature; it starts directly at the
// opening brace of a body that reads the names doc and score.
{
// Only hits with a score greater than zero are processed.
if (score > 0.0f)
{
totalHits++;
// Insert while the queue holds fewer than numHits entries, or when the score is at least minScore.
if (hq.Size() < numHits || score >= minScore)
{
hq.Insert(new ScoreDoc(doc, score));
minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore
}
}
}
Collect方法会在几个地方被用到,都是在Scorer一系的类中,比如TermScorer、BooleanScorer2等。如果通过给Collect增加参数来实现按分类统计,改动量可能会比较大,所以改为修改TopDocCollector的构造函数。
// Index reader handed in at construction time and kept on the collector.
private IndexReader reader;

/// <summary>
/// Creates a collector that uses a new <c>HitQueue</c> built from <paramref name="numHits"/>.
/// </summary>
/// <param name="numHits">Hit limit forwarded to the queue and stored on the collector.</param>
/// <param name="reader">Index reader stored on the collector.</param>
public TopDocCollector(int numHits, IndexReader reader)
    : this(numHits, new HitQueue(numHits), reader)
{
}

/// <summary>
/// Creates a collector around a caller-supplied priority queue.
/// </summary>
/// <param name="numHits">Hit limit stored on the collector.</param>
/// <param name="hq">Priority queue that will hold the collected hits.</param>
/// <param name="reader">Index reader stored on the collector.</param>
internal TopDocCollector(int numHits, PriorityQueue hq, IndexReader reader)
{
    this.reader = reader;
    this.hq = hq;
    this.numHits = numHits;
}
同时有两个调用构造函数的地方需要被修改。
/// <summary>
/// Public constructor: delegates to the internal overload with a freshly
/// created <c>HitQueue</c> of <paramref name="numHits"/>.
/// </summary>
/// <param name="numHits">Hit limit passed to the queue and to the internal overload.</param>
/// <param name="reader">Index reader passed through to the internal overload.</param>
public TopDocCollector(int numHits, IndexReader reader)
    : this(numHits, new HitQueue(numHits), reader)
{
}

/// <summary>
/// Internal constructor: stores the hit limit, the queue and the reader on the instance.
/// </summary>
/// <param name="numHits">Hit limit to store.</param>
/// <param name="hq">Priority queue to store.</param>
/// <param name="reader">Index reader to store.</param>
internal TopDocCollector(int numHits, PriorityQueue hq, IndexReader reader)
{
    this.hq = hq;
    this.reader = reader;
    this.numHits = numHits;
}
TopFieldDocCollector的构造函数:
/// <summary>
/// Creates a sorted-hit collector: builds a <c>FieldSortedHitQueue</c> from the reader,
/// the sort fields and the hit limit, and passes the reader on to the base constructor.
/// </summary>
/// <param name="reader">Index reader used for the queue and forwarded to the base class.</param>
/// <param name="sort">Sort whose <c>fields</c> are given to the queue.</param>
/// <param name="numHits">Hit limit for the queue and the base class.</param>
public TopFieldDocCollector(IndexReader reader, Sort sort, int numHits)
    : base(numHits, new FieldSortedHitQueue(reader, sort.fields, numHits), reader)
{
}
IndexSearcher的Search方法:
// NOTE(review): constructor-initializer fragment; the declaring signature line is not part of this span.
// It forwards numHits, a new FieldSortedHitQueue and the reader to the base constructor.
: base(numHits, new FieldSortedHitQueue(reader, sort.fields, numHits), reader) {
}
/// <summary>
/// Runs the search through a <c>TopDocCollector</c> constructed with this searcher's
/// reader and returns what the collector gathered.
/// </summary>
/// <param name="weight">Weight passed to the collecting search overload.</param>
/// <param name="filter">Filter passed to the collecting search overload.</param>
/// <param name="nDocs">Number of hits requested; must be greater than zero.</param>
/// <returns>The <c>TopDocs</c> produced by the collector.</returns>
/// <exception cref="System.ArgumentException">Thrown when <paramref name="nDocs"/> is zero or negative.</exception>
public override TopDocs Search(Weight weight, Filter filter, int nDocs)
{
    // Reject non-positive requests up front:
    // null might be returned from hq.top() below.
    if (nDocs <= 0)
    {
        throw new System.ArgumentException("nDocs must be > 0");
    }

    TopDocCollector hitCollector = new TopDocCollector(nDocs, this.reader);
    Search(weight, filter, hitCollector);
    return hitCollector.TopDocs();
}
现在TopDocCollector类就可以拿到分类了。
// NOTE(review): this span has no method signature; it starts at the opening brace
// of a body that reads nDocs, weight and filter.
{
// Non-positive hit counts are rejected with an ArgumentException.
if (nDocs <= 0)
// null might be returned from hq.top() below.
throw new System.ArgumentException("nDocs must be > 0");
// The collector receives the reader so it can look documents up while collecting.
TopDocCollector collector = new TopDocCollector(nDocs, this.reader);
Search(weight, filter, collector);
return collector.TopDocs();
}
/// <summary>
/// Records one matching document. Before counting the hit, it loads the document through
/// the reader and parses its "category" field as an integer.
/// </summary>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; a hit is ignored unless its score is greater than zero.</param>
public override void Collect(int doc, float score)
{
    // Negated "greater than" keeps the skip condition identical to (score > 0.0f).
    if (!(score > 0.0f))
    {
        return;
    }

    // The category lookup runs before the hit is counted.
    Document stored = reader.Document(doc);
    string rawCategory = stored.Get("category");
    int category = int.Parse(rawCategory);

    totalHits++;

    bool queueHasRoom = hq.Size() < numHits;
    if (queueHasRoom || score >= minScore)
    {
        hq.Insert(new ScoreDoc(doc, score));
        ScoreDoc head = (ScoreDoc) hq.Top();
        minScore = head.score; // maintain minScore
    }
}
最终这个统计的结果需要反映到Hits类去。返回结果和TopDocCollector的public virtual TopDocs TopDocs()方法有关。给TopDocs类增加一个字段:
// NOTE(review): this span has no method signature; it starts at the opening brace
// of a body that reads the names doc and score.
{
// Only hits with a score greater than zero are processed.
if (score > 0.0f)
{
// Load the document through the reader and parse its "category" field as an int.
Document d = reader.Document(doc);
int category = int.Parse(d.Get("category"));
totalHits++;
// Insert while the queue holds fewer than numHits entries, or when the score is at least minScore.
if (hq.Size() < numHits || score >= minScore)
{
hq.Insert(new ScoreDoc(doc, score));
minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore
}
}
}
/// <summary>
/// Per-category counts attached to the search result.
/// NOTE(review): presumably keyed by category id with the hit count as value — confirm against the code that fills it.
/// </summary>
public System.Collections.Generic.Dictionary<int, int> category_count;
Collect方法改成:
// Tally of collected hits per category id; updated by Collect.
private System.Collections.Generic.Dictionary<int, int> category_count = new System.Collections.Generic.Dictionary<int,int>();

/// <summary>
/// Records one matching document: updates the per-category tally, counts the hit and,
/// when it qualifies, inserts it into the hit queue.
/// </summary>
/// <remarks>
/// A document whose "category" field is missing or is not a valid integer is still counted
/// in <c>totalHits</c> and can still enter the queue; it is only left out of the category
/// tally, so one bad document does not abort the whole search.
/// </remarks>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; a hit is ignored unless its score is greater than zero.</param>
public override void Collect(int doc, float score)
{
    if (score > 0.0f)
    {
        // NOTE(review): this loads the document for every single hit, which may be
        // costly on large result sets — consider a cached per-document category lookup.
        Document d = reader.Document(doc);

        // TryParse returns false for null or malformed text instead of throwing;
        // the invariant culture keeps parsing independent of the machine's locale.
        int category;
        if (int.TryParse(d.Get("category"),
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out category))
        {
            // TryGetValue leaves 'seen' at 0 for a new category, so a single
            // lookup followed by one store covers both cases.
            int seen;
            category_count.TryGetValue(category, out seen);
            category_count[category] = seen + 1;
        }

        totalHits++;
        if (hq.Size() < numHits || score >= minScore)
        {
            hq.Insert(new ScoreDoc(doc, score));
            minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore
        }
    }
}
TopDocs方法改成
/// <summary>
/// Records one matching document: updates the per-category tally, counts the hit and,
/// when it qualifies, inserts it into the hit queue.
/// </summary>
/// <remarks>
/// A document whose "category" field is missing or is not a valid integer is still counted
/// in <c>totalHits</c> and can still enter the queue; it is only left out of the category
/// tally, so one bad document does not abort the whole search.
/// </remarks>
/// <param name="doc">Document number of the hit.</param>
/// <param name="score">Score of the hit; a hit is ignored unless its score is greater than zero.</param>
public override void Collect(int doc, float score)
{
    if (score > 0.0f)
    {
        // NOTE(review): this loads the document for every single hit, which may be
        // costly on large result sets — consider a cached per-document category lookup.
        Document d = reader.Document(doc);

        // TryParse returns false for null or malformed text instead of throwing;
        // the invariant culture keeps parsing independent of the machine's locale.
        int category;
        if (int.TryParse(d.Get("category"),
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out category))
        {
            // TryGetValue leaves 'seen' at 0 for a new category, so a single
            // lookup followed by one store covers both cases.
            int seen;
            category_count.TryGetValue(category, out seen);
            category_count[category] = seen + 1;
        }

        totalHits++;
        if (hq.Size() < numHits || score >= minScore)
        {
            hq.Insert(new ScoreDoc(doc, score));
            minScore = ((ScoreDoc) hq.Top()).score; // maintain minScore
        }
    }
}
/// <summary>
/// Empties the hit queue into an array and wraps it, together with the total hit count,
/// the maximum score and the per-category tally, in a <c>TopDocs</c>.
/// </summary>
/// <returns>
/// A <c>TopDocs</c> whose <c>category_count</c> refers to this collector's tally.
/// The maximum score is negative infinity when no hit was counted.
/// </returns>
public virtual TopDocs TopDocs()
{
    int queued = hq.Size();
    ScoreDoc[] ranked = new ScoreDoc[queued];

    // Fill from the last slot towards the first: each Pop() goes into the
    // highest slot that is still free.
    int slot = queued;
    while (slot > 0)
    {
        slot--;
        // put docs in array
        ranked[slot] = (ScoreDoc) hq.Pop();
    }

    float best;
    if (totalHits == 0)
    {
        best = System.Single.NegativeInfinity;
    }
    else
    {
        best = ranked[0].score;
    }

    TopDocs result = new TopDocs(totalHits, ranked, best);
    result.category_count = category_count;
    return result;
}
Hits类增加:
// NOTE(review): this span has no method signature; it starts at the opening brace of a body.
{
// One array slot per entry currently in the queue.
ScoreDoc[] scoreDocs = new ScoreDoc[hq.Size()];
// Fill from the last slot towards the first, one Pop() per slot.
for (int i = hq.Size() - 1; i >= 0; i--)
// put docs in array
scoreDocs[i] = (ScoreDoc) hq.Pop();
// Negative infinity when no hit was counted; otherwise the score of the first array element.
float maxScore = (totalHits == 0) ? System.Single.NegativeInfinity : scoreDocs[0].score;
TopDocs docs = new TopDocs(totalHits, scoreDocs, maxScore);
// Hand the per-category tally to the result object (same dictionary instance, not a copy).
docs.category_count = category_count;
return docs;
}
// Backing field for Category_Count.
private Dictionary<int, int> category_count;

/// <summary>
/// Gets the dictionary currently held in the <c>category_count</c> field.
/// </summary>
public Dictionary<int, int> Category_Count
{
    get { return category_count; }
}
同时修改:
/// <summary>
/// Read-only accessor that returns the <c>category_count</c> field as is.
/// </summary>
public Dictionary<int, int> Category_Count
{
    get { return category_count; }
}
/// <summary>
/// Re-runs the search for twice as many hits as are needed, refreshes the total hit count
/// and the per-category counts from the result, and appends the hits that are not cached yet.
/// </summary>
/// <param name="min">Lower bound on the number of hits wanted; raised to the cached count when that is larger.</param>
private void GetMoreDocs(int min)
{
    int wanted = System.Math.Max(min, hitDocs.Count);
    int fetchCount = wanted * 2; // double # retrieved

    // Use the sorted overload only when a sort was supplied.
    TopDocs batch;
    if (sort == null)
    {
        batch = searcher.Search(weight, filter, fetchCount);
    }
    else
    {
        batch = searcher.Search(weight, filter, fetchCount, sort);
    }

    category_count = batch.category_count;
    length = batch.totalHits;
    ScoreDoc[] ranked = batch.scoreDocs;

    // Scale scores down only when the best score exceeds 1.
    float normFactor = 1.0f;
    if (length > 0 && batch.GetMaxScore() > 1.0f)
    {
        normFactor = 1.0f / batch.GetMaxScore();
    }

    int limit = System.Math.Min(ranked.Length, length);

    // Start after the hits that are already cached.
    int index = hitDocs.Count;
    while (index < limit)
    {
        ScoreDoc hit = ranked[index];
        hitDocs.Add(new HitDoc(hit.score * normFactor, hit.doc));
        index++;
    }
}
// NOTE(review): this span has no method signature; it starts at the opening brace
// of a body that reads the name min.
{
// Never ask for fewer hits than are already cached.
if (hitDocs.Count > min)
{
min = hitDocs.Count;
}
int n = min * 2; // double # retrieved
// Use the sorted search overload only when a sort is set.
TopDocs topDocs = (sort == null) ? searcher.Search(weight, filter, n) : searcher.Search(weight, filter, n, sort);
// Take the per-category counts and the total hit count from the fresh result.
category_count = topDocs.category_count;
length = topDocs.totalHits;
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
float scoreNorm = 1.0f;
// Scale scores down only when the maximum score exceeds 1.
if (length > 0 && topDocs.GetMaxScore() > 1.0f)
{
scoreNorm = 1.0f / topDocs.GetMaxScore();
}
// Stop at whichever is smaller: the number of returned hits or the total hit count.
int end = scoreDocs.Length < length?scoreDocs.Length:length;
// Append only the hits that are not cached yet.
for (int i = hitDocs.Count; i < end; i++)
{
hitDocs.Add(new HitDoc(scoreDocs[i].score * scoreNorm, scoreDocs[i].doc));
}
}
至此就OK了。从结果中取的时候,比如ID为1的分类,则
hits.Category_Count[1]就出来了。