Hack in Lucene.Net之为什么无法在搜索时统计分类下相关结果数或者实现Group By效果
这几天一直在思考:能否通过Hack in Lucene.Net,在搜索时统计分类下的相关结果数,或者实现Group By效果。答案是:如果依靠向IndexSearcher类注入代码,是不可能实现这个效果的。从大的方面——索引结构——来说更容易让人理解,就先从这里说起。
Lucene的索引结构是分块的,这个在很多地方有这样的资料。为了更加方便理解,这里建立一个简单的索引文件。
/// <summary>
/// Builds a small sample index under d:\index containing three documents,
/// each with an "ID", a "Title" and a "Category" field.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args) {
    // NOTE(review): the third argument (true) is presumably the "create" flag - confirm against the IndexWriter API.
    IndexWriter writer = new IndexWriter("d:\\index", new StandardAnalyzer(), true);
    try {
        // Document 1: the Title is explicitly not stored, is tokenized, and gets a boost of 0.0f.
        writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
            new IndexDocumentExtension.DocmentField("ID","1"),
            new IndexDocumentExtension.DocmentField("Title","test record first",Field.Store.NO,Field.Index.TOKENIZED,0.0f),
            new IndexDocumentExtension.DocmentField("Category","1")
        });
        // Documents 2 and 3 use the two-argument DocmentField constructor for every field.
        writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
            new IndexDocumentExtension.DocmentField("ID","2"),
            new IndexDocumentExtension.DocmentField("Title","test record second"),
            new IndexDocumentExtension.DocmentField("Category","1")
        });
        writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
            new IndexDocumentExtension.DocmentField("ID","3"),
            new IndexDocumentExtension.DocmentField("Title","test record third"),
            new IndexDocumentExtension.DocmentField("Category","2")
        });
    } finally {
        // Close the writer even when adding a document throws, so it is never left open.
        writer.Close();
    }
}
// Duplicate listing of the body of Main shown above (the method header is not repeated here).
// Opens an IndexWriter on d:\index with a StandardAnalyzer.
// NOTE(review): the third argument (true) is presumably the "create" flag - confirm against the IndexWriter API.
IndexWriter writer = new IndexWriter("d:\\index", new StandardAnalyzer(), true);
// Document 1: the Title is passed with Field.Store.NO, Field.Index.TOKENIZED and a boost of 0.0f.
writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
new IndexDocumentExtension.DocmentField("ID","1"),
new IndexDocumentExtension.DocmentField("Title","test record first",Field.Store.NO,Field.Index.TOKENIZED,0.0f),
new IndexDocumentExtension.DocmentField("Category","1")
});
// Document 2: every field uses the two-argument DocmentField constructor.
writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
new IndexDocumentExtension.DocmentField("ID","2"),
new IndexDocumentExtension.DocmentField("Title","test record second"),
new IndexDocumentExtension.DocmentField("Category","1")
});
// Document 3: same shape as document 2, but with Category "2".
writer.AddDocument(new IndexDocumentExtension.DocmentField[] {
new IndexDocumentExtension.DocmentField("ID","3"),
new IndexDocumentExtension.DocmentField("Title","test record third"),
new IndexDocumentExtension.DocmentField("Category","2")
});
// Close the writer once all three documents have been added.
writer.Close();
}
/// <summary>
/// Extension helpers for adding a document to an <see cref="IndexWriter"/>
/// from a flat array of field descriptions.
/// </summary>
public static class IndexDocumentExtension {
    /// <summary>
    /// Builds one <see cref="Document"/> from <paramref name="fields"/> and adds it to the writer.
    /// </summary>
    /// <param name="writer">The writer that receives the new document.</param>
    /// <param name="fields">Descriptions of the fields to put on the document, in order.</param>
    public static void AddDocument(this IndexWriter writer, DocmentField[] fields) {
        Document doc = new Document();
        foreach (DocmentField field in fields) {
            Field luceneField = new Field(field.Key, field.Value, field.Store, field.Index);
            // Apply the boost to this exact field instance. Looking the field up again with
            // doc.GetField(name) resolves by name only, so when two entries share a key the
            // boost would be applied to the first field of that name instead of this one.
            if (field.Boost != 1.0f) {
                luceneField.SetBoost(field.Boost);
            }
            doc.Add(luceneField);
        }
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Describes a single field to be added to a document: name, value,
    /// store/index options and boost.
    /// </summary>
    public class DocmentField {
        /// <summary>
        /// Creates a field description with Field.Store.YES, Field.Index.TOKENIZED and a boost of 1.0f.
        /// </summary>
        /// <param name="key">Field name.</param>
        /// <param name="value">Field value.</param>
        public DocmentField(string key, string value)
            : this(key, value, Field.Store.YES, Field.Index.TOKENIZED, 1.0f) {
        }

        /// <summary>
        /// Creates a field description with explicit store/index options and boost.
        /// </summary>
        /// <param name="key">Field name.</param>
        /// <param name="value">Field value.</param>
        /// <param name="store">Store option passed to the Field constructor.</param>
        /// <param name="index">Index option passed to the Field constructor.</param>
        /// <param name="boost">Boost for the field; a value of 1.0f is not applied explicitly.</param>
        public DocmentField(string key, string value, Field.Store store, Field.Index index, float boost) {
            this.Key = key;
            this.Value = value;
            this.Store = store;
            this.Index = index;
            this.Boost = boost;
        }

        /// <summary>Field name.</summary>
        public string Key { get; set; }

        /// <summary>Field value.</summary>
        public string Value { get; set; }

        /// <summary>Store option passed to the Field constructor.</summary>
        public Field.Store Store { get; set; }

        /// <summary>Index option passed to the Field constructor.</summary>
        public Field.Index Index { get; set; }

        /// <summary>Boost for the field.</summary>
        public float Boost { get; set; }
    }
}
/// <summary>
/// Builds one <see cref="Document"/> from <paramref name="fields"/> and adds it to the writer.
/// </summary>
/// <param name="writer">The writer that receives the new document.</param>
/// <param name="fields">Descriptions of the fields to put on the document, in order.</param>
public static void AddDocument(this IndexWriter writer, DocmentField[] fields) {
    Document doc = new Document();
    foreach (DocmentField field in fields) {
        Field luceneField = new Field(field.Key, field.Value, field.Store, field.Index);
        // Apply the boost to this exact field instance. Looking the field up again with
        // doc.GetField(name) resolves by name only, so when two entries share a key the
        // boost would be applied to the first field of that name instead of this one.
        if (field.Boost != 1.0f) {
            luceneField.SetBoost(field.Boost);
        }
        doc.Add(luceneField);
    }
    writer.AddDocument(doc);
}
/// <summary>
/// Describes a single field to be added to a document: name, value,
/// store/index options and boost.
/// </summary>
public class DocmentField {
    /// <summary>Field name.</summary>
    public string Key { get; set; }

    /// <summary>Field value.</summary>
    public string Value { get; set; }

    /// <summary>Store option for the field.</summary>
    public Field.Store Store { get; set; }

    /// <summary>Index option for the field.</summary>
    public Field.Index Index { get; set; }

    /// <summary>Boost for the field.</summary>
    public float Boost { get; set; }

    /// <summary>
    /// Creates a field description with Field.Store.YES, Field.Index.TOKENIZED and a boost of 1.0f.
    /// </summary>
    public DocmentField(string key, string value)
        : this(key, value, Field.Store.YES, Field.Index.TOKENIZED, 1.0f) {
    }

    /// <summary>
    /// Creates a field description with every option given explicitly.
    /// </summary>
    public DocmentField(string key, string value, Field.Store store, Field.Index index, float boost) {
        Key = key;
        Value = value;
        Store = store;
        Index = index;
        Boost = boost;
    }
}
}
由以上代码在d盘的index目录创建了一份索引,包含3个文档,每个文档有3个字段。除了第一份文档的Title字段不存储原始数据外,其余字段都存储,并且所有字段都进行了分词。用记事本打开cfs文件可以看到图1.1所示的乱码内容,图上作了简单的标注。PS:不会用PS,专门装了下FW,汗~~~
图1.1
图1.1只标了Title字段索引后的效果。但从这里就可以看出,在搜索过程中,用Analyzer对搜索文本进行分词得到Term后,与这里的项索引进行对比的时候是拿不到文档的。但是,要实现按分类统计结果数或者实现Group By的效果,需要知道文档其它字段的信息。因此,入侵内部循环完成相应的目的似乎不太可能了。使用下面的代码可以从逻辑结构上更加清晰地看到结果。调用的时候,使用printSegment("_u"); 注意,我的cfs文件的文件名是_u.cfs,你的可能会不一样。省略后缀,采用前面的部分。
/// <summary>
/// Dumps one segment of the index at d:\index to the console: the stored documents,
/// every term with its document frequency, positions and postings, the field infos
/// of "ID", "Category" and "Title", and the "Title" term-frequency vector of each document.
/// </summary>
/// <param name="segment">Segment name, i.e. the segment file name without its extension (for example "_u").</param>
public static void printSegment(String segment) {
    Lucene.Net.Store.Directory directory = FSDirectory.GetDirectory("d:\\index", false);
    try {
        // The document count (3) is hard-coded to match the sample index built by this article.
        SegmentReader segmentReader = SegmentReader.Get(new SegmentInfo(segment, 3, directory, true, true));
        try {
            // Walk document slots up to MaxDoc and skip deleted ones; NumDocs only counts
            // live documents, and asking for a deleted document fails.
            int maxDoc = segmentReader.MaxDoc();

            // display documents
            for (int i = 0; i < maxDoc; i++) {
                if (segmentReader.IsDeleted(i)) {
                    continue;
                }
                Console.WriteLine(segmentReader.Document(i).ToString());
            }

            // display terms, term positions and term docs
            TermEnum termEnum = segmentReader.Terms();
            try {
                while (termEnum.Next()) {
                    Console.WriteLine(termEnum.Term().ToString());
                    Console.WriteLine(" document.requency=" + termEnum.DocFreq());

                    // The row counter deliberately keeps running from the positions
                    // listing into the postings listing, as in the original output.
                    int row = 0;

                    TermPositions termPositions = segmentReader.TermPositions(termEnum.Term());
                    try {
                        while (termPositions.Next()) {
                            Console.WriteLine((row++) + "->" + TermPositionsToString(termPositions));
                        }
                    } finally {
                        termPositions.Close();
                    }

                    TermDocs termDocs = segmentReader.TermDocs(termEnum.Term());
                    try {
                        while (termDocs.Next()) {
                            Console.WriteLine((row++) + "->" + TermDocsToString(termDocs));
                        }
                    } finally {
                        termDocs.Close();
                    }
                }
            } finally {
                termEnum.Close();
            }

            // display field info
            FieldInfos fieldInfos = segmentReader.FieldInfos();
            FieldInfo pathFieldInfo = fieldInfos.FieldInfo("ID");
            FieldInfo modifiedFieldInfo = fieldInfos.FieldInfo("Category");
            FieldInfo contentsFieldInfo = fieldInfos.FieldInfo("Title");
            Console.WriteLine(pathFieldInfo);
            Console.WriteLine(modifiedFieldInfo);
            Console.WriteLine(contentsFieldInfo);

            // display the "Title" TermFreqVector of every live document
            for (int i = 0; i < maxDoc; i++) {
                if (segmentReader.IsDeleted(i)) {
                    continue;
                }
                TermFreqVector termFreqVector = segmentReader.GetTermFreqVector(i, "Title");
                Console.WriteLine(termFreqVector);
            }
        } finally {
            segmentReader.Close();
        }
    } finally {
        directory.Close();
    }
}
/// <summary>
/// Formats the current entry of a <see cref="TermPositions"/> cursor as
/// document number, term frequency and a comma-separated list of positions.
/// </summary>
/// <param name="termPositions">Cursor already advanced to the entry to format; its positions are consumed by this call.</param>
/// <returns>The formatted line; the position list is empty when the frequency is 0.</returns>
public static string TermPositionsToString(TermPositions termPositions) {
    int freq = termPositions.Freq();
    string[] positions = new string[freq];
    for (int i = 0; i < freq; i++) {
        positions[i] = termPositions.NextPosition().ToString();
    }
    // string.Join yields "" for zero positions; trimming a trailing comma off an
    // empty string (the previous approach) threw ArgumentOutOfRangeException.
    string pos = string.Join(",", positions);
    return string.Format("<doc,TermFrequency,Pos>:< doc={0}, TermFrequency={1} Pos={2}>", termPositions.Doc(), freq, pos);
}
/// <summary>
/// Formats the current entry of a <see cref="TermDocs"/> cursor as
/// a document number / frequency pair.
/// </summary>
/// <param name="doc">Cursor already advanced to the entry to format.</param>
/// <returns>The formatted pair.</returns>
public static string TermDocsToString(TermDocs doc) {
    int docNumber = doc.Doc();
    int frequency = doc.Freq();
    const string template = "<docNumber,freq>=<{0},{1}>";
    return string.Format(template, docNumber, frequency);
}
// Duplicate listing of the body of printSegment shown above (the method header is not repeated here).
// Opens the index directory at d:\index and a SegmentReader for the segment named by `segment`.
Lucene.Net.Store.Directory directory = FSDirectory.GetDirectory("d:\\index", false);
SegmentReader segmentReader = SegmentReader.Get(new SegmentInfo(segment, 3, directory,true,true)); //new SegmentInfo(segment, 1, directory)
// Display every document of the segment.
for (int i = 0; i < segmentReader.NumDocs(); i++)
Console.WriteLine(segmentReader.Document(i).ToString());
TermEnum termEnum = segmentReader.Terms();// author's note: the concrete type here is actually SegmentTermEnum
// Display each term, followed by its term positions and its term docs.
while (termEnum.Next()) {
Console.WriteLine(termEnum.Term().ToString());
Console.WriteLine(" document.requency=" + termEnum.DocFreq());
TermPositions termPositions = segmentReader.TermPositions(termEnum.Term());
// The counter i is shared by the two loops below, so numbering continues from the positions into the term docs.
int i = 0;
while (termPositions.Next()) {
Console.WriteLine((i++) + "->" + TermPositionsToString(termPositions));
}
TermDocs termDocs = segmentReader.TermDocs(termEnum.Term());// author's note: the concrete type is actually segmentDocs
while (termDocs.Next()) {
Console.WriteLine((i++) + "->" + TermDocsToString(termDocs));
}
}
// Display the field info of the "ID", "Category" and "Title" fields.
FieldInfos fieldInfos = segmentReader.FieldInfos();
FieldInfo pathFieldInfo = fieldInfos.FieldInfo("ID");
FieldInfo modifiedFieldInfo = fieldInfos.FieldInfo("Category");
FieldInfo contentsFieldInfo = fieldInfos.FieldInfo("Title");
Console.WriteLine(pathFieldInfo);
Console.WriteLine(modifiedFieldInfo);
Console.WriteLine(contentsFieldInfo);
// Display the "Title" TermFreqVector of every document.
for (int i = 0; i < segmentReader.NumDocs(); i++) {
// author's note: the terms produced by tokenizing the contents are stored in the TermFreqVector
TermFreqVector termFreqVector = segmentReader.GetTermFreqVector(i, "Title");
Console.WriteLine(termFreqVector);
}
}
/// <summary>
/// Formats the current entry of a <see cref="TermPositions"/> cursor as
/// document number, term frequency and a comma-separated list of positions.
/// </summary>
/// <param name="termPositions">Cursor already advanced to the entry to format; its positions are consumed by this call.</param>
/// <returns>The formatted line; the position list is empty when the frequency is 0.</returns>
public static string TermPositionsToString(TermPositions termPositions) {
    int freq = termPositions.Freq();
    string[] positions = new string[freq];
    for (int i = 0; i < freq; i++) {
        positions[i] = termPositions.NextPosition().ToString();
    }
    // string.Join yields "" for zero positions; trimming a trailing comma off an
    // empty string (the previous approach) threw ArgumentOutOfRangeException.
    string pos = string.Join(",", positions);
    return string.Format("<doc,TermFrequency,Pos>:< doc={0}, TermFrequency={1} Pos={2}>", termPositions.Doc(), freq, pos);
}
/// <summary>
/// Formats the current entry of a <see cref="TermDocs"/> cursor as
/// a document number / frequency pair.
/// </summary>
/// <param name="doc">Cursor already advanced to the entry to format.</param>
/// <returns>The formatted pair.</returns>
public static string TermDocsToString(TermDocs doc) {
    int docNumber = doc.Doc();
    int frequency = doc.Freq();
    const string template = "<docNumber,freq>=<{0},{1}>";
    return string.Format(template, docNumber, frequency);
}
这里截取了一个输出图。
图1.2
另外从代码上看,也不太可能实现。完成索引的对比最终会在Term类的CompareTo方法完成。不详细介绍,给出调用关系的堆栈信息,可以看出大概。
Lucene.Net.Index.Term.CompareTo(Lucene.Net.Index.Term) 行 122
Lucene.Net.Index.TermInfosReader.GetIndexOffset(Lucene.Net.Index.Term) 行 202
Lucene.Net.Index.TermInfosReader.Get(Lucene.Net.Index.Term) 行 236
Lucene.Net.Index.SegmentReader.DocFreq(Lucene.Net.Index.Term) 行 792
Lucene.Net.Search.IndexSearcher.DocFreq(Lucene.Net.Index.Term) 行 126
Lucene.Net.Search.Similarity.Idf(Lucene.Net.Index.Term, Lucene.Net.Search.Searcher) 行 484
Lucene.Net.Search.TermQuery.TermWeight.TermWeight(Lucene.Net.Search.TermQuery, Lucene.Net.Search.Searcher) 行 62
Lucene.Net.Search.TermQuery.CreateWeight(Lucene.Net.Search.Searcher) 行 173
Lucene.Net.Search.Query.Weight(Lucene.Net.Search.Searcher) 行 102
Lucene.Net.Search.Hits.Hits(Lucene.Net.Search.Searcher, Lucene.Net.Search.Query, Lucene.Net.Search.Filter) 行 63
Lucene.Net.Search.Searcher.Search(Lucene.Net.Search.Query, Lucene.Net.Search.Filter) 行 57
Lucene.Net.Search.Searcher.Search(Lucene.Net.Search.Query) 行 48
TestLucene.Program.Main(string[]) 行 40
[外部代码]
Lucene.Net.Index.TermInfosReader.GetIndexOffset(Lucene.Net.Index.Term) 行 202
Lucene.Net.Index.TermInfosReader.Get(Lucene.Net.Index.Term) 行 236
Lucene.Net.Index.SegmentReader.DocFreq(Lucene.Net.Index.Term) 行 792
Lucene.Net.Search.IndexSearcher.DocFreq(Lucene.Net.Index.Term) 行 126
Lucene.Net.Search.Similarity.Idf(Lucene.Net.Index.Term, Lucene.Net.Search.Searcher) 行 484
Lucene.Net.Search.TermQuery.TermWeight.TermWeight(Lucene.Net.Search.TermQuery, Lucene.Net.Search.Searcher) 行 62
Lucene.Net.Search.TermQuery.CreateWeight(Lucene.Net.Search.Searcher) 行 173
Lucene.Net.Search.Query.Weight(Lucene.Net.Search.Searcher) 行 102
Lucene.Net.Search.Hits.Hits(Lucene.Net.Search.Searcher, Lucene.Net.Search.Query, Lucene.Net.Search.Filter) 行 63
Lucene.Net.Search.Searcher.Search(Lucene.Net.Search.Query, Lucene.Net.Search.Filter) 行 57
Lucene.Net.Search.Searcher.Search(Lucene.Net.Search.Query) 行 48
TestLucene.Program.Main(string[]) 行 40
[外部代码]
在写这篇文章的时候,有了另外一个思路,但是比较复杂,还没考虑好,要是可以成功就可以比较好地解决这个问题了。目前能实现这两种效果的办法还是要在TopDocCollector类的Collect处注入代码。