Highlight Display for Chinese Word Segmentation in dotLucene
1. What you need
Lucene.Net 1.4.3 has fewer features than the Java Lucene 1.4.3, so the lucene.net-1.9 build is required. Highlighter.Net is likewise used at its current latest version, 1.4.0, but it also lags the current Java release: it is missing TokenSources, the class that implements fast highlighting.
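The term-vector fast path in TokenSources below only works if the highlighted field was indexed with term vectors that carry positions and offsets. A minimal indexing sketch follows; the field name "contents" is illustrative, and StandardAnalyzer stands in for whatever Chinese analyzer you actually use:

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;

// Index a field with positions + offsets so the highlighter can rebuild
// the token stream straight from the term vector, without re-analysis.
IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
Document doc = new Document();
doc.Add(new Field("contents", "the text to be highlighted ...",
    Field.Store.YES,                           // stored, so the re-analysis fallback also works
    Field.Index.TOKENIZED,
    Field.TermVector.WITH_POSITIONS_OFFSETS)); // offsets are required; positions speed things up
writer.AddDocument(doc);
writer.Close();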
2. The code for TokenSources.cs
using System;
using IComparer = System.Collections.IComparer;
using ArrayList = System.Collections.ArrayList;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using IndexReader = Lucene.Net.Index.IndexReader;
using TermFreqVector = Lucene.Net.Index.TermFreqVector;
using TermPositionVector = Lucene.Net.Index.TermPositionVector;
using TermVectorOffsetInfo = Lucene.Net.Index.TermVectorOffsetInfo;
using Document = Lucene.Net.Documents.Document;
namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// TokenSources is used for fast highlighting; it is a must for Chinese word segmentation.
    /// </summary>
    public class TokenSources
    {
        /**
         * A convenience method that tries a number of approaches to getting a token stream.
         * The cost of finding there are no termVectors in the index is minimal (1000 invocations still
         * registers 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
         * @param reader
         * @param docId
         * @param field
         * @param analyzer
         * @return null if the field is not stored correctly
         * @throws IOException
         */
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            TokenStream ts = null;
            TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                if (tfv is TermPositionVector)
                {
                    ts = GetTokenStream((TermPositionVector) tfv);
                }
            }
            // No token info stored, so fall back to analyzing the raw content
            if (ts == null)
            {
                ts = GetTokenStream(reader, docId, field, analyzer);
            }
            return ts;
        }
        /**
         * Assumes the worst and makes no assumptions about token position sequences.
         */
        public static TokenStream GetTokenStream(TermPositionVector tpv)
        {
            return GetTokenStream(tpv, false);
        }
        /**
         * An object used to iterate across an array of tokens.
         */
        public class StoredTokenStream : TokenStream
        {
            Token[] tokens;
            int currentToken = 0;

            public StoredTokenStream(Token[] tokens)
            {
                this.tokens = tokens;
            }

            public override Token Next()
            {
                if (currentToken >= tokens.Length)
                {
                    return null;
                }
                return tokens[currentToken++];
            }
        }

        /**
         * Orders tokens by their start offset in the original text.
         */
        class CompareClass : IComparer
        {
            public Int32 Compare(Object o1, Object o2)
            {
                Token t1 = (Token) o1;
                Token t2 = (Token) o2;
                if (t1.StartOffset() > t2.StartOffset())
                    return 1;
                if (t1.StartOffset() < t2.StartOffset())
                    return -1;
                return 0;
            }
        }
        /**
         * Low level api.
         * Returns a token stream or null if no offset info is available in the index.
         * This can be used to feed the highlighter with a pre-parsed token stream.
         *
         * In my tests the speeds to recreate 1000 token streams using this method are:
         * - with TermVector offset-only data stored - 420 milliseconds
         * - with TermVector offset AND position data stored - 271 milliseconds
         *   (nb timings for TermVector with position data are based on a tokenizer with contiguous
         *   positions - no overlaps or gaps)
         * The cost of not using TermPositionVector to store pre-parsed content and using an
         * analyzer to re-parse the original content:
         * - reanalyzing the original content - 980 milliseconds
         *
         * The re-analyze timings will typically vary depending on -
         * 1) The complexity of the analyzer code (timings above were using a
         *    stemmer/lowercaser/stopword combo)
         * 2) The number of other fields (Lucene reads ALL fields off the disk
         *    when accessing just one document field - this can cost dear!)
         * 3) Use of compression on field storage - could be faster because of compression (less disk IO)
         *    or slower (more CPU burn) depending on the content.
         *
         * @param tpv
         * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or
         *        gaps. If looking to eke out the last drops of performance, set to true. If in doubt, set to false.
         */
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            // Code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();
            int totalTokens = 0;
            for (int t = 0; t < freq.Length; t++)
            {
                totalTokens += freq[t];
            }
            Token[] tokensInOriginalOrder = new Token[totalTokens];
            ArrayList unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }
                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    // Try to get the token position info to speed up assembly of tokens into a sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    // Tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new ArrayList();
                    }
                    for (int tp = 0; tp < offsets.Length; tp++)
                    {
                        unsortedTokens.Add(new Token(terms[t],
                            offsets[tp].GetStartOffset(),
                            offsets[tp].GetEndOffset()));
                    }
                }
                else
                {
                    // We have positions stored and a guarantee that the token position information is contiguous.
                    // This may be fast BUT won't work with tokenizers that create more than one token in the
                    // same position or that create jumps in the position numbers - this code would fail
                    // under those circumstances.
                    // Tokens stored with positions - can use this to index straight into the sorted array.
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        tokensInOriginalOrder[pos[tp]] = new Token(terms[t],
                            offsets[tp].GetStartOffset(),
                            offsets[tp].GetEndOffset());
                    }
                }
            }
            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
                System.Array.Sort(tokensInOriginalOrder, new CompareClass());
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }
        /**
         * Builds a token stream from the stored term vector of the given document field.
         * Throws if the field was not indexed with term position data.
         */
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field)
        {
            TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
            if (tfv == null)
            {
                throw new Exception(field + " in doc #" + docId
                    + " does not have any term vector data stored");
            }
            if (tfv is TermPositionVector)
            {
                TermPositionVector tpv = (TermPositionVector) tfv;
                return GetTokenStream(tpv);
            }
            throw new Exception(field + " in doc #" + docId
                + " does not have any term position data stored");
        }
        // Convenience method: re-analyzes the stored field content
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            Document doc = reader.Document(docId);
            String contents = doc.Get(field);
            if (contents == null)
            {
                throw new Exception("Field " + field + " in document #" + docId + " is not stored and cannot be analyzed");
            }
            return analyzer.TokenStream(field, new System.IO.StringReader(contents));
        }
    }
}
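Once TokenSources.cs is compiled into the highlighter assembly, a search-and-highlight round trip looks roughly like the sketch below. The index path, field name, query, and class name HighlightDemo are illustrative assumptions; GetAnyTokenStream takes the term-vector fast path when it is available and silently falls back to re-analysis otherwise.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Highlight; // namespace of the TokenSources port above; adjust to match your Highlighter.Net build

public class HighlightDemo
{
    public static void Main(String[] args)
    {
        IndexReader reader = IndexReader.Open("index");
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer(); // stand-in for your Chinese analyzer
        Query query = new TermQuery(new Term("contents", "lucene"));
        Hits hits = searcher.Search(query);

        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        for (int i = 0; i < hits.Length(); i++)
        {
            int docId = hits.Id(i);
            String text = hits.Doc(i).Get("contents");
            // With a term vector stored, the tokens are rebuilt without re-analysis.
            TokenStream ts = TokenSources.GetAnyTokenStream(reader, docId, "contents", analyzer);
            Console.WriteLine(highlighter.GetBestFragment(ts, text));
        }
        searcher.Close();
        reader.Close();
    }
}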
3. Additional work
Remove the word-boundary check in the highlight package:
tokenGroup.isDistinct(token)
Refer to the test program for how to adapt the calling code; a sketch of the change follows.
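The exact source line varies with the Highlighter.Net version, but the change described above amounts to dropping the IsDistinct() test in the Highlighter's fragmenting loop. A hedged sketch of the edit (not the verbatim library source):

// Inside the Highlighter's token loop, a new token group is normally
// started only when the next token does not overlap the current group:
//
//     if (tokenGroup.NumTokens > 0 && tokenGroup.IsDistinct(token))
//
// Overlapping tokens from a Chinese segmenter trip this boundary test,
// so the IsDistinct() half of the condition is removed:
if (tokenGroup.NumTokens > 0)
{
    // ... flush the current group and start the next one, as in the original code
}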
From http://www.lietu.com/doc/dotHighlighter.htm