Highlight display for Chinese word segmentation in dotLucene

1. Required components

    The 1.4.3 release of Lucene.Net has fewer features than the Java Lucene 1.4.3, so the Lucene.Net 1.9 build is needed. For Highlighter.Net we also use the current latest version, 1.4.0, but it too has fewer features than the current Java version: it is missing TokenSources, the class that makes fast highlighting possible.
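
Note that TokenSources can only take its fast path when the field was indexed with a term vector that stores positions and offsets; otherwise its GetAnyTokenStream method falls back to re-analyzing the stored text. Below is a minimal indexing sketch against the Lucene.Net 1.9 API; the "index" path and "contents" field name are illustrative, and SimpleAnalyzer stands in for your Chinese analyzer.

using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;

public class IndexWithTermVectors
{
    public static void Main()
    {
        // SimpleAnalyzer is a stand-in; substitute your Chinese analyzer.
        IndexWriter writer = new IndexWriter("index", new SimpleAnalyzer(), true);
        Document doc = new Document();
        // Store the field AND a term vector with positions and offsets, so the
        // highlighter can rebuild the token stream without re-analyzing the text.
        doc.Add(new Field("contents", "这是待索引的中文正文",
            Field.Store.YES, Field.Index.TOKENIZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        writer.AddDocument(doc);
        writer.Close();
    }
}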

 

2. The code for TokenSources.cs

 

using System;

using IComparer = System.Collections.IComparer;
using ArrayList = System.Collections.ArrayList;

using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using IndexReader = Lucene.Net.Index.IndexReader;
using TermFreqVector = Lucene.Net.Index.TermFreqVector;
using TermPositionVector = Lucene.Net.Index.TermPositionVector;
using TermVectorOffsetInfo = Lucene.Net.Index.TermVectorOffsetInfo;
using Document = Lucene.Net.Documents.Document;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// TokenSources is used for fast highlighting; it is a must for Chinese word segmentation.
    /// </summary>
    public class TokenSources
    {
        /// <summary>
        /// A convenience method that tries a number of approaches to getting a token stream.
        /// The cost of finding there are no term vectors in the index is minimal (1000 invocations
        /// still register 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
        /// Returns null if the field is not stored correctly.
        /// </summary>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            TokenStream ts = null;

            TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                if (tfv is TermPositionVector)
                {
                    ts = GetTokenStream((TermPositionVector) tfv);
                }
            }
            // No token info stored, so fall back to analyzing raw content
            if (ts == null)
            {
                ts = GetTokenStream(reader, docId, field, analyzer);
            }
            return ts;
        }

        /// <summary>
        /// Assumes the worst and makes no assumptions about token position sequences.
        /// </summary>
        public static TokenStream GetTokenStream(TermPositionVector tpv)
        {
            return GetTokenStream(tpv, false);
        }

        /// <summary>
        /// An object used to iterate across an array of tokens.
        /// </summary>
        public class StoredTokenStream : TokenStream
        {
            Token[] tokens;
            int currentToken = 0;

            public StoredTokenStream(Token[] tokens)
            {
                this.tokens = tokens;
            }

            public override Token Next()
            {
                if (currentToken >= tokens.Length)
                {
                    return null;
                }
                return tokens[currentToken++];
            }
        }

        /// <summary>
        /// Orders tokens by their start offset in the original text.
        /// </summary>
        class CompareClass : IComparer
        {
            public Int32 Compare(Object o1, Object o2)
            {
                Token t1 = (Token) o1;
                Token t2 = (Token) o2;
                if (t1.StartOffset() > t2.StartOffset())
                    return 1;
                if (t1.StartOffset() < t2.StartOffset())
                    return -1;
                return 0;
            }
        }

        /// <summary>
        /// Low-level API. Returns a token stream, or null if no offset info is available in the index.
        /// This can be used to feed the highlighter with a pre-parsed token stream.
        ///
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset-only data stored: 420 ms
        /// - with TermVector offset AND position data stored: 271 ms
        ///   (nb: timings for TermVector with position data are based on a tokenizer with contiguous
        ///   positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store pre-parsed content, and instead
        /// using an analyzer to re-parse the original content:
        /// - re-analyzing the original content: 980 ms
        ///
        /// The re-analyze timings will typically vary depending on:
        /// 1) The complexity of the analyzer code (the timings above used a
        ///    stemmer/lowercaser/stopword combo)
        /// 2) The number of other fields (Lucene reads ALL fields off the disk
        ///    when accessing just one document field - this can cost dear!)
        /// 3) Use of compression on field storage - could be faster because of compression
        ///    (less disk IO) or slower (more CPU burn) depending on the content.
        ///
        /// Set tokenPositionsGuaranteedContiguous to true if the token position numbers have
        /// no overlaps or gaps. If looking to eke out the last drops of performance, set it to
        /// true. If in doubt, set it to false.
        /// </summary>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            // Code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();
            int totalTokens = 0;
            for (int t = 0; t < freq.Length; t++)
            {
                totalTokens += freq[t];
            }
            Token[] tokensInOriginalOrder = new Token[totalTokens];
            ArrayList unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    // Try to get the token position info to speed up assembly of tokens into a sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    // Tokens NOT stored with positions, or not guaranteed contiguous:
                    // must add them to a list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new ArrayList();
                    }
                    for (int tp = 0; tp < offsets.Length; tp++)
                    {
                        unsortedTokens.Add(new Token(terms[t],
                            offsets[tp].GetStartOffset(),
                            offsets[tp].GetEndOffset()));
                    }
                }
                else
                {
                    // We have positions stored, and a guarantee that the token position
                    // information is contiguous.

                    // This may be fast BUT won't work with tokenizers that create more than one
                    // token in the same position, or that create jumps in position numbers -
                    // this code would fail under those circumstances.

                    // Tokens stored with positions - we can use the position to index
                    // straight into the sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        tokensInOriginalOrder[pos[tp]] = new Token(terms[t],
                            offsets[tp].GetStartOffset(),
                            offsets[tp].GetEndOffset());
                    }
                }
            }
            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
                System.Array.Sort(tokensInOriginalOrder, new CompareClass());
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }

        /// <summary>
        /// Rebuilds a token stream from the stored term vector; throws if no term position data is stored.
        /// </summary>
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field)
        {
            TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
            if (tfv == null)
            {
                throw new Exception(field + " in doc #" + docId
                    + " does not have any term position data stored");
            }
            if (tfv is TermPositionVector)
            {
                TermPositionVector tpv = (TermPositionVector) reader.GetTermFreqVector(docId, field);
                return GetTokenStream(tpv);
            }
            throw new Exception(field + " in doc #" + docId
                + " does not have any term position data stored");
        }

        /// <summary>
        /// Convenience method: re-analyzes the stored field content.
        /// </summary>
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            Document doc = reader.Document(docId);
            String contents = doc.Get(field);
            if (contents == null)
            {
                throw new Exception("Field " + field + " in document #" + docId + " is not stored and cannot be analyzed");
            }
            return analyzer.TokenStream(field, new System.IO.StringReader(contents));
        }
    }
}
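
With TokenSources compiled into the highlighter project, highlighting a hit becomes a matter of handing the reconstructed token stream to the highlighter instead of re-analyzing every document. Below is a minimal usage sketch, assuming the Highlighter.Net 1.4.0 port keeps the Java Highlighter 1.4 API with PascalCase names; the "index" path, the "contents" field, and SimpleAnalyzer are again illustrative.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Search.Highlight;

public class HighlightDemo
{
    public static void Main()
    {
        Analyzer analyzer = new SimpleAnalyzer();   // stand-in for a Chinese analyzer
        IndexReader reader = IndexReader.Open("index");
        IndexSearcher searcher = new IndexSearcher(reader);
        Query query = new QueryParser("contents", analyzer).Parse("中文");

        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        Hits hits = searcher.Search(query);
        for (int i = 0; i < hits.Length(); i++)
        {
            int docId = hits.Id(i);
            String text = hits.Doc(i).Get("contents");
            // Fast path when a TermPositionVector was stored; otherwise this
            // falls back to re-analyzing the stored text.
            TokenStream ts = TokenSources.GetAnyTokenStream(reader, docId, "contents", analyzer);
            Console.WriteLine(highlighter.GetBestFragments(ts, text, 2, "..."));
        }
        searcher.Close();
        reader.Close();
    }
}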

 

3. Additional work

    Remove the word-boundary check in the highlight package, i.e. the call to:

tokenGroup.isDistinct(token)

    For the changes needed in the test program, refer to the source below.
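
Why this check has to go: overlapping Chinese segmentation produces consecutive tokens that share characters, so a token's start offset usually falls before the end offset of the group in front of it, and the boundary test keeps lumping separate hits into one group. The sketch below is self-contained and only mirrors what TokenGroup.isDistinct does in the Java 1.4 highlighter; the exact Highlighter.Net 1.4.0 code may differ slightly.

using System;

public class DistinctCheckDemo
{
    // The highlighter treats a token as "distinct" when it starts at or
    // after the previous token group's end offset.
    static bool IsDistinct(int tokenStart, int previousGroupEnd)
    {
        return tokenStart >= previousGroupEnd;
    }

    public static void Main()
    {
        // Bigram segmentation of "中文分词" yields overlapping tokens:
        // "中文" (0,2), "文分" (1,3), "分词" (2,4).
        int[,] offsets = { { 0, 2 }, { 1, 3 }, { 2, 4 } };
        int previousEnd = offsets[0, 1];   // end offset after accepting "中文"
        for (int i = 1; i < 3; i++)
        {
            // Each following token starts BEFORE the previous one ends, so the
            // check never passes and the highlight markup is placed wrongly.
            Console.WriteLine("token {0}: distinct = {1}",
                i, IsDistinct(offsets[i, 0], previousEnd));
            previousEnd = Math.Max(previousEnd, offsets[i, 1]);
        }
    }
}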
From: http://www.lietu.com/doc/dotHighlighter.htm
