阅读下面文章之前,建议先阅读随风的DotLucene源码浅读笔记(1) : Lucene.Net.Analysis 了解Lucene的Analyzer
由于lucene中自带的几个Analyzer不能满足业务需求,要自定义Analyzer所以参考lucene中自带的几个Analyzer的实现。
在参考的过程中,发现KeywordAnalyzer可以简化。
1.修改keywordAnalyzer
Analysis\KeywordTokenizer.cs中分词实现
在分词的过程中,判断了词的大小,当要使用KeywordAnalyzer作为词的分析器时,只能含有256个字符。而一般的使用过程中很少会把大于256的词用KeywordAnalyzer去分词。因此可以把判断去掉(注意:要分的词不能大于256个字符)下面是简化后的代码:
2.自定义Analyzer
在项目实践中,需要对IT商品名进行搜索。我们知道IT商品名比较复杂,商品中含很多型号。
如笔记本:BenQ Joybook R23E (103),HP Pavilion dv1617
用户搜索时基本上不会把型号输完整,很有可能输入r23,dv等关键字进行搜索(限定搜索词必须1个字符以上,否则搜索关键字基本没含义)。Lucene自带的analyzer显然不能满足需求,必须自定义analyzer。
自定义analyzer一般都会
1.定义分(切)词规则(实现Tokenizer)
2.定义词的过滤规则(实现TokenFilter)
分词规则确定:
首先分析taobao等大型电子商务网站对商品搜索结果。taobao基本上无论输入什么词都能搜索出结果,其规则是只要商品名含有输入的字符就显示出来。比如输入“件硬”搜索出的商品只是含有此关键字。基本可以判断出是单字分词,没有进行语义分析。
对于商品名的复杂性和用户输入的不确定性,这样的规则是非常符合商品搜索的。
因此在实现商品搜索的过程中,我也采用了这样的规则。
规则:进行单字分词,字母和数字都作为单个字符处理,其它字符则被过滤。
如BenQ Joybook R23E (103)分词后效果:b e n q j o y b o o k r 2 3 e 1 0 3
实现代码:
实际使用下来,搜索效果理想
由于lucene中自带的几个Analyzer不能满足业务需求,要自定义Analyzer所以参考lucene中自带的几个Analyzer的实现。
在参考的过程中,发现KeywordAnalyzer可以简化。
1.修改keywordAnalyzer
Analysis\KeywordTokenizer.cs中分词实现
public override Token Next()
{
    // Emit the entire input as a single token, exactly once; afterwards
    // signal end-of-stream with null.
    if (done)
        return null;
    done = true;

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    int read;
    // Drain the reader through the reusable char[] field buffer.
    while ((read = input.Read((System.Char[]) this.buffer, 0, this.buffer.Length)) > 0)
    {
        sb.Append(this.buffer, 0, read);
    }
    System.String text = sb.ToString();
    return new Token(text, 0, text.Length);
}
private const int DEFAULT_BUFFER_SIZE = 256;{
if (!done)
{
done = true;
System.Text.StringBuilder buffer = new System.Text.StringBuilder();
int length;
while (true)
{
length = input.Read((System.Char[]) this.buffer, 0, this.buffer.Length);
if (length <= 0)
break;
buffer.Append(this.buffer, 0, length);
}
System.String text = buffer.ToString();
return new Token(text, 0, text.Length);
}
return null;
}
在分词的过程中,判断了词的大小,当要使用KeywordAnalyzer作为词的分析器时,只能含有256个字符。而一般的使用过程中很少会把大于256的词用KeywordAnalyzer去分词。因此可以把判断去掉(注意:要分的词不能大于256个字符)下面是简化后的代码:
/// <summary>
/// Simple KeywordAnalyzer implementation (assumes keywords stay within 255 characters)
/// <remark>Custom KeywordAnalyzer</remark>
/// </summary>
public class SimpleKeywordAnalyzer : Analyzer
{
/// <summary>
/// Creates a TokenStream that yields the entire reader content as a single token.
/// </summary>
public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
{
return new CustomCharTokenizer(reader);
}
public class CustomCharTokenizer : CharTokenizer // CharTokenizer is the framework base class for simple character-based tokenizing
{
public CustomCharTokenizer(System.IO.TextReader reader)
: base(reader)
{
}
// Treat every character as a token character, so the input is never split.
protected internal override bool IsTokenChar(char c)
{
return true;
}
}
}
/// <summary>
/// KeywordAnalyzer的简单实现(确定keywords在 255字符以内)
/// <remark>自定义KeywordAnalyzer</remark>
/// </summary>
public class SimpleKeywordAnalyzer : Analyzer
{
/// <summary>
/// Creates a TokenStream that yields the entire reader content as a single token.
/// </summary>
public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
{
return new CustomCharTokenizer(reader);
}
public class CustomCharTokenizer : CharTokenizer // CharTokenizer is the framework base class for simple character-based tokenizing
{
public CustomCharTokenizer(System.IO.TextReader reader)
: base(reader)
{
}
// Treat every character as a token character, so the input is never split.
protected internal override bool IsTokenChar(char c)
{
return true;
}
}
}
2.自定义Analyzer
在项目实践中,需要对IT商品名进行搜索。我们知道IT商品名比较复杂,商品中含很多型号。
如笔记本:BenQ Joybook R23E (103),HP Pavilion dv1617
用户搜索时基本上不会把型号输完整,很有可能输入r23,dv等关键字进行搜索(限定搜索词必须1个字符以上,否则搜索关键字基本没含义)。Lucene自带的analyzer显然不能满足需求,必须自定义analyzer。
自定义analyzer一般都会
1.定义分(切)词规则(实现Tokenizer)
2.定义词的过滤规则(实现TokenFilter)
分词规则确定:
首先分析taobao等大型电子商务网站对商品搜索结果。taobao基本上无论输入什么词都能搜索出结果,其规则是只要商品名含有输入的字符就显示出来。比如输入“件硬”搜索出的商品只是含有此关键字。基本可以判断出是单字分词,没有进行语义分析。
对于商品名的复杂性和用户输入的不确定性,这样的规则是非常符合商品搜索的。
因此在实现商品搜索的过程中,我也采用了这样的规则。
规则:进行单字分词,字母和数字都作为单个字符处理,其它字符则被过滤。
如BenQ Joybook R23E (103)分词后效果:b e n q j o y b o o k r 2 3 e 1 0 3
实现代码:
/// <summary>
/// Title: ProductAnalyzer
/// Description:
/// Analyzer for product names: chains a ProductTokenizer (per-character
/// tokenizing) into a ProductFilter (stop-word / category filtering).
/// Copyright: Copyright (c) 2006.07.19
/// @author try
/// </summary>
public class ProductAnalyzer : Analyzer
{
    public ProductAnalyzer()
    {
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided Reader.
    /// </summary>
    /// <returns>A TokenStream built from a ProductTokenizer filtered with ProductFilter.</returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Tokenize one character at a time, then drop stop words and
        // tokens outside the letter/digit categories.
        return new ProductFilter(new ProductTokenizer(reader));
    }
}
/// <summary>
/// Title: ProductAnalyzer
/// Description:
/// Subclass of org.apache.lucene.analysis.Analyzer
/// build from a ProductTokenizer, filtered with ProductFilter.
/// Copyright: Copyright (c) 2006.07.19
/// @author try
/// </summary>
public class ProductAnalyzer : Analyzer
{
    public ProductAnalyzer()
    {
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided Reader.
    /// </summary>
    /// <returns>A TokenStream built from a ProductTokenizer filtered with ProductFilter.</returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Per-character tokenizer feeding the stop-word/category filter.
        return new ProductFilter(new ProductTokenizer(reader));
    }
}
/// <summary>
/// Tokenizer for product names: ASCII letters and decimal digits are emitted
/// as single-character, lower-cased tokens; OtherLetter (e.g. ideographic)
/// characters are also emitted one per token; every other character is
/// discarded.
/// </summary>
public sealed class ProductTokenizer : Tokenizer
{
public ProductTokenizer(TextReader _in)
{
input = _in;
}
// offset = absolute character position in the input;
// bufferIndex/dataLen = read cursor and fill level of ioBuffer.
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static int MAX_WORD_LEN = 255;
private static int IO_BUFFER_SIZE = 1024;
// buffer accumulates the token being built; ioBuffer is the raw read buffer.
private char[] buffer = new char[MAX_WORD_LEN];
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
// length/start of the token currently being accumulated.
private int length;
private int start;
// Append one lower-cased character to the current token buffer.
private void Push(char c)
{
if (length == 0) start = offset-1; // start of token
buffer[length++] = Char.ToLower(c); // buffer it
}
// Turn the accumulated characters into a Token; null when nothing is buffered.
private Token Flush()
{
if (length > 0)
{
return new Token(new String(buffer, 0, length), start, start+length);
}
else
return null;
}
/// <summary>
/// Returns the next single-character token, or null at end of input.
/// </summary>
public override Token Next()
{
length = 0;
start = offset;
while (true)
{
char c;
offset++;
// Refill the I/O buffer once it has been fully consumed.
if (bufferIndex >= dataLen)
{
dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
bufferIndex = 0;
};
// TextReader.Read returns 0 at end of stream: emit any pending token.
if (dataLen == 0)
return Flush();
else
{
c = ioBuffer[bufferIndex++];
}
switch(Char.GetUnicodeCategory(c))
{
case UnicodeCategory.DecimalDigitNumber:
// A digit becomes an individual token.
Push(c);
return Flush();
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
// A letter becomes an individual (lower-cased) token.
Push(c);
return Flush();
case UnicodeCategory.OtherLetter:
// If a token is already pending, un-read this character and emit the
// pending token first; otherwise emit this character on its own.
if (length > 0)
{
bufferIndex--;
offset--;
return Flush();
}
Push(c);
return Flush();
default:
// Any other category is skipped; emit a pending token if one exists.
if (length > 0) return Flush();
break;
}
}
}
}
{
public ProductTokenizer(TextReader _in)
{
input = _in;
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static int MAX_WORD_LEN = 255;
private static int IO_BUFFER_SIZE = 1024;
private char[] buffer = new char[MAX_WORD_LEN];
private char[] ioBuffer = new char[IO_BUFFER_SIZE];
private int length;
private int start;
private void Push(char c)
{
if (length == 0) start = offset-1; // start of token
buffer[length++] = Char.ToLower(c); // buffer it
}
private Token Flush()
{
if (length > 0)
{
return new Token(new String(buffer, 0, length), start, start+length);
}
else
return null;
}
public override Token Next()
{
length = 0;
start = offset;
while (true)
{
char c;
offset++;
if (bufferIndex >= dataLen)
{
dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
bufferIndex = 0;
};
if (dataLen == 0)
return Flush();
else
{
c = ioBuffer[bufferIndex++];
}
switch(Char.GetUnicodeCategory(c))
{
case UnicodeCategory.DecimalDigitNumber:
Push(c);
return Flush();
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
Push(c);
return Flush();
case UnicodeCategory.OtherLetter:
if (length > 0)
{
bufferIndex--;
offset--;
return Flush();
}
Push(c);
return Flush();
default:
if (length > 0) return Flush();
break;
}
}
}
}
/// <summary>
/// Token filter for product names: removes stop words and keeps only tokens
/// whose first character is a letter (lower/upper/other) or a decimal digit.
/// </summary>
public sealed class ProductFilter : TokenFilter
{
    // Only English now, Chinese to be added later.
    public static String[] STOP_WORDS =
    {
        "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "such",
        "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    // Lookup table built once from STOP_WORDS for constant-time membership checks.
    private Hashtable stopTable;

    public ProductFilter(TokenStream _in)
        : base(_in)
    {
        stopTable = new Hashtable(STOP_WORDS.Length);
        foreach (String stopWord in STOP_WORDS)
            stopTable[stopWord] = stopWord;
    }

    /// <summary>
    /// Returns the next token that passes the stop-word and character-category
    /// checks, or null when the underlying stream is exhausted.
    /// </summary>
    public override Token Next()
    {
        Token token;
        while ((token = input.Next()) != null)
        {
            String text = token.TermText();
            if (stopTable[text] != null)
                continue; // stop word -- drop it

            UnicodeCategory category = Char.GetUnicodeCategory(text[0]);
            if (category == UnicodeCategory.LowercaseLetter
                || category == UnicodeCategory.UppercaseLetter
                || category == UnicodeCategory.OtherLetter
                || category == UnicodeCategory.DecimalDigitNumber)
            {
                return token;
            }
        }
        return null;
    }
}
{
// Only English now, Chinese to be added later.
public static String[] STOP_WORDS =
{
"and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private Hashtable stopTable;
public ProductFilter(TokenStream _in)
: base(_in)
{
stopTable = new Hashtable(STOP_WORDS.Length);
for (int i = 0; i < STOP_WORDS.Length; i++)
stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
}
public override Token Next()
{
for (Token token = input.Next(); token != null; token = input.Next())
{
String text = token.TermText();
if (stopTable[text] == null)
{
switch (Char.GetUnicodeCategory(text[0]))
{
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
return token;
case UnicodeCategory.OtherLetter:
return token;
case UnicodeCategory.DecimalDigitNumber:
return token;
}
}
}
return null;
}
}
实际使用下来,搜索效果理想