ik-analyzer for Lucene 4.x
Porting ik-analyzer to the Lucene 4.x API requires modifying three classes: IKAnalyzer.java, IKTokenizer.java, and IKTokenizerFactory.java. The adapted sources follow.
IKAnalyzer.java:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Chinese analyzer implementing the Lucene Analyzer API on top of IKTokenizer.
 *
 * @author 林良益
 */
public final class IKAnalyzer extends Analyzer {

    private boolean isMaxWordLength = false;

    /**
     * IK analyzer, Lucene Analyzer implementation.
     * Defaults to the finest-grained segmentation algorithm.
     */
    public IKAnalyzer() {
        this(false);
    }

    /**
     * IK analyzer, Lucene Analyzer implementation.
     *
     * @param isMaxWordLength
     *            when true, the tokenizer uses maximum-word-length segmentation
     */
    public IKAnalyzer(boolean isMaxWordLength) {
        super();
        this.setMaxWordLength(isMaxWordLength);
    }

    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new IKTokenizer(reader, isMaxWordLength());
        // use the single-argument constructor: the tokenizer itself is the sink;
        // passing null as the second argument would break TokenStreamComponents.getTokenStream()
        return new TokenStreamComponents(tokenizer);
    }

    public void setMaxWordLength(boolean isMaxWordLength) {
        this.isMaxWordLength = isMaxWordLength;
    }

    public boolean isMaxWordLength() {
        return isMaxWordLength;
    }
}
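For a quick check that the analyzer compiles against the 4.x API, here is a minimal indexing sketch. It assumes the rewritten IKAnalyzer lives in org.wltea.analyzer.lucene as in the original IK distribution; the index path, field name, and sample text are illustrative only.

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;   // assumed package, as in the original IK distribution

public class IKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // true = maximum-word-length segmentation, false = finest-grained segmentation
        Analyzer analyzer = new IKAnalyzer(true);
        Directory dir = FSDirectory.open(new File("/tmp/ik-index"));   // illustrative path
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        IndexWriter writer = new IndexWriter(dir, cfg);

        Document doc = new Document();
        doc.add(new TextField("content", "IK Analyzer是一个开源的中文分词工具包", Field.Store.YES));
        writer.addDocument(doc);

        writer.close();
        dir.close();
    }
}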
IKTokenizer.java:

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
 * IK Analyzer v3.2 Tokenizer adapter for Lucene 4.x; wraps the IKSegmentation implementation.
 *
 * @author 林良益
 */
public final class IKTokenizer extends Tokenizer {
    // the IK segmenter implementation
    private IKSegmentation _IKImplement;
    // term text attribute
    private CharTermAttribute termAtt;
    // term offset attribute
    private OffsetAttribute offsetAtt;
    // end offset of the last token, reported by end()
    private int finalOffset;

    /**
     * Lucene Tokenizer adapter constructor.
     *
     * @param in
     * @param isMaxWordLength
     *            when true, use maximum-word-length segmentation; when false, use finest-grained segmentation
     */
    public IKTokenizer(Reader in, boolean isMaxWordLength) {
        super(in);
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        _IKImplement = new IKSegmentation(in, isMaxWordLength);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        // clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // copy the Lexeme into the attributes:
            // set the term text
            termAtt.setEmpty().append(nextLexeme.getLexemeText());
            // set the term offsets, corrected for any preceding CharFilter
            offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()),
                    correctOffset(nextLexeme.getEndPosition()));
            finalOffset = correctOffset(nextLexeme.getEndPosition());
            // return true to signal that another token is available
            return true;
        }
        // return false to signal that the token stream is exhausted
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
    }

    @Override
    public final void end() {
        // report the end offset of the last token as the final offset
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
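A minimal sketch of the Lucene 4.x TokenStream contract (reset, then incrementToken until it returns false, then end and close) exercised against the adapted tokenizer; the sample sentence is arbitrary.

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class IKTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // finest-grained segmentation of a sample sentence
        IKTokenizer tokenizer = new IKTokenizer(new StringReader("中华人民共和国"), false);
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);

        tokenizer.reset();                 // must be called before incrementToken() in Lucene 4.x
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        tokenizer.end();                   // reports the final offset
        tokenizer.close();
    }
}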
IKTokenizerFactory.java:

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Solr 4.x TokenizerFactory implementation backed by IKTokenizer.
 *
 * @author 林良益、李良杰
 */
public final class IKTokenizerFactory extends TokenizerFactory {

    private boolean isMaxWordLength = false;

    /**
     * IK tokenizer factory for Solr.
     * Defaults to the finest-grained segmentation algorithm.
     */
    public IKTokenizerFactory() {
    }

    /**
     * Reads the isMaxWordLength flag from the factory arguments
     * declared on the tokenizer element in schema.xml.
     */
    @Override
    public void init(Map<String, String> args) {
        super.init(args);
        String _arg = args.get("isMaxWordLength");
        isMaxWordLength = Boolean.parseBoolean(_arg);
    }

    /**
     * Creates an IKTokenizer for the given Reader.
     */
    @Override
    public Tokenizer create(Reader reader) {
        return new IKTokenizer(reader, isMaxWordLength());
    }

    public void setMaxWordLength(boolean isMaxWordLength) {
        this.isMaxWordLength = isMaxWordLength;
    }

    public boolean isMaxWordLength() {
        return isMaxWordLength;
    }
}
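In Solr the factory is normally referenced from a field type in schema.xml, e.g. a tokenizer element with class="org.wltea.analyzer.solr.IKTokenizerFactory" and isMaxWordLength="false" (the package name here is an assumption based on where the IK distribution keeps its Solr classes; use whatever package this factory is actually compiled into). The sketch below imitates what Solr does with the factory at analyzer-construction time: pass the element's attributes to init(), then create a tokenizer per field value.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IKTokenizerFactoryDemo {
    public static void main(String[] args) throws Exception {
        // the arguments Solr would read from the tokenizer element's attributes
        Map<String, String> factoryArgs = new HashMap<String, String>();
        factoryArgs.put("isMaxWordLength", "true");

        IKTokenizerFactory factory = new IKTokenizerFactory();
        factory.init(factoryArgs);

        Tokenizer tokenizer = factory.create(new StringReader("中文分词测试"));
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}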