Lucene 自定义分词器
package com.lucene.util;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * Stop-word analyzer: tokenizes with {@link LetterTokenizer}, lower-cases each
 * token with {@link LowerCaseFilter}, then removes stop words with
 * {@link StopFilter}.
 */
public class UserDefinedAnalyzer extends Analyzer {

    /** Lucene compatibility version passed to every analysis component. */
    private static final Version MATCH_VERSION = Version.LUCENE_35;

    /** Stop words removed from the token stream; assigned once at construction. */
    private final Set<?> stops;

    /**
     * Creates an analyzer that filters only the default English stop words
     * ({@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}).
     */
    public UserDefinedAnalyzer() {
        stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    /**
     * Creates an analyzer that filters the given stop words in addition to the
     * default English stop words.
     *
     * @param sws extra stop words, matched case-insensitively; {@code null} is
     *            treated as "no extra words" so only the defaults apply
     */
    public UserDefinedAnalyzer(String[] sws) {
        if (sws == null) {
            // Fall back to the defaults instead of failing inside makeStopSet.
            stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        } else {
            // Build a fresh set from the caller's words (ignoreCase = true) ...
            Set<Object> merged = StopFilter.makeStopSet(MATCH_VERSION, sws, true);
            // ... then add the default English stop words to it.
            merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            stops = merged;
        }
    }

    /**
     * Builds the analysis chain: letter tokenizer, then lower-case filter, then
     * stop-word filter.
     *
     * @param str    field name supplied by Lucene; not used by this analyzer
     * @param reader source of the text to tokenize
     * @return the token stream with stop words removed
     */
    @Override
    public TokenStream tokenStream(String str, Reader reader) {
        TokenStream letters = new LetterTokenizer(MATCH_VERSION, reader);
        // Lower-case before stop filtering so tokens are in one case when compared.
        TokenStream lowered = new LowerCaseFilter(MATCH_VERSION, letters);
        return new StopFilter(MATCH_VERSION, lowered, stops);
    }

}
@Test public void test04(){ Analyzer a1=new UserDefinedAnalyzer(new String[]{"my","name"}); //Analyzer a1=new UserDefinedAnalyzer(); String str="my name is paul"; AnalyzerUtil.displayToken(str, a1); }