A small Lucene custom analyzer demo
2012-05-01 22:43 Lves Li

Custom stop-word analyzer (MyStopAnalyzer):

package LuceneUtil;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

// custom stop-word filtering analyzer
public class MyStopAnalyzer extends Analyzer {

    private Set stops;

    // the parameter is an array of extra stop words
    public MyStopAnalyzer(String[] sws) {
        // makeStopSet converts the string array into a Set automatically
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
        // add the built-in English stop words to our own stop set
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    public MyStopAnalyzer() {
        stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    public TokenStream tokenStream(String fieldName, Reader reader) {
        // tokenize by letters, lowercase the terms, then drop the stop words
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                stops);
    }
}
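The post only runs this analyzer through the display helper below. As a minimal sketch of how it could also be plugged into indexing (the class name MyStopAnalyzerIndexDemo, the in-memory RAMDirectory and the sample field are my own assumptions, not from the original code), the same stop words would then be dropped at index time as well:

package LuceneUtil;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// hypothetical index-time demo, not part of the original post
public class MyStopAnalyzerIndexDemo {
    public static void main(String[] args) throws Exception {
        // in-memory index, just for demonstration
        RAMDirectory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                new MyStopAnalyzer(new String[] {"you", "meet"}));
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        doc.add(new Field("contents", "nice to meet you",
                Field.Store.YES, Field.Index.ANALYZED));
        // "meet" and "you" never reach the inverted index
        writer.addDocument(doc);
        writer.close();
    }
}

A search for "meet" against such an index should return no hits, while the stored field still keeps the original sentence.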
Test class (TestAnalyzer):

package LuceneTest;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

import LuceneUtil.AnalyzerUtils;
import LuceneUtil.MyStopAnalyzer;

public class TestAnalyzer {

    static ArrayList<String> list = null;

    public static void main(String[] args) throws IOException {
        //addNewWord("烟台大学 ");
        //test();
        test01();
    }

    public static void test() {
        // mmseg4j analyzer for Chinese text
        Analyzer a1 = new MMSegAnalyzer();
        String txt = "我是一名大学生,我来自菏*,我现在烟台大学。";
        AnalyzerUtils.displayToken(txt, a1);
    }

    public static void test01() {
        // the custom stop analyzer: this filters out "you", "meet" and "***"
        Analyzer a2 = new MyStopAnalyzer(new String[] {"you", "meet", "***"});
        // the built-in StopAnalyzer, for comparison
        Analyzer a3 = new StopAnalyzer(Version.LUCENE_35);
        String txt = " i say :how are You,nice to meet you. ***";
        AnalyzerUtils.displayToken(txt, a2);
        AnalyzerUtils.displayToken(txt, a3);
    }

    /*
    public static void addNewWord(String newWord) throws IOException {
        // append a new word to the mmseg4j user dictionary
        BufferedWriter bw = new BufferedWriter(new FileWriter("G:\\mmseg\\data\\words-my.dic"));
        ArrayList<String> list = new ArrayList<String>();
        list.add(newWord);
        Iterator<String> iterator = list.iterator();
        while (iterator.hasNext()) {
            bw.write(iterator.next());
            bw.flush();
            bw.newLine();
        }
        bw.close();
        System.out.println("添加成功");
    }
    */
}

Helper class used by the tests (AnalyzerUtils):

package LuceneUtil;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// helper class that prints the tokens produced by an analyzer
public class AnalyzerUtils {

    public static void displayToken(String str, Analyzer a) {
        try {
            TokenStream ts = a.tokenStream("contents", new StringReader(str));
            // add the term attribute to the stream; it is updated on every incrementToken() call
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

The test output is as follows:
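As a side note, a slightly extended helper could also print position increments and offsets, which makes the holes left by removed stop words visible. This is my own sketch, not part of the original post; the class and method names (AnalyzerDetailUtils, displayTokenDetails) are hypothetical:

package LuceneUtil;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// hypothetical helper: like displayToken, but also shows position increments and offsets
public class AnalyzerDetailUtils {

    public static void displayTokenDetails(String str, Analyzer a) {
        try {
            TokenStream ts = a.tokenStream("contents", new StringReader(str));
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute pia = ts.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
            while (ts.incrementToken()) {
                // an increment greater than 1 usually marks the position of a dropped stop word
                System.out.println(pia.getPositionIncrement() + ": [" + cta + "] "
                        + oa.startOffset() + "-" + oa.endOffset());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}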
As you can see, the words I wanted to filter out have indeed been removed:
the first line is the output with the custom stop filter applied,
the second line is the output without it.
[i][say][how][nice]
[i][say][how][you][nice][meet][you][***]

Shortcoming: individual Chinese words cannot be filtered this way yet; for Chinese, only a whole sentence can be blocked, whereas English words can be filtered one by one.
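One possible direction (my own sketch, not from the original post): the limitation comes from LetterTokenizer, which splits on letter boundaries rather than Chinese words. If the stop filter is placed on top of a Chinese-aware tokenizer such as mmseg4j, individual Chinese words can be dropped too. The class name MyChineseStopAnalyzer is hypothetical:

package LuceneUtil;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

// hypothetical sketch: Chinese word segmentation first, then stop-word filtering
public class MyChineseStopAnalyzer extends Analyzer {

    private final Analyzer inner = new MMSegAnalyzer(); // does the Chinese segmentation
    private final Set stops;

    public MyChineseStopAnalyzer(String[] sws) {
        stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
    }

    public TokenStream tokenStream(String fieldName, Reader reader) {
        // segment into Chinese words, then drop the unwanted ones
        return new StopFilter(Version.LUCENE_35, inner.tokenStream(fieldName, reader), stops);
    }
}

With this, something like AnalyzerUtils.displayToken(txt, new MyChineseStopAnalyzer(new String[] {"烟台大学"})) should drop that single word from the token stream, provided mmseg4j actually segments it as one term.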