Lucene自定义同义词分词器
1 package com.lucene.util; 2 3 import java.io.Reader; 4 5 import org.apache.lucene.analysis.Analyzer; 6 import org.apache.lucene.analysis.TokenStream; 7 8 import com.chenlb.mmseg4j.Dictionary; 9 import com.chenlb.mmseg4j.MaxWordSeg; 10 import com.chenlb.mmseg4j.analysis.MMSegTokenizer; 11 12 public class MySameworkAnalyzer extends Analyzer { 13 14 @Override 15 public TokenStream tokenStream(String str, Reader reader) { 16 //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器 17 Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data"); 18 return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader)); 19 } 20 21 }
1 @Test 2 public void test05(){ 3 try { 4 Analyzer a1=new MySameworkAnalyzer(); 5 String str="我来自中国,我的名字叫什么"; 6 AnalyzerUtil.displayToken(str, a1); 7 Directory directory=new RAMDirectory(); 8 IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1)); 9 Document document=new Document(); 10 document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED)); 11 indexWriter.addDocument(document); 12 indexWriter.close(); 13 IndexReader indexReader=IndexReader.open(directory); 14 IndexSearcher searcher=new IndexSearcher(indexReader); 15 TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10); 16 ScoreDoc[] docs=tds.scoreDocs; 17 Document doc=searcher.doc(docs[0].doc); 18 System.out.println(doc.get("content")); 19 searcher.close(); 20 indexReader.close(); 21 } catch (CorruptIndexException e) { 22 e.printStackTrace(); 23 } catch (LockObtainFailedException e) { 24 e.printStackTrace(); 25 } catch (IOException e) { 26 e.printStackTrace(); 27 } 28 }
1 package com.lucene.util; 2 3 import java.io.IOException; 4 import java.util.HashMap; 5 import java.util.Map; 6 import java.util.Stack; 7 8 import org.apache.lucene.analysis.TokenFilter; 9 import org.apache.lucene.analysis.TokenStream; 10 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 12 import org.apache.lucene.util.AttributeSource; 13 14 public class MySameworkFilter extends TokenFilter { 15 16 //保存相应的词汇 17 private CharTermAttribute cta=null; 18 //保存词与词之间的位置增量 19 private PositionIncrementAttribute pia=null; 20 //定义一个状态 21 private AttributeSource.State current=null; 22 //用栈保存同义词集合 23 private Stack<String> sames=null; 24 protected MySameworkFilter(TokenStream input) { 25 super(input); 26 cta=this.addAttribute(CharTermAttribute.class); 27 pia=this.addAttribute(PositionIncrementAttribute.class); 28 sames=new Stack<String>(); 29 } 30 31 32 @Override 33 public boolean incrementToken() throws IOException { 34 if(sames.size()>0){ 35 //将元素出栈,并获取同义词 36 String str=sames.pop(); 37 //还原状态 38 restoreState(current); 39 //先清空,再添加 40 cta.setEmpty(); 41 cta.append(str); 42 //设置位置为0,表示同义词 43 pia.setPositionIncrement(0); 44 return true; 45 } 46 47 if(!this.input.incrementToken()) 48 return false; 49 50 //如果改词中有同义词,捕获当前状态 51 if(this.getSamewords(cta.toString())){ 52 current=captureState(); 53 } 54 55 return true; 56 } 57 58 //定义同义词字典,并判断如果有同义词就返回true 59 private boolean getSamewords(String key){ 60 Map<String, String[]> maps=new HashMap<String, String[]>(); 61 maps.put("我", new String[]{"咱","俺"}); 62 maps.put("中国", new String[]{"大陆","天朝"}); 63 64 if(maps.get(key)!=null){ 65 for(String s:maps.get(key)){ 66 sames.push(s); 67 } 68 } 69 70 if(sames.size()>0){ 71 return true; 72 } 73 return false; 74 } 75 76 }