Lucene自定义同义词分词器

 1 package com.lucene.util;
 2 
 3 import java.io.Reader;
 4 
 5 import org.apache.lucene.analysis.Analyzer;
 6 import org.apache.lucene.analysis.TokenStream;
 7 
 8 import com.chenlb.mmseg4j.Dictionary;
 9 import com.chenlb.mmseg4j.MaxWordSeg;
10 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
11 
12 public class MySameworkAnalyzer extends Analyzer {
13 
14     @Override
15     public TokenStream tokenStream(String str, Reader reader) {
16         //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
17         Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
18         return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
19     }
20 
21 }

 1     @Test
 2     public void test05(){
 3         try {
 4             Analyzer a1=new MySameworkAnalyzer();
 5             String str="我来自中国,我的名字叫什么";
 6             AnalyzerUtil.displayToken(str, a1);
 7             Directory directory=new RAMDirectory();
 8             IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
 9             Document document=new Document();
10             document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
11             indexWriter.addDocument(document);
12             indexWriter.close();
13             IndexReader indexReader=IndexReader.open(directory);
14             IndexSearcher searcher=new IndexSearcher(indexReader);
15             TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
16             ScoreDoc[] docs=tds.scoreDocs;
17             Document doc=searcher.doc(docs[0].doc);
18             System.out.println(doc.get("content"));
19             searcher.close();
20             indexReader.close();
21         } catch (CorruptIndexException e) {
22             e.printStackTrace();
23         } catch (LockObtainFailedException e) {
24             e.printStackTrace();
25         } catch (IOException e) {
26             e.printStackTrace();
27         }
28     }

 1 package com.lucene.util;
 2 
 3 import java.io.IOException;
 4 import java.util.HashMap;
 5 import java.util.Map;
 6 import java.util.Stack;
 7 
 8 import org.apache.lucene.analysis.TokenFilter;
 9 import org.apache.lucene.analysis.TokenStream;
10 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
12 import org.apache.lucene.util.AttributeSource;
13 
14 public class MySameworkFilter extends TokenFilter  {
15 
16     //保存相应的词汇
17     private CharTermAttribute cta=null;
18     //保存词与词之间的位置增量
19     private PositionIncrementAttribute pia=null;
20     //定义一个状态
21     private AttributeSource.State current=null;
22     //用栈保存同义词集合
23     private Stack<String> sames=null;
24     protected MySameworkFilter(TokenStream input) {
25         super(input);
26         cta=this.addAttribute(CharTermAttribute.class);
27         pia=this.addAttribute(PositionIncrementAttribute.class);
28         sames=new Stack<String>();
29     }
30 
31 
32     @Override
33     public boolean incrementToken() throws IOException {
34         if(sames.size()>0){
35             //将元素出栈,并获取同义词
36             String str=sames.pop();
37             //还原状态
38             restoreState(current);
39             //先清空,再添加
40             cta.setEmpty();
41             cta.append(str);
42             //设置位置为0,表示同义词
43             pia.setPositionIncrement(0);
44             return true;
45         }
46         
47         if(!this.input.incrementToken())
48         return false;
49         
50         //如果改词中有同义词,捕获当前状态
51         if(this.getSamewords(cta.toString())){
52             current=captureState();
53         }
54         
55         return true;
56     }
57 
58     //定义同义词字典,并判断如果有同义词就返回true
59     private boolean getSamewords(String key){
60         Map<String, String[]> maps=new HashMap<String, String[]>();
61         maps.put("我", new String[]{"咱","俺"});
62         maps.put("中国", new String[]{"大陆","天朝"});
63         
64         if(maps.get(key)!=null){
65             for(String s:maps.get(key)){
66                 sames.push(s);
67             }
68         }
69         
70         if(sames.size()>0){
71             return true;
72         }
73         return false;
74     }
75 
76 }

posted @ 2012-04-22 11:58 Paul.Lau 阅读(4787) 评论(2) 编辑收藏举报

刷新页面返回顶部

Lucene自定义同义词分词器

公告