Lucene:初体验

Lucene系列介绍

Lucene:初体验

 使用Lucene3.6版本,到官网下载lucene-3.6.0.zip,解压。

需要用到的jar:

 \lucene-3.6.0\lucene-core-3.6.0.jar                     (Lucene的核心包)

 \lucene-3.6.0\contrib\analyzers\common\lucene-analyzers-3.6.0.jar (分词器)

 \lucene-3.6.0\contrib\highlighter\lucene-highlighter-3.6.0.jar  (高亮关键词使用)

 \lucene-3.6.0\contrib\memory\lucene-memory-3.6.0.jar         (高亮关键词使用)

 

1、新建一个Java Project,并引入上述jar。 

2、初步练习

 1 public class HelloWord {  
 2     public static void createIndexFile() {  
 3         IndexWriter indexWriter=null;  
 4         try {  
 5             // 需要的分词器  
 6             Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);  
 7             // 创建的是哪个版本的IndexWriterConfig  
 8             IndexWriterConfig indexWriterConfig = new IndexWriterConfig(  
 9                     Version.LUCENE_36, analyzer);  
10             // 创建系统文件----- ./ 当前路径下的  
11             Directory directory = new SimpleFSDirectory(new File("./indexDir/"));  
12             indexWriter = new IndexWriter(directory,indexWriterConfig);   
13             //获取实体对象  
14             Article article=new Article(11,"最XX的城市","XX");    
15             //indexWriter添加索引  
16             Document doc=new Document();  
17             //文本中添加内容            标题      内容  
18             /*doc.add(new Field("title","中国的首都在哪里",Store.YES,Index.ANALYZED));  
19             doc.add(new Field("content","中国的首都在北京",Store.YES,Index.ANALYZED));*/      
20             doc.add(new Field("id",article.getId().toString(),Store.YES,Index.ANALYZED));  
21             doc.add(new Field("title",article.getTitle().toString(),Store.YES,Index.ANALYZED));  
22             doc.add(new Field("content",article.getContent().toString(),Store.YES,Index.ANALYZED));   
23             //添加到索引中去  
24             indexWriter.addDocument(doc);     
25         } catch (IOException e) {  
26             // TODO Auto-generated catch block  
27             e.printStackTrace();  
28         }finally{  
29             if(indexWriter!=null){  
30                 try {  
31                     indexWriter.close();  
32                 }  catch (IOException e) {  
33                     // TODO Auto-generated catch block  
34                     e.printStackTrace();  
35                 }  
36             }  
37         }  
38     }  
39     //如果查询是需要用到解析器,那解析器必须和创建时的解析器相同  
40     public static void searchIndexFileResult() throws IOException {   
41         List<Article> articles=new ArrayList<Article>();      
42         //得到索引的目录  
43         Directory directory = new SimpleFSDirectory(new File("./indexDir/"));  
44         //根据目录打开一个indexReader  
45         IndexReader indexReader=IndexReader.open(directory);  
46         //System.out.println(indexReader.maxDoc());   
47         //获取最小值的document对象  
48         //Document doc=indexReader.document(0);  
49         //获取最大值的document对象  
50         //Document doc=indexReader.document(indexReader.maxDoc()-1);  
51         //document对象的get(字段名称)方法获取字段的值  
52         /*System.out.println(doc.get("id"));  
53         System.out.println(doc.get("title"));  
54         System.out.println(doc.get("content"));*/     
55         int n=indexReader.maxDoc();  
56         for(int i=0;i<n;i++){  
57             Document doc=indexReader.document(i);  
58             Article article=new Article();  
59             if(doc.get("id")==null){  
60                 System.out.println("id为空");  
61             }else{  
62                 article.setId(Integer.parseInt(doc.get("id")));  
63                 article.setTitle(doc.get("title"));  
64                 article.setContent(doc.get("content"));  
65                 articles.add(article);  
66             }  
67         }  
68         for(Article article:articles){  
69             System.out.println(article.toString());  
70         }     
71     }  
72     public static void main(String[] args) throws IOException {  
73         // 建立要索引的文件  
74     //  createIndexFile();  
75         // 从索引文件中查询数据  
76         searchIndexFileResult();  
77         // 获得结果,然后交由相关应用程序处理  
78     }  
79 } 

 

2、模拟图书搜索写一个简单的例子。

编写一个图书实体类。

 1 package com.cndatacom.lucene.entity;
 /**
  * A book: plain mutable bean holding the values that get indexed and searched.
  *
  * @author zhou
  */
 public class Book {

     private Integer id;

     /** Title of the book. */
     private String title;

     /** Content (body text or summary) of the book. */
     private String content;

     /** Author of the book. */
     private String author;

     /** @return the id, or {@code null} if none has been set */
     public Integer getId() {
         return id;
     }

     /** @param id the id to set */
     public void setId(Integer id) {
         this.id = id;
     }

     /** @return the title, or {@code null} if none has been set */
     public String getTitle() {
         return title;
     }

     /** @param title the title to set */
     public void setTitle(String title) {
         this.title = title;
     }

     /** @return the content, or {@code null} if none has been set */
     public String getContent() {
         return content;
     }

     /** @param content the content to set */
     public void setContent(String content) {
         this.content = content;
     }

     /** @return the author, or {@code null} if none has been set */
     public String getAuthor() {
         return author;
     }

     /** @param author the author to set */
     public void setAuthor(String author) {
         this.author = author;
     }
 }

3、简单的测试。

 

  1 package com.cndatacom.lucene.test;
  2  
  3  import java.io.File;
  4  import java.io.IOException;
  5  import java.util.ArrayList;
  6  import java.util.List;
  7  
  8  import org.apache.lucene.analysis.Analyzer;
  9  import org.apache.lucene.analysis.standard.StandardAnalyzer;
 10  import org.apache.lucene.document.Document;
 11  import org.apache.lucene.document.Field;
 12  import org.apache.lucene.document.Field.Index;
 13  import org.apache.lucene.document.Field.Store;
 14  import org.apache.lucene.index.CorruptIndexException;
 15  import org.apache.lucene.index.IndexReader;
 16  import org.apache.lucene.index.IndexWriter;
 17  import org.apache.lucene.index.IndexWriterConfig;
 18  import org.apache.lucene.queryParser.MultiFieldQueryParser;
 19  import org.apache.lucene.queryParser.ParseException;
 20  import org.apache.lucene.queryParser.QueryParser;
 21  import org.apache.lucene.search.IndexSearcher;
 22  import org.apache.lucene.search.Query;
 23  import org.apache.lucene.search.ScoreDoc;
 24  import org.apache.lucene.search.TopDocs;
 25  import org.apache.lucene.search.highlight.Formatter;
 26  import org.apache.lucene.search.highlight.Fragmenter;
 27  import org.apache.lucene.search.highlight.Highlighter;
 28  import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
 29  import org.apache.lucene.search.highlight.QueryScorer;
 30  import org.apache.lucene.search.highlight.Scorer;
 31  import org.apache.lucene.search.highlight.SimpleFragmenter;
 32  import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 33  import org.apache.lucene.store.Directory;
 34  import org.apache.lucene.store.FSDirectory;
 35  import org.apache.lucene.util.Version;
 36  
 37  import org.junit.Before;
 38  import org.junit.Test;
 39  
 40  import com.cndatacom.lucene.entity.Book;
 41  
 42  public class LuceneTest {        
 43      //分词器    
 44      private Analyzer analyzer;        
 45      //索引存放目录    
 46      private Directory directory;        
 47      
 48      /**     
 49      * 初始化Analyzer和Directory     
 50      * @throws IOException      
 51      */    
 52      @Before    
 53      public void before() throws IOException {                
 54          //建立一个标准分词器        
 55          //Version.LUCENE_36 表示匹配Lucene3.6版本        
 56          analyzer = new StandardAnalyzer(Version.LUCENE_36);                
 57          //在当前路径下建立一个目录叫indexDir        
 58          File indexDir = new File("./indexDir");                
 59          //创建索引目录        
 60          directory = FSDirectory.open(indexDir);    
 61      }    
 62              
 63      /**     
 64      * 建立索引文件     
 65      * @throws IOException      
 66      */    
 67      @Test    
 68      public void testCreateIndex() throws IOException {                
 69          //建立一个IndexWriter配置,指定匹配的版本,以及分词器        
 70          IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36,analyzer);
 71          //创建IndexWriter,它负责索引的创建和维护        
 72          IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);                
 73          //获取图书信息        
 74          Book book1 = new Book();        
 75          book1.setId(1);        
 76          book1.setTitle("Java编程思想");        
 77          book1.setAuthor("Bruce Eckel");        
 78          book1.setContent("Thinking in Java should be read cover to cover by every Java programmer, then kept close at hand for frequent reference.");                
 79          
 80          Book book2 = new Book();        
 81          book2.setId(2);        
 82          book2.setTitle("建筑的永恒之道");        
 83          book2.setAuthor("亚历山大");        
 84          book2.setContent("《建筑的永恒之道》提出了一个关于建筑设计、建筑和规划的新的理论、思想,该理论的核心是社会成员按照他们自己的存在状态设定他们生活的世界秩序,这一古老方式从根本上构成了新的后工业时代建筑的基础,这些建筑由人们创造。");                
 85          
 86          //建立Document        
 87          Document doc1 = new Document();                
 88          //Store指定Field是否需要存储,Index指定Field是否需要分词索引        
 89          doc1.add(new Field("id",book1.getId().toString(),Store.YES,Index.NOT_ANALYZED));        
 90          doc1.add(new Field("title",book1.getTitle(),Store.YES,Index.ANALYZED));        
 91          doc1.add(new Field("author",book1.getAuthor(),Store.YES,Index.ANALYZED));        
 92          doc1.add(new Field("content",book1.getContent(),Store.YES,Index.ANALYZED));                
 93          
 94          //建立Document        
 95          Document doc2 = new Document();                
 96          //Store指定Field是否需要存储,Index指定Field是否需要索引        
 97          doc2.add(new Field("id",book2.getId().toString(),Store.YES,Index.NOT_ANALYZED));        
 98          doc2.add(new Field("title",book2.getTitle(),Store.YES,Index.ANALYZED));        
 99          doc2.add(new Field("author",book2.getAuthor(),Store.YES,Index.ANALYZED));        
100          doc2.add(new Field("content",book2.getContent(),Store.YES,Index.ANALYZED));                
101          
102          //把Document加入到索引中        
103          indexWriter.addDocument(doc1);        
104          indexWriter.addDocument(doc2);                
105          //提交改变到索引,然后关闭        
106          indexWriter.close();            
107      }
108      
109      /**     
110      * 搜索图书     
111      * @throws ParseException      
112      * @throws IOException      
113      * @throws CorruptIndexException      
114      * @throws InvalidTokenOffsetsException      
115      */    
116      @Test    
117      public void testSearchBook() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {        
118          //搜索的关键词        
119          String queryKeyWord = "思想";                
120          //创建查询分析器,把查询关键词转化为查询对象Query(单个Field, 如作者author域中搜索)        
121          //QueryParser queryParser = new QueryParser(Version.LUCENE_36,"author",analyzer);
122          //在多个域中搜索,如域title和content                        
123          String[] fields = {"title","content"};        
124          QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36,fields,analyzer);        
125          Query query = queryParser.parse(queryKeyWord);                
126      
127          //获取访问索引的接口,进行搜索        
128          IndexReader indexReader  = IndexReader.open(directory);        
129          IndexSearcher indexSearcher = new IndexSearcher(indexReader);                
130      
131          //TopDocs 搜索返回的结果        
132          TopDocs topDocs = indexSearcher.search(query, 100);//只返回前100条记录                
133          int totalCount = topDocs.totalHits; // 搜索结果总数量        
134          System.out.println("搜索到的结果总数量为:" + totalCount);                
135          ScoreDoc[] scoreDocs = topDocs.scoreDocs; // 搜索的结果列表                
136      
137          //创建高亮器,使搜索的关键词突出显示        
138          Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");        
139          Scorer fragmentScore = new QueryScorer(query);        
140          Highlighter highlighter = new Highlighter(formatter,fragmentScore);        
141          Fragmenter fragmenter = new SimpleFragmenter(100);        
142          highlighter.setTextFragmenter(fragmenter);                
143          List<Book> books = new ArrayList<Book>();        
144      
145          //把搜索结果取出放入到集合中        
146          for(ScoreDoc scoreDoc : scoreDocs) {            
147              int docID = scoreDoc.doc;//当前结果的文档编号            
148              float score = scoreDoc.score;//当前结果的相关度得分            
149              System.out.println("score is : "+score);                        
150              Document document = indexSearcher.doc(docID);            
151              Book book = new Book();            
152              book.setId(Integer.parseInt(document.get("id")));                        
153          
154              //高亮显示title            
155              String title =  document.get("title");            
156              String highlighterTitle = highlighter.getBestFragment(analyzer, "title", title);                        
157              //如果title中没有找到关键词            
158              if(highlighterTitle == null) {                
159                  highlighterTitle = title;            
160              }            
161              book.setTitle(highlighterTitle);                        
162              book.setAuthor(document.get("author"));                        
163          
164              //高亮显示content            
165              String content =  document.get("content");            
166              String highlighterContent = highlighter.getBestFragment(analyzer, "content", content);                        
167              //如果content中没有找到关键词            
168              if(highlighterContent == null) {                
169                  highlighterContent = content;            
170              }            
171              book.setContent(highlighterContent);                        
172              books.add(book);        
173          }        
174          //关闭        
175          indexReader.close();        
176          indexSearcher.close();        
177      
178          for(Book book : books) {            
179              System.out.println("book'id is : "+book.getId());            
180              System.out.println("book'title is : "+book.getTitle());            
181              System.out.println("book'author is : "+book.getAuthor());            
182              System.out.println("book'content is : "+book.getContent());        
183          }
184      }
185  }

 


 

posted @ 2012-09-25 22:40  bluepoint2009  阅读(1045)  评论(0编辑  收藏  举报