Lucene:初体验
Lucene:初体验
使用Lucene3.6版本,到官网下载lucene-3.6.0.zip,解压。
需要用到的jar:
\lucene-3.6.0\lucene-core-3.6.0.jar (Lucene的核心包)
\lucene-3.6.0\contrib\analyzers\common\lucene-analyzers-3.6.0.jar (分词器)
\lucene-3.6.0\contrib\highlighter\lucene-highlighter-3.6.0.jar (高亮关键词使用)
\lucene-3.6.0\contrib\memory\lucene-memory-3.6.0.jar (高亮器lucene-highlighter所依赖的包,高亮关键词时需要)
1、新建一个Java Project,并引入上述jar。
2、初步练习
1 public class HelloWord { 2 public static void createIndexFile() { 3 IndexWriter indexWriter=null; 4 try { 5 // 需要的分词器 6 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); 7 // 创建的是哪个版本的IndexWriterConfig 8 IndexWriterConfig indexWriterConfig = new IndexWriterConfig( 9 Version.LUCENE_36, analyzer); 10 // 创建系统文件----- ./ 当前路径下的 11 Directory directory = new SimpleFSDirectory(new File("./indexDir/")); 12 indexWriter = new IndexWriter(directory,indexWriterConfig); 13 //获取实体对象 14 Article article=new Article(11,"最XX的城市","XX"); 15 //indexWriter添加索引 16 Document doc=new Document(); 17 //文本中添加内容 标题 内容 18 /*doc.add(new Field("title","中国的首都在哪里",Store.YES,Index.ANALYZED)); 19 doc.add(new Field("content","中国的首都在北京",Store.YES,Index.ANALYZED));*/ 20 doc.add(new Field("id",article.getId().toString(),Store.YES,Index.ANALYZED)); 21 doc.add(new Field("title",article.getTitle().toString(),Store.YES,Index.ANALYZED)); 22 doc.add(new Field("content",article.getContent().toString(),Store.YES,Index.ANALYZED)); 23 //添加到索引中去 24 indexWriter.addDocument(doc); 25 } catch (IOException e) { 26 // TODO Auto-generated catch block 27 e.printStackTrace(); 28 }finally{ 29 if(indexWriter!=null){ 30 try { 31 indexWriter.close(); 32 } catch (IOException e) { 33 // TODO Auto-generated catch block 34 e.printStackTrace(); 35 } 36 } 37 } 38 } 39 //如果查询是需要用到解析器,那解析器必须和创建时的解析器相同 40 public static void searchIndexFileResult() throws IOException { 41 List<Article> articles=new ArrayList<Article>(); 42 //得到索引的目录 43 Directory directory = new SimpleFSDirectory(new File("./indexDir/")); 44 //根据目录打开一个indexReader 45 IndexReader indexReader=IndexReader.open(directory); 46 //System.out.println(indexReader.maxDoc()); 47 //获取最小值的document对象 48 //Document doc=indexReader.document(0); 49 //获取最大值的document对象 50 //Document doc=indexReader.document(indexReader.maxDoc()-1); 51 //document对象的get(字段名称)方法获取字段的值 52 /*System.out.println(doc.get("id")); 53 System.out.println(doc.get("title")); 54 
System.out.println(doc.get("content"));*/ 55 int n=indexReader.maxDoc(); 56 for(int i=0;i<n;i++){ 57 Document doc=indexReader.document(i); 58 Article article=new Article(); 59 if(doc.get("id")==null){ 60 System.out.println("id为空"); 61 }else{ 62 article.setId(Integer.parseInt(doc.get("id"))); 63 article.setTitle(doc.get("title")); 64 article.setContent(doc.get("content")); 65 articles.add(article); 66 } 67 } 68 for(Article article:articles){ 69 System.out.println(article.toString()); 70 } 71 } 72 public static void main(String[] args) throws IOException { 73 // 建立要索引的文件 74 // createIndexFile(); 75 // 从索引文件中查询数据 76 searchIndexFileResult(); 77 // 获得结果,然后交由相关应用程序处理 78 } 79 }
2、模拟图书搜索,写一个简单的例子。
编写一个图书实体对象。
1 package com.exam.lucene.entity; 2 /** 3 * 图书 4 * @author zhou 5 */ 6 public class Book { 7 private Integer id; 8 /** 9 * 书名 10 */ 11 private String title; 12 /** 13 * 内容 14 */ 15 private String content; 16 /** 17 * 作者 18 */ 19 private String author; 20 21 //省略getter/setter方法 22 //...... 23 }
3、简单的测试。
package com.cndatacom.lucene.test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.cndatacom.lucene.entity.Book;

/**
 * Indexes two books with Lucene 3.6 and runs a highlighted keyword search over them.
 *
 * <p>{@link #testSearchBook()} reads the index written by {@link #testCreateIndex()},
 * so the index must have been created before searching.
 */
public class LuceneTest {
    /** Analyzer shared by indexing, query parsing and highlighting. */
    private Analyzer analyzer;
    /** Directory in which the index is stored. */
    private Directory directory;

    /**
     * Initializes the analyzer and opens the index directory.
     *
     * @throws IOException if the directory cannot be opened
     */
    @Before
    public void before() throws IOException {
        // Standard analyzer; Version.LUCENE_36 selects Lucene 3.6 behaviour.
        analyzer = new StandardAnalyzer(Version.LUCENE_36);
        // The index lives in "indexDir" under the current working directory.
        File indexDir = new File("./indexDir");
        directory = FSDirectory.open(indexDir);
    }

    /**
     * Releases the index directory opened in {@link #before()}.
     *
     * @throws IOException if closing fails
     */
    @After
    public void after() throws IOException {
        if (directory != null) {
            directory.close();
        }
    }

    /**
     * Builds the index from two sample books.
     *
     * @throws IOException if the index cannot be written
     */
    @Test
    public void testCreateIndex() throws IOException {
        // Writer configuration: target version plus the analyzer used to tokenize fields.
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        // The default mode appends to an existing index, which would add the same two
        // books again on every run; recreate the index so the test is repeatable.
        indexWriterConfig.setOpenMode(OpenMode.CREATE);

        Book book1 = new Book();
        book1.setId(1);
        book1.setTitle("Java编程思想");
        book1.setAuthor("Bruce Eckel");
        book1.setContent("Thinking in Java should be read cover to cover by every Java programmer, then kept close at hand for frequent reference.");

        Book book2 = new Book();
        book2.setId(2);
        book2.setTitle("建筑的永恒之道");
        book2.setAuthor("亚历山大");
        book2.setContent("《建筑的永恒之道》提出了一个关于建筑设计、建筑和规划的新的理论、思想,该理论的核心是社会成员按照他们自己的存在状态设定他们生活的世界秩序,这一古老方式从根本上构成了新的后工业时代建筑的基础,这些建筑由人们创造。");

        // IndexWriter creates and maintains the index.
        IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
        try {
            indexWriter.addDocument(toDocument(book1));
            indexWriter.addDocument(toDocument(book2));
        } finally {
            // close() commits the changes; doing it in finally also releases
            // write.lock when indexing fails.
            indexWriter.close();
        }
    }

    /**
     * Searches the books for a keyword and prints the hits with the keyword highlighted.
     *
     * @throws ParseException if the keyword cannot be parsed into a query
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException if the index cannot be read
     * @throws InvalidTokenOffsetsException if highlighting meets inconsistent token offsets
     */
    @Test
    public void testSearchBook() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {
        // The keyword to search for.
        String queryKeyWord = "思想";
        // To search a single field (e.g. author) use a plain QueryParser instead:
        // QueryParser queryParser = new QueryParser(Version.LUCENE_36,"author",analyzer);
        // Here the keyword is searched in several fields: title and content.
        String[] fields = {"title", "content"};
        QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fields, analyzer);
        Query query = queryParser.parse(queryKeyWord);

        List<Book> books = new ArrayList<Book>();

        IndexReader indexReader = IndexReader.open(directory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            try {
                // Only the best 100 hits are returned.
                TopDocs topDocs = indexSearcher.search(query, 100);
                int totalCount = topDocs.totalHits;
                System.out.println("搜索到的结果总数量为:" + totalCount);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;

                // Highlighter that wraps matched keywords in a red <font> tag and
                // cuts the text into fragments of 100 characters.
                Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Scorer fragmentScore = new QueryScorer(query);
                Highlighter highlighter = new Highlighter(formatter, fragmentScore);
                Fragmenter fragmenter = new SimpleFragmenter(100);
                highlighter.setTextFragmenter(fragmenter);

                // Convert every hit into a Book.
                for (ScoreDoc scoreDoc : scoreDocs) {
                    int docID = scoreDoc.doc;       // document number of this hit
                    float score = scoreDoc.score;   // relevance score of this hit
                    System.out.println("score is : " + score);
                    Document document = indexSearcher.doc(docID);

                    Book book = new Book();
                    book.setId(Integer.parseInt(document.get("id")));
                    book.setTitle(highlightOrOriginal(highlighter, "title", document.get("title")));
                    book.setAuthor(document.get("author"));
                    book.setContent(highlightOrOriginal(highlighter, "content", document.get("content")));
                    books.add(book);
                }
            } finally {
                // The searcher wraps the reader, so it is closed first.
                indexSearcher.close();
            }
        } finally {
            indexReader.close();
        }

        for (Book book : books) {
            System.out.println("book'id is : " + book.getId());
            System.out.println("book'title is : " + book.getTitle());
            System.out.println("book'author is : " + book.getAuthor());
            System.out.println("book'content is : " + book.getContent());
        }
    }

    /**
     * Converts a book into a Lucene document. Every field is stored; the id is indexed
     * as one exact term, the other fields are tokenized by the analyzer.
     */
    private Document toDocument(Book book) {
        Document doc = new Document();
        // Store controls whether the value is kept in the index,
        // Index controls whether and how it is made searchable.
        doc.add(new Field("id", book.getId().toString(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("title", book.getTitle(), Store.YES, Index.ANALYZED));
        doc.add(new Field("author", book.getAuthor(), Store.YES, Index.ANALYZED));
        doc.add(new Field("content", book.getContent(), Store.YES, Index.ANALYZED));
        return doc;
    }

    /**
     * Returns the best highlighted fragment of {@code text} for the given field, or
     * {@code text} itself when the field is absent or contains no keyword.
     */
    private String highlightOrOriginal(Highlighter highlighter, String field, String text)
            throws IOException, InvalidTokenOffsetsException {
        if (text == null) {
            // The document has no stored value for this field; nothing to highlight.
            return null;
        }
        String highlighted = highlighter.getBestFragment(analyzer, field, text);
        return highlighted != null ? highlighted : text;
    }
}