lucene 在项目中的使用
开源全文搜索工具包Lucene3.0.1的使用。
项目环境Struts2 (2.18)+Hibernate(3.0)+Spring(2.5) JDK:1.6 IDE:myEclipse 8.5
项目需求:把站内发布的新闻进行全文检索
新闻实体News
/** News entity: one piece of site news that gets indexed and searched. */
public class News {

    /** Primary key. */
    private int id;
    /** News title. */
    private String title;
    /** News body text. */
    private String contents;

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContents() {
        return contents;
    }

    public void setContents(String contents) {
        this.contents = contents;
    }
}
// The Lucene DAO for the News entity follows.
package com.hkrt.dao; import com.hkrt.domain.LuceneSearchResult; import com.hkrt.domain.News; public interface NewsLuceneDao { public static final String FIELD_ID="id"; public static final String FIELD_TITLE = "title"; public static final String FIELD_CONTENTS = "contents"; // 索引存放目录 public static final String INDEX_DIR = Thread.currentThread().getContextClassLoader().getResource("").getPath()+"index_dir"; /** * 对所有文件进行重新索引 */ public void rebuildAllIndex(); /** * 对指定上传文件对象进行索引并追加到已有的索引文件中 * @param news */ public void doIndexSingle(News news); /** * 根据关键字搜索,返回符合条件的分页数据 * @param keyword 关键字 * @param pageNo 起始页 * @param pageSize 每页要显示的记录数 * @return LuceneSearchResult对象 */ public LuceneSearchResult<News> doSeacher(String keyword, int pageNo,int pageSize); /** * 更新文件的索引 * @param news */ public void updateIndex(News news); /** * 根据文件id删除索引 * @param id */ public void deleteIndex(Integer id); }新闻lucene dao的实现
package com.hkrt.dao.impl; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import com.hkrt.dao.NewsDao; import com.hkrt.dao.NewsLuceneDao; import com.hkrt.domain.LuceneSearchResult; import com.hkrt.domain.News; public class NewsLuceneDaoImpl implements NewsLuceneDao { private NewsDao newsDao; /** 获取语法解析器 */ public Analyzer getAnalyzer() { return new StandardAnalyzer(Version.LUCENE_30); } /** 打开索引的存放目录 */ public Directory openDirectory() { try { System.out.println(new File(INDEX_DIR) + "-------打开索引--------------"); return FSDirectory.open(new File(INDEX_DIR)); } catch (IOException e) { e.printStackTrace(); } return null; } /** 对文件的指定属性映射成域,返回文件文档对象 */ public Document createForumuploadDocument(News news) { Document doc = new Document(); // 创建一个文档对象 //id 
域 Field field = new Field(FIELD_ID,String.valueOf(news.getId()),Field.Store.YES, Field.Index.NOT_ANALYZED); doc.add(field); // title域 Field field1 = new Field(FIELD_TITLE, String.valueOf(news.getTitle()),Field.Store.YES, Field.Index.ANALYZED); doc.add(field1); // content域 Field field2 = new Field(FIELD_CONTENTS, String.valueOf(news.getContents()), Field.Store.YES, Field.Index.ANALYZED); doc.add(field2); return doc; } public void deleteIndex(Integer id) { IndexReader ir = null; try { ir = IndexReader.open(this.openDirectory(), false); //打开指定目录下索引文件的索引读取器 ir.deleteDocuments(new Term(FIELD_ID,String.valueOf(id))); //删除符合条件的Document } catch (IOException e) { e.printStackTrace(); }finally{ if(ir != null){ try { ir.close(); } catch (IOException e) { e.printStackTrace(); } } } } @Override public void doIndexSingle(News news) { //创建索引写入器 IndexWriter indexWriter = null; try { indexWriter = new IndexWriter(openDirectory(), getAnalyzer(),false, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = this.createForumuploadDocument(news); indexWriter.addDocument(doc); indexWriter.optimize(); // 对索引进行优化 } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if (indexWriter != null) { indexWriter.close(); // 关闭IndexWriter,把内存中的数据写到文件 } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } @Override public LuceneSearchResult<News> doSeacher(String keyword, int pageNo,int pageSize) { LuceneSearchResult<News> lsr = new LuceneSearchResult<News>(); lsr.setPageNo(pageNo); lsr.setPageSize(pageSize); lsr.setKeyword(keyword); IndexSearcher searcher = null; try { // 创建一个索引搜索器 searcher = new IndexSearcher(this.openDirectory(), true); // 用多域查询解析器来创建一个查询器, Query query = MultiFieldQueryParser.parse(Version.LUCENE_30,keyword, new String[] { FIELD_TITLE, FIELD_CONTENTS }, new BooleanClause.Occur[] 
{BooleanClause.Occur.SHOULD,BooleanClause.Occur.SHOULD }, this.getAnalyzer()); long begin = System.currentTimeMillis(); // 查询结集信息类 TopDocs ts = searcher.search(query, null, 100000); // 获取命中的数量 lsr.setRecordCount(ts.totalHits); // 用这个进行高亮显示,默认是<b>..</b> SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style=color:red>", "</span>"); // 构造高亮:指定高亮的格式,指定查询评分 Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); // 获取匹配到的结果集 ScoreDoc[] hits = ts.scoreDocs; List<News> ais = new ArrayList<News>(); int pageCount = (lsr.getRecordCount() + pageSize - 1) / pageSize; // 总页数 int start = 0; // 要开始返回的文档编号 int end = 0; // 要结束返回的文档编号 if (pageCount > 0) { start = (pageNo - 1) * pageSize; end = start + pageSize; if (pageNo == pageCount) { // 处理最后一页的结束文档的编号 end = start + (lsr.getRecordCount() % pageSize); } } if (start < end) { lsr.setStratNo(start + 1); lsr.setEndNo(end); } for (int i = start; i < end; i++) { // 循环获取分页数据 // 通过内部编号从搜索器中得到对应的文档 Document doc = searcher.doc(hits[i].doc); News news = new News(); news.setTitle(doc.getField(FIELD_TITLE).stringValue()); news.setContents(doc.getField(FIELD_CONTENTS).stringValue()); // 处理文件名称的高亮显示问题 String title = doc.getField(FIELD_TITLE).stringValue(); String title2 = highlighter.getBestFragment(this.getAnalyzer(),FIELD_TITLE, title); if (title2 == null) { news.setTitle(title); } else { news.setTitle(title2); } // 文件描述高亮显示 String contents1 = doc.getField(FIELD_CONTENTS).stringValue(); String contents2 = highlighter.getBestFragment(this.getAnalyzer(), FIELD_CONTENTS, contents1); if (contents2 == null) { news.setContents(contents1); } else { if (contents2.length() > 512) { news.setContents(contents2.substring(0, 512) + "..."); } else { news.setContents(contents2); } } ais.add(news); // 把符合条件的数据添加到List } lsr.setTime((System.currentTimeMillis() - begin) / 1000.0); // 计算搜索耗时秒数 lsr.setDatas(ais); // 
把查询到的数据添加到LuceneSearchResult } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { e.printStackTrace(); } finally { if (searcher != null) { try { searcher.close(); // 关闭搜索器 } catch (Exception e) { e.printStackTrace(); } } } return lsr; } @Override public void rebuildAllIndex() { File file = new File(INDEX_DIR); if (file.exists()) { for (File subFile : file.listFiles()) { subFile.delete(); } } else { file.mkdirs(); } List<News> data = this.newsDao.findAll(); IndexWriter indexWriter = null; try { indexWriter = new IndexWriter(this.openDirectory(), getAnalyzer(),true, IndexWriter.MaxFieldLength.UNLIMITED); // 设置打开使用复合文件 // indexWriter.setUseCompoundFile(true); int size = data == null ? 0 : data.size(); for (int i = 0; i < size; i++) { News news = data.get(i); Document doc = createForumuploadDocument(news); indexWriter.addDocument(doc); if (i % 20 == 0) { indexWriter.commit(); } } indexWriter.optimize(); // 对索引进行优化 } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { if (indexWriter != null) { indexWriter.close();// 关闭IndexWriter,把内存中的数据写到文件 } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } @Override public void updateIndex(News news) { this.deleteIndex(news.getId()); this.doIndexSingle(news); } public NewsDao getNewsDao() { return newsDao; } public void setNewsDao(NewsDao newsDao) { this.newsDao = newsDao; } }
对查询结果进行分页处理
package com.hkrt.domain; import java.util.List; public class LuceneSearchResult<T> { private int pageNo = 1; //当前页 private int pageSize = 5; //每页显示记录数 private int recordCount; //总记录数 private double time; //耗时 private List<T> datas; //当前页的数据 private int stratNo; //开始记录数 private int endNo; //结束记录数 private String keyword; //关键字 public int getPageNo() { return pageNo; } public void setPageNo(int pageNo) { this.pageNo = pageNo; } public int getPageSize() { return pageSize; } public void setPageSize(int pageSize) { this.pageSize = pageSize; } public int getRecordCount() { return recordCount; } public void setRecordCount(int recordCount) { this.recordCount = recordCount; } public List<T> getDatas() { return datas; } public void setDatas(List<T> datas) { this.datas = datas; } public double getTime() { return time; } public void setTime(double time) { this.time = time; } public String getKeyword() { return keyword; } public void setKeyword(String keyword) { this.keyword = keyword; } public int getStratNo() { return stratNo; } public void setStratNo(int stratNo) { this.stratNo = stratNo; } public int getEndNo() { return endNo; } public void setEndNo(int endNo) { this.endNo = endNo; } }代码已经实现对news 进行建立索引和对关键字进行索引
lucene3.0.1 中需要的jar 包
建立索引:
搜索页面数据展示
<%-- Search result page: summary line plus one table row per hit. title and
     contents may contain the <span style=color:red> highlight markup produced
     by the Lucene highlighter, so they must not be HTML-escaped here.
     FIX 1: the summary <div> sat directly inside <table>, which is invalid
     HTML — moved before the table.
     FIX 2: ${request.lsr.datas} is not valid EL for a request attribute
     (the implicit object is requestScope); use ${lsr.datas}, which searches
     the scopes, exactly as the summary line already does for ${lsr.keyword}. --%>
<div class="title">搜索结果:搜索关键字【${lsr.keyword}】,共搜索到【${lsr.recordCount }】个文件,耗时:${lsr.time}秒,当前显示${lsr.stratNo}—${lsr.endNo}记录</div>
<table width="100%" height="92" border="0" cellpadding="0" cellspacing="1">
  <c:forEach items="${lsr.datas}" var="news">
    <tr>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt"> ${news.id} </td>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt"> ${news.title} </td>
      <td height="30" colspan="6" align="left" bgcolor="#f2f2f2" class="left_txt">${news.contents}</td>
    </tr>
  </c:forEach>
</table>最终搜索结果: