A File Search Demo Based on Lucene
This demo implements simple file search on top of Lucene. It supports retrieval based on file content, Chinese word segmentation, and highlighted results.
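Before walking through the classes, here is a minimal end-to-end usage sketch. FileIndexBuilder and FileSearchServiceImpl are shown in full below; the paths are examples, and the getTitle()/getPath() getters on SearchResult are assumed counterparts of the setters used in the search service.

import java.util.List;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.SearchResult;
import uap.pub.bap.fs.search.indexer.FileIndexBuilder;
import uap.pub.bap.fs.search.service.FileSearchServiceImpl;
import uap.pub.bap.fs.search.service.IFileSerachService;

public class FileSearchDemo {

    public static void main(String[] args) {
        // 1. Index all files under the data directory (example paths)
        new FileIndexBuilder().generateIndexer("E:\\lucene\\data", "E:\\lucene\\index");

        // 2. Search the indexed file content for a keyword
        IFileSerachService service = new FileSearchServiceImpl();
        List<SearchResult> results = service.search(IFileSearchConst.CONTENT_TYPE, "Lucene");
        System.out.println("hits: " + service.getCount());
        for (SearchResult result : results) {
            // getTitle()/getPath() are assumed getters matching the setters
            // called in FileSearchServiceImpl
            System.out.println(result.getTitle() + " -> " + result.getPath());
        }
    }
}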
A brief introduction to the core classes follows.
1) Indexing classes
1. FileIndexBuilder -- builds the index
package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * File index builder.
 *
 * @author chenfeic
 */
public class FileIndexBuilder {

    /**
     * Files to be indexed.
     */
    private List<File> fileList = new ArrayList<File>();

    private IndexWriter writer;

    /**
     * @param fileDir  location of the source files
     * @param indexDir location of the index
     */
    public void generateIndexer(String fileDir, String indexDir) {
        if (StringUtils.isEmpty(indexDir) || StringUtils.isEmpty(fileDir)) {
            System.out.println("Neither the file path nor the index path may be empty");
            throw new RuntimeException("Neither the file path nor the index path may be empty");
        }
        Directory d = null;
        try {
            // Initialize the IndexWriter
            d = FSDirectory.open(new File(indexDir));
            initWriter(indexDir, d);
            // Create the index documents
            initIndex(fileDir);
            System.out.println("Index created successfully!");
        } catch (Exception e) {
            System.out.println("Failed to create the index");
            System.out.println(e);
        } finally {
            FileSearchUtils.closeIndexWriter(writer);
            FileSearchUtils.closeDirectory(d);
        }
    }

    /**
     * Initialize the Lucene IndexWriter.
     * Step 1: create the Directory where the index is stored.
     * Step 2: create the Analyzer.
     * Step 3: create the IndexWriterConfig using that Analyzer.
     * Step 4: create the IndexWriter.
     */
    private void initWriter(String indexDir, Directory directory) throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        writer = new IndexWriter(directory, conf);
    }

    /**
     * Build a Document for every accepted file and add it to the index.
     *
     * @param fileDir document directory
     * @return number of documents in the index
     */
    private int initIndex(String fileDir) {
        getAllSubFile(new File(fileDir));
        TextFileFilter filter = new TextFileFilter();
        for (File file : fileList) {
            if (filter.accept(file)) {
                try {
                    DocumentBuilder db = new DocumentBuilder(file);
                    Document doc = db.createDocument();
                    if (doc != null) { // createDocument() returns null for missing files
                        writer.addDocument(doc);
                    }
                } catch (FileNotFoundException e) {
                    System.out.println("Failed to index, file does not exist: " + e.getMessage());
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return writer.numDocs();
    }

    /**
     * Recursively collect all files under the given directory.
     */
    private void getAllSubFile(File file) {
        File[] listFiles = file.listFiles();
        if (ArrayUtils.isEmpty(listFiles)) {
            return;
        }
        for (File subfile : listFiles) {
            if (subfile.isDirectory()) {
                getAllSubFile(subfile);
            } else {
                fileList.add(subfile);
            }
        }
    }

    public static void main(String[] args) {
        String fileDir = "E:\\lucene\\data";
        String indexDir = "E:\\lucene\\index";
        FileIndexBuilder indexer = new FileIndexBuilder();
        indexer.generateIndexer(fileDir, indexDir);
    }
}
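FileIndexBuilder filters the collected files through a TextFileFilter that is not reproduced in this article. A plausible minimal sketch, assuming it implements java.io.FileFilter and simply whitelists the extensions the converters below can handle (the extension list is an assumption):

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileFilter;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Hypothetical reconstruction: accepts only file types the demo can index.
 */
public class TextFileFilter implements FileFilter {

    // Extensions handled by the converters below (assumed list)
    private static final String[] ACCEPTED = {
            "txt", "log", "doc", "docx", "xls", "xlsx", "pdf" };

    @Override
    public boolean accept(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        for (String ext : ACCEPTED) {
            if (ext.equalsIgnoreCase(fileType)) {
                return true;
            }
        }
        return false;
    }
}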
2. DocumentBuilder -- builds the indexed Document content
package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Document builder.
 *
 * @author chenfeic
 */
public class DocumentBuilder {

    private File file = null;

    private IContextConverter contextConverter;

    public DocumentBuilder(File file) {
        this.file = file;
        initConverter();
    }

    /**
     * Choose a content converter based on the file extension.
     */
    private void initConverter() {
        String fileType = FileSearchUtils.getFileType(file.getName());
        // 1. word
        if ("docx".equalsIgnoreCase(fileType) || "doc".equalsIgnoreCase(fileType)) {
            contextConverter = new WordContextConverter();
        }
        // 2. excel
        else if ("xlsx".equalsIgnoreCase(fileType) || "xls".equalsIgnoreCase(fileType)) {
            contextConverter = new ExcelContextConverter();
        }
        // 3. pdf
        else if ("pdf".equalsIgnoreCase(fileType)) {
            contextConverter = new PdfContextConverter();
        }
        // 4. txt (log)
        else {
            contextConverter = new TextContextConverter();
        }
    }

    public Document createDocument() {
        if (file == null || !file.exists()) {
            return null;
        }
        Document doc = new Document();
        try {
            doc.add(new TextField(IFileSearchConst.CONTENT_TYPE,
                    contextConverter.context2String(file), Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.FILENAM_TYPE,
                    file.getName(), Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.PATH_TYPE,
                    file.getCanonicalPath(), Field.Store.YES));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
}
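DocumentBuilder stores each file in three fields whose names come from the IFileSearchConst constants. That interface is not reproduced in the article; a minimal sketch in which the constant names are real but the field-name values are assumptions:

package uap.pub.bap.fs.search;

/**
 * Hypothetical reconstruction of the index field-name constants used above;
 * the values are assumed, only the constant names appear in the code.
 */
public interface IFileSearchConst {
    String CONTENT_TYPE = "content";  // field holding the file content
    String FILENAM_TYPE = "filename"; // field holding the file name
    String PATH_TYPE = "path";        // field holding the canonical path
}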
3. IContextConverter -- the content converter interface, which turns file content into a string
package uap.pub.bap.fs.search.indexer;

import java.io.File;

/**
 * Content converter: turns file content into a string.
 *
 * @author chenfeic
 */
public interface IContextConverter {

    /**
     * Convert the file content into a string.
     *
     * @param file the file to convert
     */
    public String context2String(File file);
}
4. AbstractContextConverter -- uses the third-party open-source library cpdetector to detect a file's character encoding
package uap.pub.bap.fs.search.indexer;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.File;

public abstract class AbstractContextConverter implements IContextConverter {

    /**
     * Detect a file's character encoding with the third-party
     * open-source library cpdetector.
     *
     * @param path path of the file whose encoding should be detected
     * @author huanglei
     * @version 2012-7-12 14:05
     */
    protected String getFileEncode(String path) {
        /*
         * The detector proxy delegates detection to concrete detector
         * implementations. cpDetector ships with several common ones that
         * are registered via add(), e.g. ParsingDetector, JChardetFacade,
         * ASCIIDetector and UnicodeDetector. The proxy follows a
         * "first non-null result wins" rule when returning the detected
         * charset. Three third-party JARs are required: antlr.jar,
         * chardet.jar and cpdetector.jar. cpDetector is based on
         * statistics, so correctness is not guaranteed in every case.
         */
        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        /*
         * ParsingDetector detects the encoding of HTML, XML and similar
         * files or streams; the constructor argument controls whether
         * detection details are printed (false = no output).
         */
        detector.add(new ParsingDetector(false));
        /*
         * JChardetFacade wraps JChardet, provided by Mozilla, which can
         * determine the encoding of most files. This detector alone
         * satisfies most projects; for extra safety, add more detectors
         * such as ASCIIDetector and UnicodeDetector below.
         */
        detector.add(JChardetFacade.getInstance()); // requires antlr.jar and chardet.jar
        // ASCIIDetector detects ASCII encodings
        detector.add(ASCIIDetector.getInstance());
        // UnicodeDetector detects the Unicode family of encodings
        detector.add(UnicodeDetector.getInstance());

        java.nio.charset.Charset charset = null;
        File f = new File(path);
        try {
            charset = detector.detectCodepage(f.toURI().toURL());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        if (charset != null) {
            return charset.name();
        } else {
            return null;
        }
    }
}
5. TextContextConverter
package uap.pub.bap.fs.search.indexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for plain-text files such as txt, config and log files.
 *
 * @author chenfeic
 */
public class TextContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        InputStream in = null;
        try {
            String encoding = getFileEncode(file.getCanonicalPath());
            in = new FileInputStream(file);
            if (encoding != null && !"".equals(encoding.trim())) {
                reader = new BufferedReader(new InputStreamReader(in, encoding));
            } else {
                reader = new BufferedReader(new InputStreamReader(in));
            }
            // Append the file content line by line
            String line = "";
            while ((line = reader.readLine()) != null) {
                sb.append(line + "\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(in);
            FileSearchUtils.closeReader(reader);
        }
        return sb.toString();
    }
}
The next two classes read the content of MS Office documents such as Word and Excel files; the third-party library used is poi.
6. WordContextConverter
package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Word document content converter.
 *
 * @author chenfeic
 */
public class WordContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        if (isWord2003(file)) {
            return readWord2003(file);
        } else {
            return readWord2007(file);
        }
    }

    /**
     * Is this a Word 97(-2003) .doc file?
     */
    private boolean isWord2003(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        return "doc".equalsIgnoreCase(fileType);
    }

    /**
     * Read the content of a Word 97(-2003) file.
     */
    private String readWord2003(File file) {
        InputStream inputStream = null;
        String context = null;
        try {
            inputStream = new FileInputStream(file);
            WordExtractor extractor = new WordExtractor(inputStream);
            context = extractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return context;
    }

    /**
     * Read the content of a Word 2007 (.docx) file.
     */
    private String readWord2007(File file) {
        String text = null;
        OPCPackage openPackage = null;
        try {
            // Get the .docx extractor
            openPackage = POIXMLDocument.openPackage(file.getCanonicalPath());
            XWPFWordExtractor docx = new XWPFWordExtractor(openPackage);
            // Extract the .docx body text
            text = docx.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XmlException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } finally {
            if (openPackage != null) {
                try {
                    openPackage.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return text;
    }
}
7. ExcelContextConverter
package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Excel content converter.
 *
 * @author chenfeic
 */
public class ExcelContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        if (isExcel2003(file)) {
            return readExcel2003(file);
        } else {
            return readExcel2007(file);
        }
    }

    /**
     * Is this an Excel 97(-2003) .xls file?
     */
    private boolean isExcel2003(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        return "xls".equalsIgnoreCase(fileType);
    }

    public String readExcel2003(File file) {
        InputStream inputStream = null;
        String content = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            HSSFWorkbook wb = new HSSFWorkbook(inputStream);
            ExcelExtractor extractor = new ExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            content = extractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return content;
    }

    public String readExcel2007(File file) {
        StringBuffer content = new StringBuffer();
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            XSSFWorkbook xwb = new XSSFWorkbook(inputStream);
            // Iterate over the sheets
            for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
                XSSFSheet xSheet = xwb.getSheetAt(numSheet);
                if (xSheet == null) {
                    continue;
                }
                // Iterate over the rows
                for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                    XSSFRow xRow = xSheet.getRow(rowNum);
                    if (xRow == null) {
                        continue;
                    }
                    // Iterate over the cells; getLastCellNum() returns the
                    // index of the last cell plus one, so use "<" here
                    for (int cellNum = 0; cellNum < xRow.getLastCellNum(); cellNum++) {
                        XSSFCell xCell = xRow.getCell(cellNum);
                        if (xCell == null) {
                            continue;
                        }
                        if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
                            content.append(xCell.getBooleanCellValue());
                        } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
                            content.append(xCell.getNumericCellValue());
                        } else {
                            content.append(xCell.getStringCellValue());
                        }
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return content.toString();
    }
}
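DocumentBuilder also references a PdfContextConverter, which the article does not reproduce, and the demo's PDF library is not named. The following sketch is therefore an assumption built on Apache PDFBox 1.x (pdfbox would then also need to be on the classpath):

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Hypothetical sketch of the pdf converter referenced by DocumentBuilder,
 * assuming Apache PDFBox 1.x as the extraction library.
 */
public class PdfContextConverter extends AbstractContextConverter {

    @Override
    public String context2String(File file) {
        PDDocument document = null;
        try {
            document = PDDocument.load(file);
            // Extract all text from the PDF
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(document);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}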
2) Search classes
FileSearchServiceImpl -- keyword search
package uap.pub.bap.fs.search.service;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.SearchResult;
import uap.pub.bap.fs.search.util.FileSearchUtils;

public class FileSearchServiceImpl implements IFileSerachService {

    private int count = 0;

    /**
     * @param type the field to query, e.g. title or file content
     * @param key  the search keyword
     */
    @Override
    public List<SearchResult> search(String type, String key) {
        List<SearchResult> results = new ArrayList<SearchResult>();
        if (StringUtils.isEmpty(key)) {
            return results;
        }
        // TODO chenfeic
        String indexDir = "E:\\lucene\\index";
        IndexReader reader = null;
        Directory directory = null;
        try {
            directory = FSDirectory.open(new File(indexDir));
            reader = DirectoryReader.open(directory);
            IndexSearcher search = new IndexSearcher(reader);

            // Build the Query with the QueryParser
            Analyzer analyzer = new IKAnalyzer();
            QueryParser qp = new QueryParser(Version.LUCENE_46, type, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(key);

            // A simple container of pointers to the top N ranked hits
            TopDocs hits = search.search(query, null, 100);
            count = hits.totalHits;
            for (ScoreDoc soreDoc : hits.scoreDocs) {
                Document doc = search.doc(soreDoc.doc);
                String summary = toHighlighter(query, doc,
                        IFileSearchConst.CONTENT_TYPE, analyzer);
                String title = doc.get(IFileSearchConst.FILENAM_TYPE);
                String path = doc.get(IFileSearchConst.PATH_TYPE);
                SearchResult result = new SearchResult();
                result.setPath(path);
                result.setTitle(title);
                if (!StringUtils.isEmpty(summary)) {
                    result.setSummary(summary);
                }
                results.add(result);
            }
        } catch (IndexNotFoundException e1) {
            System.out.println("No results: no index exists for this term");
        } catch (IOException e) {
            System.out.println("No results!");
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeIndexReader(reader);
            FileSearchUtils.closeDirectory(directory);
        }
        return results;
    }

    /**
     * Highlight the query terms in the given field.
     */
    private String toHighlighter(Query query, Document doc, String field,
            Analyzer analyzer) {
        try {
            SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter(
                    "<font color=\"red\">", "</font>");
            Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                    new QueryScorer(query));
            // highlighter.setTextFragmenter(new SimpleFragmenter(20));
            // shows fragments of 20 characters; the default is 100
            TokenStream tokenStream = analyzer.tokenStream(field,
                    new StringReader(doc.get(field)));
            String highlighterStr = highlighter.getBestFragment(tokenStream,
                    doc.get(field));
            return highlighterStr == null ? doc.get(field) : highlighterStr;
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
        return null;
    }

    @Override
    public int getCount() {
        return this.count;
    }
}
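FileSearchServiceImpl implements an IFileSerachService interface and fills SearchResult beans, neither of which is reproduced in the article. Minimal sketches follow, with signatures taken from the calls above; the getters on SearchResult are assumed counterparts of its setters.

package uap.pub.bap.fs.search.service;

import java.util.List;

import uap.pub.bap.fs.search.SearchResult;

/**
 * Hypothetical reconstruction of the service interface implemented above;
 * the method signatures are taken from FileSearchServiceImpl.
 */
public interface IFileSerachService {

    /** Search the given field for the given keyword. */
    List<SearchResult> search(String type, String key);

    /** Total number of hits of the last search. */
    int getCount();
}

package uap.pub.bap.fs.search;

/**
 * Hypothetical reconstruction of the result bean; the setters are those
 * called in FileSearchServiceImpl, the getters are assumed counterparts.
 */
public class SearchResult {

    private String title;   // file name
    private String path;    // canonical file path
    private String summary; // highlighted content fragment

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getPath() { return path; }
    public void setPath(String path) { this.path = path; }
    public String getSummary() { return summary; }
    public void setSummary(String summary) { this.summary = summary; }
}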
The classes above are essentially the core of this demo; the remaining utility classes and the jsp/servlet handler classes are not reproduced here.
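Of the omitted utilities, FileSearchUtils is referenced throughout the code above. A minimal sketch, assuming it only provides extension extraction and null-safe close helpers (the method names are the ones actually called; the bodies are assumptions):

package uap.pub.bap.fs.search.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

/**
 * Hypothetical sketch of the utility class used throughout the demo.
 */
public class FileSearchUtils {

    /** Extension of the file name (the part after the last dot), or "" if none. */
    public static String getFileType(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? "" : fileName.substring(dot + 1);
    }

    public static void closeIndexWriter(IndexWriter writer) {
        if (writer != null) {
            try { writer.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }

    public static void closeIndexReader(IndexReader reader) {
        if (reader != null) {
            try { reader.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }

    public static void closeDirectory(Directory directory) {
        if (directory != null) {
            try { directory.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }

    public static void closeInputStream(InputStream in) {
        if (in != null) {
            try { in.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }

    public static void closeReader(Reader reader) {
        if (reader != null) {
            try { reader.close(); } catch (IOException e) { e.printStackTrace(); }
        }
    }
}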
Almost everything here builds on third-party open-source tools, such as IK Analyzer for Chinese word segmentation (version IK Analyzer 2012FF_hf1). (Note: the original plan was to use paoding as the Chinese tokenizer, but it turned out that paoding does not support Lucene 4.0, and presumably already stopped working on the 3.x line. The reason is that Lucene keeps changing its implementation and structure; some methods that paoding overrides were made final, so paoding no longer compiles against it.) Because the search is based on file content, that content has to be extracted and indexed: poi handles the MS Office formats, and for txt and similar files cpdetector detects the encoding before reading. All of this can be seen in the code above. Some of the code is my own and some was adapted from fellow developers online; my thanks to them all.
The related jar packages are listed below.
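Judging from the imports in the code, the classpath needs at least the following (exact versions are assumptions consistent with Version.LUCENE_46):
- lucene-core, lucene-queryparser and lucene-highlighter (4.6.x), plus the modules they pull in such as lucene-memory and lucene-queries
- IKAnalyzer2012FF_hf1.jar
- poi, poi-scratchpad (for HWPF/.doc) and poi-ooxml (for XWPF/XSSF), plus xmlbeans
- cpdetector.jar, antlr.jar and chardet.jar
- commons-lang.jar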