Lucene 实现 Word/PDF/PPT/Excel 全文检索源码
创建索引: import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.hslf.HSLFSlideShow; import org.apache.poi.hslf.model.Slide; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.usermodel.RichTextRun; import org.apache.poi.hslf.usermodel.SlideShow; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LittleEndian; /** * 创建索引 Lucene 3.0+ * @author Administrator * */ public class indexer { /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { //保存索引文件的地方 String indexDir = "data\\test\\indexDir"; //将要搜索TXT文件的地方 String dateDir = "data\\test\\dateDir"; IndexWriter indexWriter = null; //创建Directory对象 
Directory dir = new SimpleFSDirectory(new File(indexDir)); //创建IndexWriter对象, //第一个参数是Directory,第二个是分词器, //第三个表示是否是创建,如果为false为在此基础上面修改, //第四表示表示分词的最大值,比如说new MaxFieldLength(2),就表示两个字一分, //一般用IndexWriter.MaxFieldLength.LIMITED indexWriter = new IndexWriter(dir,new StandardAnalyzer(Version.LUCENE_30),true, IndexWriter.MaxFieldLength.UNLIMITED); File[] files = new File(dateDir).listFiles(); for (int i = 0; i < files.length; i++) { Document doc = null; if(files[i].getName().endsWith(".txt")){ doc = new Document(); //创建Field对象,并放入doc对象中 doc.add(new Field("contents", new FileReader(files[i]))); doc.add(new Field("filename", files[i].getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES,Field.Index.NOT_ANALYZED)); }else if(files[i].getName().endsWith(".doc")){ doc = getDocument(files[i]); }else if(files[i].getName().endsWith(".ppt")){ doc = getPPT(files[i]); }else if(files[i].getName().endsWith(".xls")){ doc = getExcel(files[i]); }else if(files[i].getName().endsWith(".pdf")){ doc = getPdf(files[i]); }else{ doc = new Document(); //创建Field对象,并放入doc对象中 doc.add(new Field("contents", new FileReader(files[i]))); doc.add(new Field("filename", files[i].getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES,Field.Index.NOT_ANALYZED)); } //写入IndexWriter if(doc!= null) indexWriter.addDocument(doc); } //查看IndexWriter里面有多少个索引 System.out.println("numDocs:"+indexWriter.numDocs()); indexWriter.close(); } public static Document getDocument(File file) throws Exception { String docPath = file.getAbsolutePath(); String title = file.getName(); // 创建Document Document document = new Document(); /*InputStream inputStream = null; Reader contents = null; try { inputStream = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } WordExtractor extractor 
= new WordExtractor(); //try{ // POIFSFileSystem fsys = new POIFSFileSystem(inputStream); // DocumentEntry headerProps = // (DocumentEntry)fsys.getRoot().getEntry("WordDocument"); // DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); // byte[] header = new byte[headerProps.getSize()]; // din.read(header); // din.close(); // int info = LittleEndian.getShort(header, 0xa); // if ((info & 0x4) != 0) // { // throw new FastSavedException("Fast-saved files are unsupported at this time"); // } // if ((info & 0x100) != 0) // { // throw new PasswordProtectedException("This document is password protected"); // } //}finally{ //} try { contents = new StringReader(extractor.extractText(inputStream)); } catch (Exception e) { e.printStackTrace(); }*/ StringBuffer contents = new StringBuffer("");// 文档内容 try { FileInputStream fs = new FileInputStream(docPath); HWPFDocument doc = new HWPFDocument(fs); Range range = doc.getRange(); int paragraphCount = range.numParagraphs();// 段落 for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据 Paragraph pp = range.getParagraph(i); contents.append(pp.text()); } } catch (Exception e) { } String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES, Field.Index.ANALYZED));//TOKENIZED //document.add(new Field("contents", contents)); document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED)); //document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED)); document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES,Field.Index.NOT_ANALYZED)); return document; } public static Document getPPT(File pptFile) throws IOException{ String docPath = pptFile.getAbsolutePath(); String title = pptFile.getName(); StringBuffer contents = new StringBuffer("");// 文档内容 InputStream is = new FileInputStream(pptFile); SlideShow ppt = new SlideShow(new HSLFSlideShow(is)); Slide[] slides = ppt.getSlides(); //提取文本信息 /*for 
(Slide each : slides) { //System.out.println("title:" + each.getTitle()) ; //System.out.println("content:") ; TextRun[] textRuns = each.getTextRuns(); for (int i=0 ;i< textRuns.length; i++ ) { //System.out.println(textRuns[i].getText()); RichTextRun[] richTextRuns = textRuns[i].getRichTextRuns(); for (int j = 0; j < richTextRuns.length; j++) { //System.out.println(richTextRuns[j].getText()); contents.append(richTextRuns[j].getText()); } } contents.append(each.getTitle()); }*/ for(int i=0;i <slides.length;i++){ TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun for(int j=0;j <t.length;j++){ contents.append(t[j].getText());//这里会将文字内容加到content中去 } //contents.append(slides[i].getTitle()); } Document document = new Document(); String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES, Field.Index.ANALYZED));//TOKENIZED //document.add(new Field("contents", contents)); document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED)); //document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED)); document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY), Field.Store.YES,Field.Index.NOT_ANALYZED)); return document; } public static Document getPdf(File pdf) { String pdfpath = pdf.getAbsolutePath(); // 创建输入流读取pdf文件 String title = pdf.getName(); String result = ""; FileInputStream is = null; PDDocument doc = null; try { is = new FileInputStream(pdf); PDFParser parser = new PDFParser(is); parser.parse(); doc = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); result = stripper.getText(doc); } catch (Exception e) { e.printStackTrace(); } finally { if (is != null) { try { is.close(); } catch (Exception e) { e.printStackTrace(); } } if (doc != null) { try { doc.close(); } catch (Exception e) { e.printStackTrace(); } } } Document document = new Document(); document.add(new Field("filename", title, Field.Store.YES, 
Field.Index.ANALYZED));//TOKENIZED document.add(new Field("contents", result, Field.Store.YES, Field.Index.ANALYZED)); //document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED)); return document; } public static Document getExcel(File fileExcel) throws Exception { InputStream is = new FileInputStream(fileExcel); StringBuffer content = new StringBuffer(); HSSFWorkbook workbook = new HSSFWorkbook(is); for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet content.append("\n"); if (null == aSheet) { continue; } for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) { content.append("\n"); HSSFRow aRow = aSheet.getRow(rowNum); if (null == aRow) { continue; } for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) { HSSFCell aCell = aRow.getCell(cellNum); if (null == aCell) { continue; } if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) { content.append(aCell.getRichStringCellValue().getString()); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) { boolean b = HSSFDateUtil.isCellDateFormatted(aCell); if (b) { Date date = aCell.getDateCellValue(); SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd"); content.append(df.format(date)); } } } } } String cont = content.toString(); Document document = new Document(); document.add(new Field("filename",fileExcel.getName(), Field.Store.YES, Field.Index.ANALYZED));//TOKENIZED document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED)); //document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED)); return document; } public static String readHtml(String urlString) { StringBuffer content = new StringBuffer(""); File file = new File(urlString); FileInputStream fis = null; try { fis = new FileInputStream(file); // 读取页面 BufferedReader reader = new BufferedReader(new InputStreamReader( fis,"utf-8"));//这里的字符编码要注意,要对上html头文件的一致,否则会出乱码 String line = 
null; while ((line = reader.readLine()) != null) { content.append(line + "\n"); } reader.close(); } catch (Exception e) { e.printStackTrace(); } String contentString = content.toString(); return contentString; } }
搜索索引
import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; /** * 搜索索引 Lucene 3.0+ * @author Administrator * */ public class searcher { public static void main(String[] args) throws IOException, ParseException { //保存索引文件的地方 String indexDir = "data\\test\\indexDir"; Directory dir = new SimpleFSDirectory(new File(indexDir)); //创建 IndexSearcher对象,相比IndexWriter对象,这个参数就要提供一个索引的目录就行了 IndexSearcher indexSearch = new IndexSearcher(dir); //创建QueryParser对象,第一个参数表示Lucene的版本,第二个表示搜索Field的字段,第三个表示搜索使用分词器 QueryParser queryParser = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30)); //生成Query对象 Query query = queryParser.parse("arcgis"); //搜索结果 TopDocs里面有scoreDocs[]数组,里面保存着索引值 TopDocs hits = indexSearch.search(query,10); //hits.totalHits表示一共搜到多少个 System.out.println("找到了"+hits.totalHits+"个"); //循环hits.scoreDocs数据,并使用indexSearch.doc方法把Document还原,再拿出对应的字段的值 for (int i = 0; i < hits.scoreDocs.length; i++) { ScoreDoc sdoc = hits.scoreDocs[i]; Document doc = indexSearch.doc(sdoc.doc); System.out.println(doc.get("filename")); } indexSearch.close(); } }