Lucene 实现 Word、PDF 全文检索源码

创建索引:
  
import java.io.BufferedReader;
import java.io.File;   
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;   
import java.io.IOException;   
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date;   
  
import org.apache.lucene.analysis.standard.StandardAnalyzer;   
import org.apache.lucene.document.DateTools;   
import org.apache.lucene.document.Document;   
import org.apache.lucene.document.Field;   
import org.apache.lucene.index.IndexWriter;   
import org.apache.lucene.store.Directory;   
import org.apache.lucene.store.SimpleFSDirectory;   
import org.apache.lucene.util.Version;   
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
 * Builds a Lucene (3.0+) full-text index over the files of a data directory.
 *
 * <p>Word (.doc), PowerPoint (.ppt), Excel (.xls) and PDF files have their text
 * extracted with POI / PDFBox before indexing; every other file is indexed as
 * plain text through a {@link FileReader}.
 *
 * @author Administrator
 */
public class indexer {

    /**
     * Indexes every regular file found directly inside the data directory.
     *
     * @param args unused
     * @throws Exception if the data directory cannot be listed, a file cannot be
     *                   read, or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        // Directory holding the files to be indexed.
        String dateDir = "data\\test\\dateDir";

        // listFiles() returns null when the path is missing or is not a directory.
        // Check this before opening the writer so a bad path never touches the index.
        File[] files = new File(dateDir).listFiles();
        if (files == null) {
            throw new FileNotFoundException(
                    "Data directory does not exist or cannot be listed: " + dateDir);
        }

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Arguments: the Directory, the analyzer, create=true (build a new index
            // instead of appending to an existing one), and the maximum number of
            // terms indexed per field (UNLIMITED indexes every term).
            IndexWriter indexWriter = new IndexWriter(dir,
                    new StandardAnalyzer(Version.LUCENE_30), true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            boolean success = false;
            try {
                for (File file : files) {
                    // Sub-directories cannot be opened as files; skip them.
                    if (!file.isFile()) {
                        continue;
                    }
                    Document doc = createDocument(file);
                    if (doc != null) {
                        indexWriter.addDocument(doc);
                    }
                }
                // Report how many documents the index now holds.
                System.out.println("numDocs:" + indexWriter.numDocs());
                success = true;
            } finally {
                if (success) {
                    indexWriter.close();
                } else {
                    // Discard the half-built index rather than committing it.
                    indexWriter.rollback();
                }
            }
        } finally {
            dir.close();
        }
    }

    /** Picks the extractor matching the file extension (case-insensitive). */
    private static Document createDocument(File file) throws Exception {
        String name = file.getName();
        if (hasExtension(name, ".doc")) {
            return getDocument(file);
        }
        if (hasExtension(name, ".ppt")) {
            return getPPT(file);
        }
        if (hasExtension(name, ".xls")) {
            return getExcel(file);
        }
        if (hasExtension(name, ".pdf")) {
            return getPdf(file);
        }
        // .txt and every unrecognised extension are indexed as plain text.
        return getPlainText(file);
    }

    /** Returns true when {@code fileName} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String fileName, String extension) {
        int offset = fileName.length() - extension.length();
        return offset >= 0
                && fileName.regionMatches(true, offset, extension, 0, extension.length());
    }

    /**
     * Builds the document for a plain-text file. The contents field is backed by a
     * Reader, so the reader must stay open until the document has been added to the
     * index. The file is decoded with the platform default charset.
     */
    private static Document getPlainText(File file) throws FileNotFoundException {
        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(file)));
        doc.add(new Field("filename", file.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(indexDateField());
        return doc;
    }

    /** Creates the stored, non-analyzed "indexDate" field holding today's date. */
    private static Field indexDateField() {
        return new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED);
    }

    /** Builds the document shared by all extractors: filename, contents, indexDate. */
    private static Document buildDocument(String fileName, String contents) {
        Document document = new Document();
        document.add(new Field("filename", fileName, Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("contents", contents, Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(indexDateField());
        return document;
    }

    /** Closes a resource, reporting (but not propagating) any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text of a Word (.doc) file and wraps it in a Lucene document.
     *
     * <p>Extraction is best-effort: if the file cannot be parsed the failure is
     * reported on stderr and the document is returned with empty contents.
     *
     * @param file the Word file to read
     * @return a document with "filename", "contents" and "indexDate" fields
     * @throws Exception declared for interface compatibility
     */
    public static Document getDocument(File file) throws Exception {
        StringBuilder contents = new StringBuilder();
        FileInputStream fs = null;
        try {
            fs = new FileInputStream(file);
            HWPFDocument doc = new HWPFDocument(fs);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            // Concatenate the text of every paragraph.
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                contents.append(paragraph.text());
            }
        } catch (Exception e) {
            // Keep indexing the file name, but do not hide the failure.
            System.err.println("Failed to extract text from " + file.getAbsolutePath());
            e.printStackTrace();
        } finally {
            closeQuietly(fs);
        }
        return buildDocument(file.getName(), contents.toString().trim());
    }

    /**
     * Extracts the text of a PowerPoint (.ppt) file and wraps it in a Lucene document.
     *
     * @param pptFile the PowerPoint file to read
     * @return a document with "filename", "contents" and "indexDate" fields
     * @throws IOException if the file cannot be read or parsed
     */
    public static Document getPPT(File pptFile) throws IOException {
        StringBuilder contents = new StringBuilder();
        InputStream is = new FileInputStream(pptFile);
        try {
            SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ppt.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] textRuns = slides[i].getTextRuns();
                for (int j = 0; j < textRuns.length; j++) {
                    // Separate runs so the last word of one run and the first word
                    // of the next are not fused into a single token.
                    contents.append(textRuns[j].getText()).append('\n');
                }
            }
        } finally {
            closeQuietly(is);
        }
        return buildDocument(pptFile.getName(), contents.toString().trim());
    }

    /**
     * Extracts the text of a PDF file and wraps it in a Lucene document.
     *
     * <p>Extraction is best-effort: if the file cannot be parsed the stack trace is
     * printed and the document is returned with empty contents.
     *
     * @param pdf the PDF file to read
     * @return a document with "filename", "contents" and "indexDate" fields
     */
    public static Document getPdf(File pdf) {
        String result = "";
        FileInputStream is = null;
        PDDocument doc = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return buildDocument(pdf.getName(), result);
    }

    /**
     * Extracts the text of an Excel (.xls) workbook and wraps it in a Lucene document.
     *
     * <p>String cells and date-formatted numeric cells (as yyyy-MM-dd) are indexed;
     * other cell types, including plain numbers, are skipped.
     *
     * @param fileExcel the Excel file to read
     * @return a document with "filename", "contents" and "indexDate" fields
     * @throws Exception if the file cannot be read or parsed
     */
    public static Document getExcel(File fileExcel) throws Exception {
        StringBuilder content = new StringBuilder();
        // One formatter for the whole workbook instead of one per date cell.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        InputStream is = new FileInputStream(fileExcel);
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(is);
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                HSSFSheet aSheet = workbook.getSheetAt(numSheets);
                content.append("\n");
                if (null == aSheet) {
                    continue;
                }
                for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                    content.append("\n");
                    HSSFRow aRow = aSheet.getRow(rowNum);
                    if (null == aRow) {
                        continue;
                    }
                    // The inclusive bound may step one past the last cell; the null
                    // check below makes that harmless.
                    for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {
                        HSSFCell aCell = aRow.getCell(cellNum);
                        if (null == aCell) {
                            continue;
                        }
                        if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                            // Trailing space keeps adjacent cells from fusing into one token.
                            content.append(aCell.getRichStringCellValue().getString()).append(' ');
                        } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                            if (HSSFDateUtil.isCellDateFormatted(aCell)) {
                                Date date = aCell.getDateCellValue();
                                content.append(dateFormat.format(date)).append(' ');
                            }
                        }
                    }
                }
            }
        } finally {
            closeQuietly(is);
        }
        return buildDocument(fileExcel.getName(), content.toString());
    }

    /**
     * Reads a local HTML file as UTF-8 text, normalising line endings to "\n".
     *
     * <p>The charset must match the one declared in the HTML header, otherwise the
     * text will be garbled. On a read failure the stack trace is printed and
     * whatever was read so far is returned.
     *
     * @param urlString path of the file to read
     * @return the file contents, each line terminated by "\n"
     */
    public static String readHtml(String urlString) {
        StringBuilder content = new StringBuilder();
        FileInputStream fis = null;
        BufferedReader reader = null;
        try {
            fis = new FileInputStream(new File(urlString));
            reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing the reader also closes fis; fis is closed separately in case
            // the reader was never created.
            closeQuietly(reader);
            closeQuietly(fis);
        }
        return content.toString();
    }
}

搜索索引:

   
  
import java.io.File;   
import java.io.IOException;   
  
import org.apache.lucene.analysis.standard.StandardAnalyzer;   
import org.apache.lucene.document.Document;   
import org.apache.lucene.queryParser.ParseException;   
import org.apache.lucene.queryParser.QueryParser;   
import org.apache.lucene.search.IndexSearcher;   
import org.apache.lucene.search.Query;   
import org.apache.lucene.search.ScoreDoc;   
import org.apache.lucene.search.TopDocs;   
import org.apache.lucene.store.Directory;   
import org.apache.lucene.store.SimpleFSDirectory;   
import org.apache.lucene.util.Version;   
/**
 * Searches the Lucene (3.0+) index built by the indexer and prints the file
 * names of the matching documents.
 *
 * @author Administrator
 */
public class searcher {

    /**
     * Runs one query against the "contents" field and prints up to 10 hits.
     *
     * @param args optional; {@code args[0]} is the query text, defaulting to "arcgis"
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query text cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        // Use the first command-line argument as the query when one is given.
        String queryText = (args != null && args.length > 0) ? args[0] : "arcgis";

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Unlike IndexWriter, IndexSearcher only needs the index directory.
            IndexSearcher indexSearch = new IndexSearcher(dir);
            try {
                // Arguments: Lucene version, the default field to search, and the
                // analyzer applied to the query text.
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        "contents", new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse(queryText);
                // TopDocs carries the total hit count and the ids of the top hits.
                TopDocs hits = indexSearch.search(query, 10);
                System.out.println("找到了" + hits.totalHits + "个");
                // Load each hit's stored document and print its file name.
                for (int i = 0; i < hits.scoreDocs.length; i++) {
                    ScoreDoc sdoc = hits.scoreDocs[i];
                    Document doc = indexSearch.doc(sdoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the searcher even when parsing or searching fails.
                indexSearch.close();
            }
        } finally {
            dir.close();
        }
    }
}

  

posted @ 2015-08-25 15:15  zzlp  阅读(1365)  评论(0编辑  收藏  举报