基于Lucene的文件检索Demo

通过Lucene实现了简单的文件检索功能的Demo。这个Demo支持基于文件内容的检索,支持中文分词和高亮显示。

下面简单的介绍下核心的类

1)索引相关的类

         1.FileIndexBuilder ---建立索引

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Builds a Lucene index over the files found (recursively) under a directory.
 *
 * @author chenfeic
 *
 */
public class FileIndexBuilder {

    /**
     * Files collected for indexing during the current run.
     */
    private List<File> fileList = new ArrayList<File>();

    private IndexWriter writer;

    /**
     * Indexes every accepted file under {@code fileDir} into the index stored
     * at {@code indexDir}. Failures while building the index are reported on
     * standard output and are not rethrown.
     *
     * @param fileDir
     *            directory containing the files to index
     * @param indexDir
     *            directory where the index is stored
     * @throws RuntimeException
     *             if either path is empty
     */
    public void generateIndexer(String fileDir, String indexDir) {
        if (StringUtils.isEmpty(indexDir) || StringUtils.isEmpty(fileDir)) {
            System.out.println("文件和索引路径都不能为空");
            throw new RuntimeException("文件和索引路径都不能为空");
        }
        // Start from an empty list so that reusing this builder does not
        // index the files gathered by a previous call a second time.
        fileList.clear();
        Directory d = null;
        try {
            // Set up the IndexWriter
            d = FSDirectory.open(new File(indexDir));
            initWriter(indexDir, d);
            // Create and add the index documents
            initIndex(fileDir);
            System.out.println("索引创建成功!");
        } catch (Exception e) {
            System.out.println("创建索引失败");
            System.out.println(e);
        } finally {
            FileSearchUtils.closeIndexWriter(writer);
            FileSearchUtils.closeDirectory(d);
        }
    }

    /**
     * Creates the {@link IndexWriter} on the given directory, using an
     * {@link IKAnalyzer} and {@code CREATE_OR_APPEND} open mode.
     *
     * @param indexDir
     *            index location (not used here; the already opened
     *            {@code directory} is what the writer is built on)
     * @param directory
     *            opened Lucene directory that holds the index
     * @throws IOException
     *             if the writer cannot be created
     */
    private void initWriter(String indexDir, Directory directory)
            throws IOException {
        Analyzer analyzer = new IKAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46,
                analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        writer = new IndexWriter(directory, conf);
    }

    /**
     * Collects all files under {@code fileDir} and adds one document per
     * file accepted by {@link TextFileFilter}.
     *
     * @param fileDir
     *            directory containing the files to index
     * @return the value of {@code writer.numDocs()} after the documents were
     *         added
     */
    private int initIndex(String fileDir) {
        getAllSubFile(new File(fileDir));
        TextFileFilter filter = new TextFileFilter();
        for (File file : fileList) {
            if (!filter.accept(file)) {
                continue;
            }
            try {
                DocumentBuilder db = new DocumentBuilder(file);
                Document doc = db.createDocument();
                // createDocument() returns null when the file is missing;
                // a null document must not be handed to the writer.
                if (doc != null) {
                    writer.addDocument(doc);
                }
            } catch (FileNotFoundException e) {
                System.out.println("创建索引失败,文件不存在:" + e.getMessage());
            } catch (IOException e) {
                e.printStackTrace();
            } catch (RuntimeException e) {
                // A single unreadable file should not abort the whole run;
                // report it and continue with the next file.
                e.printStackTrace();
            }
        }
        return writer.numDocs();

    }

    /**
     * Recursively appends every regular file below {@code file} to
     * {@link #fileList}. Does nothing when the listing is null or empty.
     *
     * @param file
     *            directory to walk
     */
    private void getAllSubFile(File file) {
        File[] listFiles = file.listFiles();
        if (ArrayUtils.isEmpty(listFiles)) {
            return;
        }
        for (File subfile : listFiles) {
            if (subfile.isDirectory()) {
                getAllSubFile(subfile);
            } else {
                fileList.add(subfile);
            }
        }
    }

    public static void main(String[] args) {
        String fileDir = "E:\\lucene\\data";
        String indexDir = "E:\\lucene\\index";
        FileIndexBuilder indexer = new FileIndexBuilder();
        indexer.generateIndexer(fileDir, indexDir);
    }

}

2. DocumentBuilder  --索引内容生成器

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Builds the Lucene {@link Document} for a single file, choosing a content
 * converter from the file's extension.
 *
 * @author chenfeic
 *
 */
public class DocumentBuilder {

    private File file = null;

    private IContextConverter contextConverter;

    /**
     * @param file
     *            file to index; may be null, in which case
     *            {@link #createDocument()} returns null
     */
    public DocumentBuilder(File file) {
        this.file = file;
        initConverter();
    }

    /**
     * Selects the content converter matching the file type: Word, Excel,
     * PDF, or plain text for everything else.
     */
    private void initConverter() {
        if (file == null) {
            // Without this guard file.getName() would throw before
            // createDocument() gets the chance to report the missing file.
            return;
        }
        String fileType = FileSearchUtils.getFileType(file.getName());
        // 1. word
        if ("docx".equalsIgnoreCase(fileType)
                || "doc".equalsIgnoreCase(fileType)) {
            contextConverter = new WordContextConverter();
        }
        // 2. excel
        else if ("xlsx".equalsIgnoreCase(fileType)
                || "xls".equalsIgnoreCase(fileType)) {
            contextConverter = new ExcelContextConverter();
        }
        // 3. pdf
        else if ("pdf".equalsIgnoreCase(fileType)) {
            contextConverter = new PdfContextConverter();
        }
        // 4. txt (log) and any other type
        else {
            contextConverter = new TextContextConverter();
        }
    }

    /**
     * Creates a document with three stored fields: the extracted content,
     * the file name and the canonical path.
     *
     * @return the document, or null when the file is null or does not exist.
     *         If an {@link IOException} occurs the document is returned with
     *         the fields added so far.
     */
    public Document createDocument() {
        if (file == null || !file.exists()) {
            return null;
        }
        Document doc = new Document();
        try {
            String content = contextConverter.context2String(file);
            if (content == null) {
                // A converter may return null when extraction fails, and a
                // Lucene field value must not be null. Index empty content
                // so the file can still be found by name and path.
                content = "";
            }
            doc.add(new TextField(IFileSearchConst.CONTENT_TYPE, content,
                    Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.FILENAM_TYPE, file
                    .getName(), Field.Store.YES));
            doc.add(new StringField(IFileSearchConst.PATH_TYPE, file
                    .getCanonicalPath(), Field.Store.YES));
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
}

3: IContextConverter--文件内容转换器接口,将文件内容转换为字符串

 

package uap.pub.bap.fs.search.indexer;

import java.io.File;

/**
 * Content converter: turns the content of a file into a string.
 *
 * @author chenfeic
 *
 */
public interface IContextConverter {

    /**
     * Converts the content of the given file into a string.
     *
     * @param file the file to read
     * @return the file content as a string
     */
    String context2String(File file);

}

4:AbstractContextConverter--这个类主要利用第三方开源包cpdetector获取文件编码格式

package uap.pub.bap.fs.search.indexer;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.File;

public abstract class AbstractContextConverter implements IContextConverter {


    /**
     * Detects the character encoding of a file with the third-party
     * cpdetector library.
     * 
     * @param path
     *            path of the file whose encoding should be detected
     * @return the name of the detected charset, or null when no charset was
     *         detected or detection threw an exception
     * @author huanglei
     * @version 2012-7-12 14:05
     */
    protected String getFileEncode(String path) {
        /*
         * The proxy delegates detection to the detector instances registered
         * through add(). Per the original author's notes, the first detector
         * to return a non-null result decides the charset, the library needs
         * antlr.jar, chardet.jar and cpdetector.jar, and because detection is
         * statistical the result is not guaranteed to be correct.
         * NOTE(review): getInstance() presumably returns a shared proxy, so
         * the detectors may be registered again on every call - confirm.
         */
        CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
        // For HTML/XML files or streams; false disables verbose output of
        // the detection process (per the original author's notes).
        proxy.add(new ParsingDetector(false));
        // Wraps Mozilla's JChardet; needs antlr.jar and chardet.jar.
        proxy.add(JChardetFacade.getInstance());
        // Detects ASCII.
        proxy.add(ASCIIDetector.getInstance());
        // Detects the Unicode family of encodings.
        proxy.add(UnicodeDetector.getInstance());

        File target = new File(path);
        java.nio.charset.Charset detected = null;
        try {
            detected = proxy.detectCodepage(target.toURI().toURL());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return detected == null ? null : detected.name();
    }
    
}

4.TextContextConverter 

package uap.pub.bap.fs.search.indexer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for plain-text files such as txt, configuration files
 * and logs.
 * 
 * @author chenfeic
 * 
 */
public class TextContextConverter extends AbstractContextConverter {

    /**
     * Reads the whole file as text, line by line, terminating every line
     * with {@code "\n"}. The file is decoded with the detected encoding when
     * the JVM supports it, otherwise with the platform default charset.
     *
     * @param file the text file to read
     * @return the text read so far; errors are printed and whatever was read
     *         before the failure is returned (possibly an empty string)
     */
    @Override
    public String context2String(File file) {
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        InputStream in = null;
        try {
            String encoding = getFileEncode(file.getCanonicalPath());
            in = new FileInputStream(file);
            if (isUsableCharset(encoding)) {
                reader = new BufferedReader(new InputStreamReader(in, encoding));
            } else {
                // No usable detection result: fall back to the platform
                // default instead of failing and indexing empty content.
                reader = new BufferedReader(new InputStreamReader(in));
            }
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(in);
            FileSearchUtils.closeReader(reader);
        }
        return sb.toString();
    }

    /**
     * Tells whether the detected encoding name can be used to decode text.
     *
     * @param encoding detected charset name, may be null
     * @return true only for a non-blank name that this JVM supports
     */
    private static boolean isUsableCharset(String encoding) {
        if (encoding == null || "".equals(encoding.trim())) {
            return false;
        }
        try {
            return java.nio.charset.Charset.isSupported(encoding);
        } catch (java.nio.charset.IllegalCharsetNameException e) {
            // The detector reported a name that is not even a legal charset
            // name; treat it like "nothing detected".
            return false;
        }
    }

}

下面的两个类主要是读取excel、word等office 办公软件的内容,用到的第三方插件为poi

5.WordContextConverter  

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for Word documents.
 * 
 * @author chenfeic
 * 
 */
public class WordContextConverter extends AbstractContextConverter {

    /**
     * Extracts the text of a Word file, dispatching on the file extension.
     *
     * @param file the Word file
     * @return the extracted text, or null when extraction failed
     */
    @Override
    public String context2String(File file) {
        return isWord2003(file) ? readWord2003(file) : readWord2007(file);
    }

    /**
     * Tells whether the file is in the Word 97(-2003) format, judged by its
     * extension being "doc".
     * 
     * @param file the file to check
     * @return true for a "doc" extension (case-insensitive)
     */
    private boolean isWord2003(File file) {
        return "doc".equalsIgnoreCase(FileSearchUtils.getFileType(file
                .getName()));
    }

    /**
     * Reads the text of a Word 97(-2003) file.
     * 
     * @param file the .doc file
     * @return the extracted text, or null if an I/O error occurred
     */
    private String readWord2003(File file) {
        String extracted = null;
        InputStream docStream = null;
        try {
            docStream = new FileInputStream(file);
            extracted = new WordExtractor(docStream).getText();
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException and
            // was handled in exactly the same way.
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(docStream);
        }
        return extracted;
    }

    /**
     * Reads the body text of a .docx file.
     *
     * @param file the .docx file
     * @return the extracted text, or null if extraction failed
     */
    private String readWord2007(File file) {
        String extracted = null;
        OPCPackage pkg = null;
        try {
            // Open the package and build the .docx extractor on it
            pkg = POIXMLDocument.openPackage(file.getCanonicalPath());
            extracted = new XWPFWordExtractor(pkg).getText();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XmlException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } finally {
            closePackage(pkg);
        }
        return extracted;
    }

    /**
     * Closes the package if it was opened, printing any I/O error.
     *
     * @param pkg the package to close, may be null
     */
    private void closePackage(OPCPackage pkg) {
        if (pkg == null) {
            return;
        }
        try {
            pkg.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

6.ExcelContextConverter 

package uap.pub.bap.fs.search.indexer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import uap.pub.bap.fs.search.util.FileSearchUtils;

/**
 * Content converter for Excel workbooks.
 * 
 * @author chenfeic
 * 
 */
public class ExcelContextConverter extends AbstractContextConverter {

    /**
     * Extracts the text of an Excel file, dispatching on the file extension.
     *
     * @param file the Excel file
     * @return the extracted text; for .xls files null when reading failed
     */
    @Override
    public String context2String(File file) {
        if (isExcel2003(file)) {
            return readExcel2003(file);
        } else {
            return readExcel2007(file);
        }
    }

    /**
     * Tells whether the file is in the Excel 97(-2003) format, judged by its
     * extension being "xls".
     * 
     * @param file the file to check
     * @return true for an "xls" extension (case-insensitive)
     */
    private boolean isExcel2003(File file) {
        String fileType = FileSearchUtils.getFileType(file.getName());
        return "xls".equalsIgnoreCase(fileType);
    }
    
    /**
     * Reads the text of an .xls workbook with POI's {@link ExcelExtractor},
     * emitting formulas rather than their results and omitting sheet names.
     *
     * @param file the .xls file
     * @return the extracted text, or null if an I/O error occurred
     */
    public String readExcel2003(File file) {
        InputStream inputStream = null;
        String content = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            HSSFWorkbook wb = new HSSFWorkbook(inputStream);
            ExcelExtractor extractor = new ExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            content = extractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }
        return content;
    }

    /**
     * Reads the text of an .xlsx workbook by visiting every cell of every
     * row of every sheet. Cell values are separated by a space and rows by a
     * line break so that neighbouring cells are not fused into one token.
     *
     * @param file the .xlsx file
     * @return the text collected so far; an empty string if nothing could be
     *         read
     */
    public String readExcel2007(File file) {
        StringBuilder content = new StringBuilder();
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(file.getCanonicalPath());
            XSSFWorkbook xwb = new XSSFWorkbook(inputStream);
            // Loop over the sheets
            for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
                XSSFSheet xSheet = xwb.getSheetAt(numSheet);
                if (xSheet == null) {
                    continue;
                }
                // Loop over the rows
                for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
                    XSSFRow xRow = xSheet.getRow(rowNum);
                    if (xRow == null) {
                        continue;
                    }
                    // Loop over the cells; getLastCellNum() is already the
                    // last cell index plus one, hence the strict bound.
                    for (int cellNum = 0; cellNum < xRow.getLastCellNum(); cellNum++) {
                        XSSFCell xCell = xRow.getCell(cellNum);
                        if (xCell == null) {
                            continue;
                        }
                        appendCellText(content, xCell);
                    }
                    content.append('\n');
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            FileSearchUtils.closeInputStream(inputStream);
        }

        return content.toString();
    }

    /**
     * Appends the textual value of one cell followed by a space.
     *
     * @param content buffer receiving the text
     * @param xCell   the cell to read, never null
     */
    private void appendCellText(StringBuilder content, XSSFCell xCell) {
        if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
            content.append(xCell.getBooleanCellValue());
        } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
            content.append(xCell.getNumericCellValue());
        } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_STRING) {
            content.append(xCell.getStringCellValue());
        } else {
            // Formula, error and blank cells: getStringCellValue() is only
            // valid for string-like cells and throws for the others, so use
            // the cell's own string form instead.
            content.append(xCell.toString());
        }
        content.append(' ');
    }
}

2)检索相关的类

FileSearchServiceImpl --查询关键字

package uap.pub.bap.fs.search.service;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import uap.pub.bap.fs.search.IFileSearchConst;
import uap.pub.bap.fs.search.SearchResult;
import uap.pub.bap.fs.search.util.FileSearchUtils;

public class FileSearchServiceImpl implements IFileSerachService {

    private int count = 0;

   @Override

public List<SearchResult> search(String type, String key) {
//type:查询的类型---标题,文件内容等 //key:查询关键字 List
<SearchResult> results = new ArrayList<SearchResult>(); if (StringUtils.isEmpty(key)) { return results; } // TODO chenfeic String indexDir = "E:\\lucene\\index"; IndexReader reader = null; Directory directory = null; try { directory = FSDirectory.open(new File(indexDir)); reader = DirectoryReader.open(directory); IndexSearcher search = new IndexSearcher(reader); // 使用QueryParser查询分析器构造Query对象 Analyzer analyzer = new IKAnalyzer(); QueryParser qp = new QueryParser(Version.LUCENE_46, type, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(key); // 一个简单的指针容器,指向前N个排名的搜索结果 TopDocs hits = search.search(query, null, 100); count = hits.totalHits; for (ScoreDoc soreDoc : hits.scoreDocs) { Document doc = search.doc(soreDoc.doc); String summary = toHighlighter(query, doc, IFileSearchConst.CONTENT_TYPE, analyzer); String title = doc.get(IFileSearchConst.FILENAM_TYPE); String path = doc.get(IFileSearchConst.PATH_TYPE); SearchResult result = new SearchResult(); result.setPath(path); result.setTitle(title); if (!StringUtils.isEmpty(summary)) { result.setSummary(summary); } results.add(result); } } catch (IndexNotFoundException e1) { System.out.println("无查询结果,没有此词条的索引"); }catch (IOException e) { System.out.println("无查询结果!"); e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } finally { FileSearchUtils.closeIndexReader(reader); FileSearchUtils.closeDirectory(directory); } return results; } /** * 高亮显示 * * @param query * @param doc * @param field * @return */ private String toHighlighter(Query query, Document doc, String field, Analyzer analyzer) { try { SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( "<font color=\"red\">", "</font>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); // highlighter.setTextFragmenter(new // SimpleFragmenter(20));//显示20个字符,默认是100个 TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(doc.get(field))); String highlighterStr = 
highlighter.getBestFragment(tokenStream, doc.get(field)); return highlighterStr == null ? doc.get(field) : highlighterStr; } catch (IOException e) { e.printStackTrace(); } catch (InvalidTokenOffsetsException e) { e.printStackTrace(); } return null; } @Override public int getCount() { return this.count; } }

 

 上述基本上就是此Demo的核心类,其他的一些工具类和jsp、servlet处理类就没多写了

  基本上用的都是第三方的开源工具,比如支持中文分词的IK_Analyzer(版本IK Analyzer 2012FF_hf1)(注:开始的时候想用paoding作为中文的分词器,用了之后发现paoding不支持Lucene4.0版本,估计3.X就已经不支持了,原因是坑爹的Lucene总是变化它的实现和结构,使得有些方法变为final,而paoding中重写了这些方法,导致编译出错)。为了支持基于文件内容的检索,需要对文件内容进行索引并保存,所以用到了poi来处理ms office文档,对于txt等文本则在读取时通过cpdetector检测文件的编码格式。代码都可以参照上面。代码有些是自己写的,有些是参照网上其他同仁的,在此一并谢过

相关jar包列表为 

 

posted @ 2013-12-23 22:38  chenfei0801  阅读(893)  评论(2编辑  收藏  举报