lucene索引word/pdf/html/txt文件及检索(搜索引擎)

 

2009-07-02 15:31

因为lucene索引的时候是将String型的信息建立索引的,所以这里必须先将word/pdf/html等文件的内容转化为字符型。
lucene的jar包自己去下载。
首先是建立索引的代码:

public class TextFileIndexer {   
public static void main(String[] args) throws Exception {   
/* 指明要索引文件夹的位置,这里是d盘的s文件夹下 */
         File fileDir = new File("d:\\s");   
/* 这里放索引文件的位置 */
         File indexDir = new File("d:\\index");   
         Analyzer luceneAnalyzer = new StandardAnalyzer();   
         IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,   
true);   
         File[] textFiles = fileDir.listFiles();   
long startTime = new Date().getTime();   
//增加document到索引去
                 System.out.println("File正在被索引clip_image001.");  
/*
                  * 注意要变的就是这里,路径和读取文件的方法
                  * */
                 String path ="d:\\s\\2.doc";
                 String temp = ReadFile.readWord(path);
//                 String path ="d:\\s\\index.htm";
//                 String temp = ReadFile.readHtml(path);
                 Document document = new Document();   
                 Field FieldPath = new Field("path",path,
                         Field.Store.YES, Field.Index.NO);   
                 Field FieldBody = new Field("body", temp, Field.Store.YES,   
                         Field.Index.TOKENIZED,   
                         Field.TermVector.WITH_POSITIONS_OFFSETS);   
                 document.add(FieldPath);   
                 document.add(FieldBody);   
                 indexWriter.addDocument(document);   
//optimize()方法是对索引进行优化
         indexWriter.optimize();   
         indexWriter.close();   
//测试一下索引的时间
long endTime = new Date().getTime();   
         System.out   
                 .println("这花费了"  
                        + (endTime - startTime)   
                        + " 毫秒来把文档增加到索引里面去!"  
                        + fileDir.getPath());   
     }  
}

上面已经注释了要换的地方,我们要做的就是换文件的路径和读取文件的方法。
下面来具体看下读取文件的方法
1.首先来看WORD文档:
我这里用的是poi,相关jar包自己去下载,然后加到工程中(以下所要用的jar包也是,不再重复说)。
来看相关代码:

public static String readWord(String path) {
         StringBuffer content = new StringBuffer("");// 文档内容
try {
             HWPFDocument doc = new HWPFDocument(new FileInputStream(path));
             Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// 段落
for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
                 Paragraph pp = range.getParagraph(i);
                 content.append(pp.text());
             }
         } catch (Exception e) {
         }
return content.toString().trim();
     }

2.PDF文件用的是PDFbox:

public static String readPdf(String path) throws Exception {
         StringBuffer content = new StringBuffer("");// 文档内容
         FileInputStream fis = new FileInputStream(path);
         PDFParser p = new PDFParser(fis);
         p.parse();
         PDFTextStripper ts = new PDFTextStripper();
         content.append(ts.getText(p.getPDDocument()));
         fis.close();
return content.toString().trim();
     }

3.html文件:

public static String readHtml(String urlString) {
         StringBuffer content = new StringBuffer("");
         File file = new File(urlString);
         FileInputStream fis = null;
try {
             fis = new FileInputStream(file);
// 读取页面
             BufferedReader reader = new BufferedReader(new InputStreamReader(
                     fis,"utf-8"));//这里的字符编码要注意,要对上html头文件的一致,否则会出乱码
             String line = null;
while ((line = reader.readLine()) != null) {
                 content.append(line + "\n");
             }
             reader.close();
         } catch (Exception e) {
             e.printStackTrace();
         }
         String contentString = content.toString();
return contentString;
     }

4.txt文件:

public static String readTxt(String path) {
         StringBuffer content = new StringBuffer("");// 文档内容
try {
             FileReader reader = new FileReader(path);
             BufferedReader br = new BufferedReader(reader);
             String s1 = null;
while ((s1 = br.readLine()) != null) {
                 content.append(s1 + "\r");
             }
             br.close();
             reader.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
return content.toString().trim();
     }

接下来是搜索代码:

public class TestQuery {   
public static void main(String[] args) throws IOException, ParseException {   
         Hits hits = null;   
//搜索内容自己换
         String queryString = "根据国务院的决定";   
         Query query = null;  
         IndexSearcher searcher = new IndexSearcher("d:\\index"); //这里注意索引存放的路径
         Analyzer analyzer = new StandardAnalyzer();   
try {   
             QueryParser qp = new QueryParser("body", analyzer);   
/**
              * 建索引的时候我们指定了body建立为内容,我们搜索的时候也是针对body的,所以
              *    QueryParser qp = new QueryParser("body", analyzer);
              *    这句和建立索引时候
                 Field FieldBody = new Field("body", temp, Field.Store.YES,   
                         Field.Index.TOKENIZED,   
                         Field.TermVector.WITH_POSITIONS_OFFSETS);
              *的这句的"body"是对应的。
             */
             query = qp.parse(queryString);   
         } catch (ParseException e) {
             System.out.println("异常");
         }   
if (searcher != null) {   
             hits = searcher.search(query);   
if (hits.length() > 0) {   
                 System.out.println("找到:" + hits.length() + " 个结果!");  
for (int i = 0; i < hits.length(); i++) {//输出搜索信息clip_image001[1]
                      Document document = hits.doc(i);
                      System.out.println("contents:"+document.get("body"));
//同样原理这里的document.get("body")就是取得建立在索引文件里面的额body的所有内容
                     //你若想输出文件路径就用document.get("path")就可以了
                 }
             } else{
                 System.out.println("0个结果!");
             }   
         }  
     }

posted @ 2010-02-01 10:48  Me疯子_(~  阅读(226)  评论(0编辑  收藏  举报