lucene之入门学习篇

2011-02-11 16:16 hanwesley 阅读(323) 评论(0) 编辑收藏举报

Lucene是一个高效的、基于Java的全文检索库。

什么叫做全文检索呢？这要从我们生活中的数据说起。

我们生活中的数据总体分为两种：结构化数据和非结构化数据。

结构化数据：指具有固定格式或有限长度的数据，如数据库，元数据等。

非结构化数据：指不定长或无固定格式的数据，如邮件，word文档等。

当然有的地方还会提到第三种，半结构化数据，如XML，HTML等，根据需要可按结构化数据来处理，也可抽取出纯文本按非结构化数据来处理。

非结构化数据还有另一种叫法，叫做全文数据。

按照数据的分类，搜索也分为两种：

对结构化数据的搜索：如对数据库的搜索，用SQL语句。再如对元数据的搜索，如利用windows搜索对文件名，类型，修改时间进行搜索等。

对非结构化数据的搜索：如利用windows的搜索也可以搜索文件内容，Linux下的grep命令，再如用Google和百度可以搜索大量内容数据。

其实搜索引擎做的两件很重要的事情就是建立索引，和从索引中查找数据。

下面这段代码主要就是将这两个重要部分呈现出来。至于原理以后慢慢道来。

package com.wesley.search;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

/**
 * Minimal Lucene demo showing the two halves of a search engine: building an
 * index from a directory of text files, and querying that index.
 *
 * <p>All locations are hard-coded: source documents are read from
 * {@link #SOURCE_DIR} and the index lives in {@link #INDEX_DIR}.
 */
public class search2 {
	/** Directory whose top-level {@code .txt} files are indexed (not recursive). */
	private static final String SOURCE_DIR = "D:\\test\\wesley";

	/** Directory where the index files are written by index() and read by search(). */
	private static final String INDEX_DIR = "D:\\test\\index";

	/** Charset used to decode the source text files. */
	private static final String SOURCE_CHARSET = "GBK";

	/** Name of the field holding the path of the indexed file. */
	private static final String FIELD_PATH = "path";

	/** Name of the field holding the file content; this is the field that is searched. */
	private static final String FIELD_BODY = "body";

	/** Upper bound on the number of documents returned by the TopDocs query. */
	private static final int MAX_RESULTS = 10;

	/**
	 * Rebuilds the index and then runs one sample query against it.
	 *
	 * @param args ignored
	 * @throws Exception if indexing or searching fails
	 */
	public static void main(String[] args) throws Exception {
		index();
		search("严老大");
	}

	/**
	 * Rebuilds the index from scratch out of every {@code .txt} file found
	 * directly inside {@link #SOURCE_DIR}, printing progress and the elapsed
	 * time to standard output.
	 *
	 * @throws IOException if the source directory cannot be listed or a file
	 *         cannot be read
	 * @throws Exception for any failure reported by the index writer
	 */
	public static void index() throws Exception {
		File fileDir = new File(SOURCE_DIR);
		File indexDir = new File(INDEX_DIR);

		// List the sources BEFORE opening the writer: listFiles() returns null
		// when the path is missing or not a directory, and opening the writer
		// in "create" mode would already have discarded the existing index.
		File[] textFiles = fileDir.listFiles();
		if (textFiles == null) {
			throw new IOException("Cannot list source directory: "
					+ fileDir.getPath());
		}

		// The analyzer splits field text into the terms that get indexed.
		Analyzer luceneAnalyzer = new StandardAnalyzer();
		// Third argument "create": true discards whatever index already exists
		// in the directory and starts over; false appends to the existing
		// index, which is what an incremental update needs.
		IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
		long startTime = System.currentTimeMillis();
		try {
			for (File textFile : textFiles) {
				if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
					continue;
				}
				String canonicalPath = textFile.getCanonicalPath();
				System.out.println("File " + canonicalPath + "正在被索引.");
				String content = FileReaderAll(canonicalPath, SOURCE_CHARSET);
				System.out.println(content);

				/*
				 * A Field is a name/value pair; the value must be a string, so
				 * other types have to be converted first.
				 * Store says whether the raw value is kept in the index:
				 *   Store.NO - not stored, Store.YES - stored,
				 *   Store.COMPRESS - stored compressed.
				 * Index says whether and how the value is made searchable:
				 *   Index.NO - not indexed (cannot be found by a search),
				 *   Index.TOKENIZED - run through the analyzer, then indexed.
				 * NOTE(review): the original notes call these modes ANALYZED /
				 * NOT_ANALYZED (the whole value indexed as a single term);
				 * presumably those are the newer names - confirm against the
				 * Lucene version in use.
				 */
				Document document = new Document();
				document.add(new Field(FIELD_PATH, textFile.getPath(),
						Field.Store.YES, Field.Index.TOKENIZED));
				document.add(new Field(FIELD_BODY, content,
						Field.Store.YES, Field.Index.TOKENIZED));
				indexWriter.addDocument(document);
			}
			// Optimize the index once all documents have been added.
			indexWriter.optimize();
		} finally {
			// Always close the writer, even when a file fails to read or
			// index, so that it does not stay open after an error.
			indexWriter.close();
		}

		// Report how long indexing took.
		long endTime = System.currentTimeMillis();
		System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!"
				+ fileDir.getPath());
	}

	/**
	 * Parses {@code keyword} as a query against the body field of the index in
	 * {@link #INDEX_DIR} and prints the matching bodies twice: first through
	 * the TopDocs API (capped at {@link #MAX_RESULTS} documents), then through
	 * the Hits API.
	 *
	 * <p>A keyword that cannot be parsed is reported on standard error and
	 * produces no results; it is not rethrown because the declared exceptions
	 * of this method do not include it.
	 *
	 * @param keyword query string in QueryParser syntax
	 * @throws CorruptIndexException if the index is corrupt
	 * @throws IOException if the index cannot be opened or read
	 */
	public static void search(String keyword) throws CorruptIndexException,
			IOException {
		IndexSearcher indexSearcher = new IndexSearcher(INDEX_DIR);
		try {
			StandardAnalyzer analyzer = new StandardAnalyzer();
			QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer);
			Query query = queryParser.parse(keyword);

			/*
			 * IndexSearcher.search(Query, Filter, int):
			 *   Query  - the user's query string wrapped in a form Lucene
			 *            understands;
			 *   Filter - used to filter the results (null here, so nothing
			 *            is filtered out);
			 *   int    - maximum number of documents to return.
			 * The matches are read from TopDocs.scoreDocs.
			 */
			Filter filter = null;
			TopDocs docs = indexSearcher.search(query, filter, MAX_RESULTS);
			System.out.println("搜索到" + docs.totalHits + "条结果");
			for (ScoreDoc scoreDoc : docs.scoreDocs) {
				Document document = indexSearcher.doc(scoreDoc.doc);
				System.out.println(document.getField(FIELD_BODY).stringValue());
			}

			// Same query again through the Hits API, this time without a filter.
			Hits hits = indexSearcher.search(query);
			System.out.println("搜索结果记录:" + hits.length());
			for (int i = 0; i < hits.length(); i++) {
				System.out.println(hits.doc(i).get(FIELD_BODY));
			}
		} catch (ParseException e) {
			e.printStackTrace();
		} finally {
			// Close the searcher on every path, including failures.
			indexSearcher.close();
		}
	}

	/**
	 * Reads a whole text file into a single string.
	 *
	 * <p>Line terminators are dropped and the lines are joined with no
	 * separator between them, so text on either side of a line break ends up
	 * adjacent in the result.
	 *
	 * @param FileName path of the file to read
	 * @param charset name of the charset used to decode the file
	 * @return the concatenated lines; empty if the file has no content
	 * @throws IOException if the file cannot be opened or read, or the charset
	 *         name is not supported
	 */
	public static String FileReaderAll(String FileName, String charset)
			throws IOException {
		FileInputStream in = new FileInputStream(FileName);
		try {
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(in, charset));
			// StringBuilder avoids copying the accumulated text on every line.
			StringBuilder content = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line);
			}
			return content.toString();
		} finally {
			// Closing the underlying stream covers every failure path,
			// including an unsupported charset name.
			in.close();
		}
	}
}

刷新页面返回顶部

wesley tech blog

lucene之入门学习篇

About