Lucene简单应用

近期用Lucene做了个比较简单的站内检索，在这里和大家做个交流。全文检索的实现，从检索的数据源来分有两种：一种是数据库，另一种是已生成的文件(doc,html,txt......)。

无论哪一种方式，实现原理都是一样的。主要分为两大步：

一、将数据源转换为Lucene索引文件，保存到设定的目录下

private static String filePath = "D:\\rookie\\date\\";// path where the source files are stored
private static String indexPath = "D:\\rookie\\source";// path where the index is stored

/**
 * Builds a fresh Lucene index at {@code indexPath} from the {@code .txt}
 * files found directly inside {@code filePath}, then prints the elapsed time.
 *
 * @param args unused
 * @throws Exception if the source directory cannot be listed, a file cannot
 *                   be read, or the index cannot be written
 */
public static void main(String[] args) throws Exception {
    /* Directory whose files are indexed. */
    File fileDir = new File(filePath);
    /* Directory that receives the index files. */
    File indexDir = new File(indexPath);

    // listFiles() returns null when the path is not a readable directory.
    // Check this before opening the writer, because opening it with
    // create=true recreates the index.
    File[] textFiles = fileDir.listFiles();
    if (textFiles == null) {
        throw new java.io.FileNotFoundException(
                "Cannot list files in directory: " + fileDir.getPath());
    }

    Analyzer luceneAnalyzer = new StandardAnalyzer();
    // Last argument: true (re)creates the index; pass false to append to /
    // update an existing index instead.
    IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
    long startTime = System.currentTimeMillis();
    try {
        // Add one document per matching file.
        for (int i = 0; i < textFiles.length; i++) {
            File textFile = textFiles[i];
            // Only regular files whose name ends with ".txt" are indexed.
            if (!textFile.isFile() || !textFile.getName().endsWith(".txt")) {
                continue;
            }
            String body = FileReaderAll(textFile.getCanonicalPath(), "GBK");

            Document document = new Document();
            // NOTE(review): every document gets the same placeholder id
            // "12345"; storing an id per document is strongly recommended,
            // so real code should supply a unique value here.
            document.add(new Field("id", "12345",
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field("path", textFile.getPath(),
                    Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.add(new Field("contents", body,
                    Field.Store.YES, Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            indexWriter.addDocument(document);
        }
        // optimize() optimizes the index.
        indexWriter.optimize();
    } finally {
        // Always close the writer, even when indexing fails part-way.
        indexWriter.close();
    }

    // Report how long indexing took and where the index was written.
    long endTime = System.currentTimeMillis();
    System.out.println("索引已经添加到文档中，共花费了" + (endTime - startTime) + " 毫秒！索引路径是：" + indexDir.getPath());
}

/**
 * Reads a whole text file (e.g. html, txt) into a single string.
 * Line terminators are dropped: the lines are concatenated with no
 * separator between them.
 *
 * @param FileName path of the file to read
 * @param charset  name of the charset used to decode the file
 * @return the file content without line terminators; an empty string for
 *         an empty file
 * @throws IOException if the file cannot be opened or read, or the charset
 *                     name is not supported
 * @author rookie_d
 */
public static String FileReaderAll(String FileName, String charset)
        throws IOException {
    FileInputStream in = new FileInputStream(FileName);
    try {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, charset));
        // StringBuilder avoids re-copying the accumulated text on every line.
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
        return content.toString();
    } finally {
        // Closing the underlying stream releases the file handle even when
        // decoding or reading fails.
        in.close();
    }
}

二、从Lucene索引文件中进行检索

/**
 * Searches the index for documents whose "contents" field matches the
 * hard-coded query string and collects the stored "path" of every hit.
 *
 * @return the paths of all matching documents; an empty list when nothing
 *         matches, the query cannot be parsed, or an I/O error occurs
 * @author rookie_d
 */
public static List luceneSearcher() {

    String queryString = "好"; // text to search for
    String indexPath = "D:\\rookie\\source"; // directory holding the index
    List list = new ArrayList();
    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(indexPath);
        QueryParser qp = new QueryParser("contents", new StandardAnalyzer());
        System.out.println(qp.getField());

        Query query;
        try {
            query = qp.parse(queryString);
        } catch (org.apache.lucene.queryParser.ParseException e) {
            e.printStackTrace();
            // Without a parsed query there is nothing to search for.
            return list;
        }
        System.out.println(query);

        Hits hits = searcher.search(query);
        int total = hits.length();
        System.out.println(total);
        if (total > 0) {
            System.out.println("共找到:" + total + "个结果!");
            for (int i = 0; i < total; i++) {
                Document document = hits.doc(i);
                String path = document.get("path");
                list.add(new File(path).getPath());
            }
        } else {
            System.out.println("*****no result find*****");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release the index files held by the searcher on every exit path.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return list;
}

在开发过程中遇到了更新索引的小难题，下面也给段转来的代码,作为菜鸟认为这段代码还是比较有用的

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Small demo of adding, updating and searching documents in a Lucene index
 * stored at {@link #path}.
 */
public class UpdateDocument {

    /** Directory holding the index. */
    private static String path = "d:/index";

    /**
     * Updates the document with id "123456" and then searches for two user names.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // addIndex();
        updateIndex();
        search("李四");
        search("王五");
    }

    /**
     * Creates the index (create flag is true) and adds a single document
     * with the fields id, userName and comefrom.
     */
    public static void addIndex() {
        try {
            IndexWriter write = new IndexWriter(path, new StandardAnalyzer(), true);
            try {
                Document doc = new Document();
                doc.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("userName", "张三", Field.Store.YES, Field.Index.TOKENIZED));
                doc.add(new Field("comefrom", "北京", Field.Store.YES, Field.Index.TOKENIZED));

                write.addDocument(doc);
            } finally {
                // Close the writer even if adding the document fails.
                write.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Replaces the document whose id is "123456" with a new document that
     * carries only the id and userName fields.
     */
    public static void updateIndex() {
        try {
            // create flag is false: open the existing index instead of recreating it.
            IndexWriter write = new IndexWriter(path, new StandardAnalyzer(), false);
            try {
                Document docNew = new Document();
                docNew.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
                docNew.add(new Field("userName", "王五", Field.Store.YES, Field.Index.TOKENIZED));
                Term term = new Term("id", "123456");
                /*
                 * updateDocument takes a term and a new document. It looks up
                 * the documents matching the term, e.g.
                 *     Term term = new Term("id","1234567");
                 * matches documents whose id is 1234567. If any exist they are
                 * replaced (several matches end up as a single document); if
                 * none exist the new document is simply added.
                 * A database can update a single column, whereas Lucene can
                 * only replace the whole row (document).
                 */
                write.updateDocument(term, docNew);
            } finally {
                // The writer must always be closed here.
                write.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses a query string against the "userName" field.
     *
     * @param str the query text
     * @return the parsed query, or {@code null} if parsing fails (the stack
     *         trace is printed)
     */
    public static Query queryParser(String str) {
        QueryParser queryParser = new QueryParser("userName", new StandardAnalyzer());
        try {
            return queryParser.parse(str);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Searches the "userName" field for {@code str} and prints the Lucene
     * document number and the stored fields of every hit.
     *
     * @param str the user name to search for
     */
    public static void search(String str) {
        IndexSearcher search = null;
        try {
            Query query = queryParser(str);
            if (query == null) {
                // Parsing failed; queryParser already printed the cause.
                return;
            }
            search = new IndexSearcher(path);

            Hits hits = search.search(query);
            if (hits == null) {
                return;
            }
            if (hits.length() == 0) {
                System.out.println(" 没有搜索到'" + str + "'");
                return;
            }
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println("id = " + hits.id(i));
                System.out.println("own id = " + doc.get("id"));
                System.out.println("userName = " + doc.get("userName"));
                System.out.println("come from = " + doc.get("comefrom"));
                System.out.println("");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the index files held by the searcher on every exit path.
            if (search != null) {
                try {
                    search.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

}

最后再给一段删除索引的代码：

// Delete the matching document(s) from the Lucene index.
File indexDir = new File(indexPath); /* directory holding the index files */
File[] textFiles = indexDir.listFiles();
Analyzer luceneAnalyzer = new StandardAnalyzer();
// Create a new index only when listFiles() yields nothing (null or an empty
// array); otherwise open the existing index.
boolean create = false;
if (textFiles == null || textFiles.length <= 0) {
    create = true;
}
// Build the term before opening the writer so that a failure here cannot
// leave an open writer behind.
Term term = new Term("id", news.getId());
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
try {
    indexWriter.deleteDocuments(term);
    indexWriter.optimize(); // optimize() optimizes the index
} finally {
    indexWriter.close(); // always close the writer, even on failure
}

在删除和更新索引时要注意：new IndexWriter(indexDir, luceneAnalyzer, false) 的最后一个参数必须为false；如果传入true，会重新创建索引，已有的索引数据将被清空。

关于全文检索的内容还有许多需要学习,写这篇文章来帮助新手和自己来熟悉Lucene，希望对你有一点帮助！

posted @ 2014-09-18 19:15 rookie_d 阅读(132) 评论(0) 收藏举报

刷新页面返回顶部

Lucene简单应用

公告