A Simple Lucene Application

I recently used Lucene to build a fairly simple site search and would like to share the experience here. Judged by the data source being searched, full-text search implementations come in two flavors: one indexes a database, the other indexes already-generated files (doc, html, txt, ...); a small sketch of the database case is given at the end of step 1 below.

Whichever source you use, the underlying approach is the same and breaks down into two main steps:

1. Convert the data source into a Lucene index and store it in a designated directory

 

private static String filePath = "D:\\rookie\\date\\";  // directory of the source files
private static String indexPath = "D:\\rookie\\source"; // directory where the index is stored

public static void main(String[] args) throws Exception {
    /* the folder whose files will be indexed (a folder on drive D here) */
    File fileDir = new File(filePath);
    /* the folder that will hold the index files */
    File indexDir = new File(indexPath);

    Analyzer luceneAnalyzer = new StandardAnalyzer();
    // Note: when the last parameter is false, the index folder is not recreated and
    // documents are appended to the existing index (use false when updating the index).
    IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
    File[] textFiles = fileDir.listFiles();
    long startTime = new Date().getTime();

    // add a Document to the index for each file
    for (int i = 0; i < textFiles.length; i++) {
        // only .txt files are indexed here; add further endsWith() checks for html etc.
        if (textFiles[i].isFile() && textFiles[i].getName().endsWith(".txt")) {
            String temp = FileReaderAll(textFiles[i].getCanonicalPath(), "GBK");
            Document document = new Document();

            // it is strongly recommended to store an id field with every document
            Field fieldId = new Field("id", "12345", Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field fieldPath = new Field("path", textFiles[i].getPath(), Field.Store.YES, Field.Index.UN_TOKENIZED);
            Field fieldBody = new Field("contents", temp, Field.Store.YES, Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS);

            document.add(fieldId);
            document.add(fieldPath);
            document.add(fieldBody);
            indexWriter.addDocument(document);
        }
    }
    // optimize() merges and optimizes the index
    indexWriter.optimize();
    indexWriter.close();

    // measure how long indexing took
    long endTime = new Date().getTime();
    System.out.println("Indexing finished, took " + (endTime - startTime)
            + " ms. Source folder: " + fileDir.getPath());
}

/**
 * Reads an entire text file (html, txt, ...) into a String.
 * @author rookie_d
 */
public static String FileReaderAll(String fileName, String charset)
        throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(
            new FileInputStream(fileName), charset));
    StringBuilder temp = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        temp.append(line).append('\n'); // keep the line breaks between lines
    }
    reader.close();
    return temp.toString();
}
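
The file-based example above covers one of the two data sources mentioned in the introduction. For the database case, a minimal sketch could look like the class below; note that the class name DbIndexer, the JDBC URL and credentials, and the news table with its id/title/content columns are hypothetical placeholders, while the Lucene calls are the same 2.x-style API used above.

import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class DbIndexer {

    public static void main(String[] args) throws Exception {
        // hypothetical JDBC connection; driver, URL and credentials are placeholders
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/mydb", "user", "password");
        Statement stmt = conn.createStatement();
        // hypothetical "news" table with id/title/content columns
        ResultSet rs = stmt.executeQuery("SELECT id, title, content FROM news");

        Analyzer analyzer = new StandardAnalyzer();
        // true: rebuild the index from scratch, just like the file example above
        IndexWriter writer = new IndexWriter(new File("D:\\rookie\\source"), analyzer, true);

        while (rs.next()) {
            Document doc = new Document();
            // store the primary key so the row can later be updated or deleted by id
            doc.add(new Field("id", rs.getString("id"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("title", rs.getString("title"), Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("contents", rs.getString("content"), Field.Store.YES, Field.Index.TOKENIZED));
            writer.addDocument(doc);
        }

        writer.optimize();
        writer.close();
        rs.close();
        stmt.close();
        conn.close();
    }
}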

2. Search the Lucene index

 

/**
 * Searches the index and returns the paths of all files whose contents match the query.
 * @author rookie_d
 */
public static List<String> luceneSearcher() {

    String queryString = "好";               // the string to search for
    String indexPath = "D:\\rookie\\source"; // where the index is stored
    Hits hits = null;
    Query query = null;
    IndexSearcher searcher = null;
    List<String> list = new ArrayList<String>();
    try {
        searcher = new IndexSearcher(indexPath);
        Analyzer analyzer = new StandardAnalyzer();
        QueryParser qp = new QueryParser("contents", analyzer);
        System.out.println(qp.getField());
        try {
            query = qp.parse(queryString);
            System.out.println(query);
        } catch (org.apache.lucene.queryParser.ParseException e) {
            e.printStackTrace();
        }
        if (searcher != null) {
            hits = searcher.search(query);
            System.out.println(hits.length());
            if (hits != null && hits.length() > 0) {
                System.out.println("Found " + hits.length() + " result(s)!");
                for (int i = 0; i < hits.length(); i++) {
                    Document document = hits.doc(i);
                    String path = document.get("path");
                    File file = new File(path);
                    list.add(file.getPath());
                }
            } else {
                System.out.println("***** no result found *****");
            }
            searcher.close(); // release the searcher when done
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return list;
}
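
For completeness, a minimal way to call the method above (assuming it sits in the same class) might be:

public static void main(String[] args) {
    // run the search defined above and print the matching file paths
    List<String> results = luceneSearcher();
    for (String path : results) {
        System.out.println("matched file: " + path);
    }
}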

 

During development I ran into a small problem when updating the index. Below is a piece of code adapted from elsewhere; as a beginner I found it quite useful.

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class UpdateDocument {

    private static String path = "d:/index";

    public static void main(String[] args) {
        // addIndex();
        updateIndex();
        search("李四");
        search("王五");
    }

    public static void addIndex() {
        try {
            // true: (re)create the index from scratch
            IndexWriter write = new IndexWriter(path, new StandardAnalyzer(), true);

            Document doc = new Document();
            doc.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("userName", "张三", Field.Store.YES, Field.Index.TOKENIZED));
            doc.add(new Field("comefrom", "北京", Field.Store.YES, Field.Index.TOKENIZED));

            write.addDocument(doc);

            write.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void updateIndex() {
        try {
            // false: open the existing index instead of recreating it
            IndexWriter write = new IndexWriter(path, new StandardAnalyzer(), false);
            Document docNew = new Document();
            docNew.add(new Field("id", "123456", Field.Store.YES, Field.Index.UN_TOKENIZED));
            docNew.add(new Field("userName", "王五", Field.Store.YES, Field.Index.TOKENIZED));
            Term term = new Term("id", "123456");
            /*
             * updateDocument(term, docNew) replaces data with a new document:
             * it first looks in the index for documents matching the term
             * (here id = "123456"); if any exist they are updated (if there
             * are several matches, only one document remains afterwards), and
             * if none exist the new document is simply added.
             * Unlike a database, where a single column can be updated, Lucene
             * can only replace an entire document (row).
             */
            write.updateDocument(term, docNew);

            write.close(); // be sure to close the writer here

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static Query queryParser(String str) {
        QueryParser queryParser = new QueryParser("userName", new StandardAnalyzer());
        try {
            Query query = queryParser.parse(str);
            return query;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public static void search(String str) {
        try {
            IndexSearcher search = new IndexSearcher(path);

            Query query = queryParser(str);

            Hits hits = search.search(query);
            if (hits == null) {
                return;
            }
            if (hits.length() == 0) {
                System.out.println("No result found for '" + str + "'");
                return;
            }
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                System.out.println("id = " + hits.id(i));
                System.out.println("own id = " + doc.get("id"));
                System.out.println("userName = " + doc.get("userName"));
                System.out.println("come from = " + doc.get("comefrom"));
                System.out.println("");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
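As the comment in updateIndex() points out, Lucene can only replace a whole document, not a single field. A common workaround is to read the stored fields of the old document, copy the unchanged ones into a new document together with the changed value, and then call updateDocument(). A minimal sketch of that pattern, as an extra (hypothetical) method for the UpdateDocument class above:

// hypothetical helper: "update only the userName field" by rebuilding the whole document
public static void updateUserNameOnly(String id, String newUserName) {
    try {
        // look up the existing document by its stored id
        IndexSearcher searcher = new IndexSearcher(path);
        Hits hits = searcher.search(new org.apache.lucene.search.TermQuery(new Term("id", id)));
        if (hits.length() == 0) {
            searcher.close();
            return; // nothing to update
        }
        Document old = hits.doc(0);
        searcher.close();

        // rebuild the document: the changed field plus copies of the unchanged stored fields
        Document docNew = new Document();
        docNew.add(new Field("id", id, Field.Store.YES, Field.Index.UN_TOKENIZED));
        docNew.add(new Field("userName", newUserName, Field.Store.YES, Field.Index.TOKENIZED));
        // assumes comefrom was stored, as it is in addIndex() above
        docNew.add(new Field("comefrom", old.get("comefrom"), Field.Store.YES, Field.Index.TOKENIZED));

        IndexWriter write = new IndexWriter(path, new StandardAnalyzer(), false);
        write.updateDocument(new Term("id", id), docNew);
        write.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
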
Finally, here is a snippet for deleting entries from the index:

// delete the corresponding documents from the Lucene index
File indexDir = new File(indexPath);  /* the folder holding the index files */
File[] textFiles = indexDir.listFiles();
Analyzer luceneAnalyzer = new StandardAnalyzer();
boolean create = false;
if (textFiles == null || textFiles.length <= 0) {
    // the index directory is empty, so a new index has to be created
    create = true;
}
IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
// "news" is the application object whose id was stored in the index
Term term = new Term("id", news.getId());
indexWriter.deleteDocuments(term);
indexWriter.optimize(); // optimize() optimizes the index
indexWriter.close();    // close the writer

When deleting or updating, note that the writer must be opened as new IndexWriter(indexDir, luceneAnalyzer, false), that is, with the last parameter set to false, so the existing index is reused instead of being recreated.
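
If you are not sure whether an index already exists in the directory, one way to choose that last parameter is to check first. A small sketch, assuming the IndexReader.indexExists() helper from the same 2.x API (import org.apache.lucene.index.IndexReader):

// create a new index only when none exists yet; otherwise reuse the existing one
File indexDir = new File(indexPath);
boolean create = !IndexReader.indexExists(indexDir);
IndexWriter indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(), create);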

There is still a lot to learn about full-text search. I wrote this article to help newcomers (and myself) get more familiar with Lucene; I hope it is of some help to you!

posted @ 2014-09-18 19:15 rookie_d