Lucene 全文检索实践(5) 参考指数 : 3
对于 Lucene 的初步研究已经过去一段时间,自己感觉还不是很深入,但由于时间的关系,一直也没再拿起。应网友的要求,将自己实践中写的一些代码贴出来,希望能对大家有用。程序没有做进一步的优化,只是很简单的实现功能而已,仅供参考。
在实践中,我先为 PHP 中文手册中的 HTML 文件生成索引,然后通过一个 JSP 页面对其进行全文检索。
生成索引的 Java 代码:
/** * PHPDocIndexer.java * 用于对 PHPDoc 的 HTML 页面生成索引文件。 */ import java.io.File; import java.io.FileReader; import java.io.BufferedReader; import java.io.IOException; import java.util.Date; import java.text.DateFormat; import java.lang.*; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateField; class PHPDocIndexer { public static void main(String[] args) throws ClassNotFoundException, IOException { try { Date start = new Date(); IndexWriter writer = new IndexWriter("/home/nio/indexes-phpdoc", new CJKAnalyzer(), true); //索引保存目录,必须存在 indexDocs(writer, new File("/home/nio/phpdoc-zh")); //HTML 文件保存目录 System.out.println("Optimizing ...."); writer.optimize(); writer.close(); Date end = new Date(); System.out.print("Total time: "); System.out.println(end.getTime() - start.getTime()); } catch (Exception e) { System.out.println("Class " + e.getClass() + " throws error!\n errmsg: " + e.getMessage()); } //end try } //end main public static void indexDocs(IndexWriter writer, File file) throws Exception { if (file.isDirectory()) { String[] files = file.list(); for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } //end for } else if (file.getPath().endsWith(".html")) { //只对 HTML 文件做索引 System.out.print("Add file:" + file + " ...."); // Add html file .... 
Document doc = new Document(); doc.add(Field.UnIndexed("file", file.getName())); //索引文件名 doc.add(Field.UnIndexed("modified", DateFormat.getDateTimeInstance().format(new Date(file.lastModified())))); //索引最后修改时间 String title = ""; String content = ""; String status = "start"; FileReader fReader = new FileReader(file); BufferedReader bReader = new BufferedReader(fReader); String line = bReader.readLine(); while (line != null) { content += line; //截取 HTML 标题 <title> if ("start" == status && line.equalsIgnoreCase("><TITLE")) { status = "match"; } else if ("match" == status) { title = line.substring(1, line.length() - 7); doc.add(Field.Text("title", title)); //索引标题 status = "end"; } //end if line = bReader.readLine(); } //end while bReader.close(); fReader.close(); doc.add(Field.Text("content", content.replaceAll("<[^<>]+>", ""))); //索引内容 writer.addDocument(doc); System.out.println(" [OK]"); } //end if } } //end class
索引生成完之后,就需要一个检索页面,下边是搜索页面(search.jsp)的代码:
<%--
  Full-text search page for the PHPDoc index.

  Request parameters (GET):
    key  - search keywords separated by whitespace; all keywords are ANDed
    page - 1-based page number; missing or invalid values fall back to page 1

  The page searches the "title" and "content" fields of the index at
  /home/nio/indexes-phpdoc and shows ROWS results per page with the query
  terms highlighted.
--%>
<%@ page language="java" import="javax.servlet.*, javax.servlet.http.*, java.io.*, java.util.Date, java.util.ArrayList, java.util.regex.*, org.apache.lucene.analysis.*, org.apache.lucene.document.*, org.apache.lucene.index.*, org.apache.lucene.search.*, org.apache.lucene.queryParser.*, org.apache.lucene.analysis.Token, org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.cjk.CJKAnalyzer, org.apache.lucene.analysis.cjk.CJKTokenizer, com.chedong.weblucene.search.WebLuceneHighlighter" %>
<%@ page contentType="text/html;charset=GB2312" %>
<%!
    /**
     * Escapes the characters that are significant in HTML text and attribute
     * values so request-supplied strings cannot inject markup.
     * Returns "" for null.
     */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuffer out = new StringBuffer(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':  out.append("&amp;");  break;
                case '<':  out.append("&lt;");   break;
                case '>':  out.append("&gt;");   break;
                case '"':  out.append("&quot;"); break;
                case '\'': out.append("&#39;");  break;
                default:   out.append(c);
            }
        }
        return out.toString();
    }
%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>PHPDoc - PHP 简体中文手册全文检索</title>
<base target="main"><!-- The page lives in a frameset, so links open in the "main" frame -->
<style>
body {background-color: white; margin: 4px}
body, input, div {font-family: Tahoma; font-size: 9pt}
body, div {line-height: 18px}
u {color: red}
b {color: navy}
form {padding: 0px; margin: 0px}
.txt {border: 1px solid black}
.f {padding: 4px; margin-bottom: 16px; background-color: #E5ECF9; border-top: 1px solid #3366CC; border-bottom: 1px solid #3366CC; text-align: center;}
.d, .o {padding-left: 16px}
.d {color: gray}
.o {color: green}
.o a {color: #7777CC}
</style>
<script language="JavaScript">
function gotoPage(i) {
    document.frm.page.value = i;
    document.frm.submit();
} //end function
</script>
</head>
<body>
<%
    String keyVal = null;
    String pageVal = null;
    int offset = 0;
    int curPage = 0;
    int pages = 0;
    final int ROWS = 50;

    // Read the GET parameters. A missing "key" is an ordinary case and is
    // checked explicitly rather than by catching the resulting exception.
    String rawKey = request.getParameter("key"); // search keywords
    if (rawKey != null) {
        // Re-decode the bytes of the container-decoded parameter.
        // NOTE(review): new String(byte[]) uses the platform default charset;
        // this presumably relies on the server default matching the page
        // charset (GB2312) -- confirm for the deployment.
        keyVal = new String(rawKey.getBytes("ISO8859_1"));
    }
    pageVal = request.getParameter("page"); // page number
    if (keyVal == null) {
        keyVal = "";
    }
%>
<div class="f">
<form name="frm" action="./index.jsp" method="GET" onsubmit="this.page.value='0';return true;" target="_self">
<input type="text" name="key" class="txt" size="40" value="<%=escapeHtml(keyVal)%>" />
<input type="hidden" name="page" value="<%=(pageVal == null) ? "0" : escapeHtml(pageVal)%>" />
<input type="submit" value="搜 索" /><br />
<font color="green">提示:可使用多个关键字(使用空格隔开)提高搜索的准确率。</font>
</form>
<script language="JavaScript">
document.frm.key.focus();
</script>
</div>
<%
    if (keyVal.length() > 0) {
        try {
            curPage = Integer.parseInt(pageVal); // current page as an integer
        } catch (NumberFormatException ignored) {
            // Missing or non-numeric page: keep 0, which is clamped to 1 below.
        } //end try

        Searcher searcher = null;
        try {
            Date startTime = new Date();

            // Drop user-typed boolean operators (whole words only, so that
            // words such as "for" or "android" stay intact) and AND all
            // remaining keywords together.
            keyVal = keyVal.toLowerCase().replaceAll("\\b(or|and)\\b", "").trim().replaceAll("\\s+", " AND ");

            searcher = new IndexSearcher("/home/nio/indexes-phpdoc"); // index directory
            Analyzer analyzer = new CJKAnalyzer();
            String[] fields = {"title", "content"};
            Query query = MultiFieldQueryParser.parse(keyVal, fields, analyzer);
            Hits hits = searcher.search(query);

            // Collect the analyzed query terms for the highlighter.
            StringReader in = new StringReader(keyVal);
            TokenStream tokenStream = analyzer.tokenStream("", in);
            ArrayList al = new ArrayList();
            for (Token token = tokenStream.next(); token != null; token = tokenStream.next()) {
                al.add(token.termText());
            } //end for

            // Total number of pages (ceiling division).
            pages = (hits.length() + ROWS - 1) / ROWS;

            // Clamp the current page. The upper bound is applied first so that
            // an empty result (pages == 0) ends on page 1 and never produces a
            // negative offset.
            if (curPage > pages) {
                curPage = pages;
            }
            if (curPage < 1) {
                curPage = 1;
            }

            // First and one-past-last hit index of the current page.
            offset = (curPage - 1) * ROWS;
            int end = Math.min(hits.length(), offset + ROWS);

            // Render the hits of the current page.
            WebLuceneHighlighter hl = new WebLuceneHighlighter(al);
            for (int i = offset; i < end; i++) {
                Document doc = hits.doc(i);
                String fileName = escapeHtml(doc.get("file"));
                // The indexer adds "title" only when it found one, so fall
                // back to the file name instead of passing null on.
                String title = doc.get("title");
                if (title == null) {
                    title = doc.get("file");
                }
%>
<div class="t"><a href="/~nio/phpdoc-zh/<%=fileName%>"><%=hl.highLight(title)%></a></div>
<div class="d"><%=hl.highLight(doc.get("content").replaceAll("\n", " "), 100)%> ……</div>
<div class="o"> /~nio/phpdoc-zh/<%=fileName%> - <%=escapeHtml(doc.get("modified"))%> </div>
<br />
<%
            } //end for

            searcher.close();
            searcher = null; // closed normally; nothing left for the finally block
            Date endTime = new Date();
%>
<div class="f">
检索总共耗时 <b><%=((endTime.getTime() - startTime.getTime()) / 1000.0)%></b> 秒,约有 <b><%=hits.length()%></b> 项符合条件的记录,共 <b><%=pages%></b> 页
<%
            if (curPage > 1 && pages > 1) {
%>
 | <a href="javascript:gotoPage(<%=(curPage-1)%>);" target="_self">上一页</a>
<%
            } //end if
            if (curPage < pages && pages > 1) {
%>
 | <a href="javascript:gotoPage(<%=(curPage+1)%>)" target="_self">下一页</a>
<%
            } //end if
%>
</div>
<%
        } catch (Exception e) {
            // The message can echo the user's query, so it is escaped to keep
            // "-->" sequences from terminating the HTML comment.
%>
<!-- <%=escapeHtml(String.valueOf(e.getClass()))%> 导致错误:<%=escapeHtml(e.getMessage())%> -->
<%
        } finally {
            // Error path: release the searcher that the normal path closes above.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails here.
                }
            }
        } //end try
    } //end if
%>
</body>
</html>