Lucene开发实例:Lucene中文分词(转载)
1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
1 package lucene.util; 2 3 import org.apache.lucene.index.IndexWriter; 4 import org.apache.lucene.index.IndexWriterConfig; 5 import org.apache.lucene.index.CorruptIndexException; 6 import org.apache.lucene.store.FSDirectory; 7 import org.apache.lucene.store.Directory; 8 import org.apache.lucene.analysis.Analyzer; 9 import org.apache.lucene.analysis.standard.StandardAnalyzer; 10 import org.apache.lucene.util.Version; 11 import org.apache.lucene.document.Document; 12 import org.apache.lucene.document.Field; 13 import org.wltea.analyzer.lucene.IKAnalyzer; 14 15 import java.sql.Connection; 16 import java.io.File; 17 import java.io.IOException; 18 import java.util.ArrayList; 19 import java.util.Date; 20 21 import modules.gk.Gk_info; 22 import modules.gk.Gk_infoSub; 23 import web.sys.Globals; 24 import web.db.DBConnector; 25 import web.db.ObjectCtl; 26 import web.util.StringUtil; 27 //Wizzer.cn 28 public class LuceneIndex { 29 IndexWriter writer = null; 30 FSDirectory dir = null; 31 boolean create = true; 32 33 public void init() { 34 long a1 = System.currentTimeMillis(); 35 System.out.println("[Lucene 开始执行:" + new Date() + "]"); 36 Connection con = DBConnector.getconecttion(); //取得一个数据库连接 37 try { 38 final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());//E:\lucene 39 if (!docDir.exists()) { 40 docDir.mkdirs(); 41 } 42 String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString();//true or false 43 if ("false".equals(cr.toLowerCase())) { 44 create = false; 45 } 46 Directory dir = FSDirectory.open(docDir); 47 // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); 48 Analyzer analyzer = new IKAnalyzer(true); 49 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); 50 if (create) { 51 // Create a new index in the directory, removing any 52 // previously indexed documents: 53 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 54 } else { 55 // Add new documents to an existing index: 56 
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); 57 } 58 IndexWriter writer = new IndexWriter(dir, iwc); 59 String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 "; 60 int rowCount = ObjectCtl.getRowCount(con, sql); 61 int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString()); //每页记录数 62 int pages = (rowCount - 1) / pageSize + 1; //计算总页数 63 ArrayList list = null; 64 Gk_infoSub gk = null; 65 for (int i = 1; i < pages+1; i++) { 66 long a = System.currentTimeMillis(); 67 list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub()); 68 for (int j = 0; j < list.size(); j++) { 69 gk = (Gk_infoSub) list.get(j); 70 Document doc = new Document(); 71 doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//主键不分词 72 doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED)); 73 doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED)); 74 doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//日期不分词 75 doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED)); 76 writer.addDocument(doc); 77 ObjectCtl.executeUpdateBySql(con,"UPDATE TABLEA SET SSTAG=1 WHERE indexno='"+gk.getIndexno()+"'");//更新已索引状态 78 } 79 80 long b = System.currentTimeMillis(); 81 long c = b - a; 82 System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]"); 83 } 84 writer.commit(); 85 86 } catch (Exception e) { 87 e.printStackTrace(); 88 } finally { 89 DBConnector.freecon(con); //释放数据库连接 90 try { 91 if (writer != null) { 92 writer.close(); 93 } 94 } catch (CorruptIndexException e) { 95 e.printStackTrace(); 96 } catch (IOException e) { 97 e.printStackTrace(); 98 } finally { 99 try { 100 if (dir 
!= null && IndexWriter.isLocked(dir)) { 101 IndexWriter.unlock(dir);//注意解锁 102 } 103 } catch (IOException e) { 104 e.printStackTrace(); 105 } 106 } 107 } 108 long b1 = System.currentTimeMillis(); 109 long c1 = b1 - a1; 110 System.out.println("[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" + new Date() + "]"); 111 } 112 }
3、单字段查询以及多字段分页查询高亮显示
1 package lucene.util; 2 3 import org.apache.lucene.store.FSDirectory; 4 import org.apache.lucene.store.Directory; 5 import org.apache.lucene.search.*; 6 import org.apache.lucene.search.highlight.SimpleHTMLFormatter; 7 import org.apache.lucene.search.highlight.Highlighter; 8 import org.apache.lucene.search.highlight.SimpleFragmenter; 9 import org.apache.lucene.search.highlight.QueryScorer; 10 import org.apache.lucene.queryParser.QueryParser; 11 import org.apache.lucene.queryParser.MultiFieldQueryParser; 12 import org.apache.lucene.analysis.TokenStream; 13 import org.apache.lucene.analysis.Analyzer; 14 import org.apache.lucene.analysis.KeywordAnalyzer; 15 import org.apache.lucene.document.Document; 16 import org.apache.lucene.index.IndexReader; 17 import org.apache.lucene.index.Term; 18 import org.apache.lucene.util.Version; 19 import modules.gk.Gk_infoSub; 20 21 import java.util.ArrayList; 22 import java.io.File; 23 import java.io.StringReader; 24 import java.lang.reflect.Constructor; 25 26 import web.util.StringUtil; 27 import web.sys.Globals; 28 import org.wltea.analyzer.lucene.IKAnalyzer; 29 //Wizzer.cn 30 public class LuceneQuery { 31 private static String indexPath;// 索引生成的目录 32 private int rowCount;// 记录数 33 private int pages;// 总页数 34 private int currentPage;// 当前页数 35 private int pageSize; //每页记录数 36 37 public LuceneQuery() { 38 this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString(); 39 } 40 41 public int getRowCount() { 42 return rowCount; 43 } 44 45 public int getPages() { 46 return pages; 47 } 48 49 public int getPageSize() { 50 return pageSize; 51 } 52 53 public int getCurrentPage() { 54 return currentPage; 55 } 56 57 /** 58 * 函数功能:根据字段查询索引 59 */ 60 public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) { 61 ArrayList list = new ArrayList(); 62 try { 63 if (curpage <= 0) { 64 curpage = 1; 65 } 66 if (pageSize <= 0) { 67 pageSize = 20; 68 } 69 this.pageSize = pageSize; //每页记录数 70 this.currentPage = curpage; //当前页 71 
int start = (curpage - 1) * pageSize; 72 Directory dir = FSDirectory.open(new File(indexPath)); 73 IndexReader reader = IndexReader.open(dir); 74 IndexSearcher searcher = new IndexSearcher(reader); 75 Analyzer analyzer = new IKAnalyzer(true); 76 QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer); 77 queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); 78 Query query = queryParser.parse(keyWord); 79 int hm = start + pageSize; 80 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 81 searcher.search(query, res); 82 83 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 84 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); 85 this.rowCount = res.getTotalHits(); 86 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 87 TopDocs tds = res.topDocs(start, pageSize); 88 ScoreDoc[] sd = tds.scoreDocs; 89 for (int i = 0; i < sd.length; i++) { 90 Document hitDoc = reader.document(sd[i].doc); 91 list.add(createObj(hitDoc, analyzer, highlighter)); 92 } 93 94 } catch (Exception e) { 95 e.printStackTrace(); 96 } 97 98 return list; 99 100 } 101 /** 102 * 函数功能:根据字段查询索引 103 */ 104 public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) { 105 ArrayList list = new ArrayList(); 106 try { 107 if (curpage <= 0) { 108 curpage = 1; 109 } 110 if (pageSize <= 0) { 111 pageSize = 20; 112 } 113 this.pageSize = pageSize; //每页记录数 114 this.currentPage = curpage; //当前页 115 int start = (curpage - 1) * pageSize; 116 Directory dir = FSDirectory.open(new File(indexPath)); 117 IndexReader reader = IndexReader.open(dir); 118 IndexSearcher searcher = new IndexSearcher(reader); 119 BooleanQuery bQuery = new BooleanQuery(); //组合查询 120 if (!"".equals(allkeyword)) {//包含全部关键词 121 KeywordAnalyzer analyzer = new KeywordAnalyzer(); 122 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, 
BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//AND 123 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 124 bQuery.add(query, BooleanClause.Occur.MUST); //AND 125 } 126 if (!"".equals(onekeyword)) { //包含任意关键词 127 Analyzer analyzer = new IKAnalyzer(true); 128 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//OR 129 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 130 bQuery.add(query, BooleanClause.Occur.MUST); //AND 131 } 132 if (!"".equals(nokeyword)) { //排除关键词 133 Analyzer analyzer = new IKAnalyzer(true); 134 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//NOT 135 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 136 bQuery.add(query, BooleanClause.Occur.MUST_NOT); //AND 137 138 } 139 int hm = start + pageSize; 140 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 141 searcher.search(bQuery, res); 142 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 143 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery)); 144 this.rowCount = res.getTotalHits(); 145 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 146 System.out.println("rowCount:" + rowCount); 147 TopDocs tds = res.topDocs(start, pageSize); 148 ScoreDoc[] sd = tds.scoreDocs; 149 Analyzer analyzer = new IKAnalyzer(); 150 for (int i = 0; i < sd.length; i++) { 151 Document hitDoc = reader.document(sd[i].doc); 152 list.add(createObj(hitDoc, analyzer, highlighter)); 153 } 154 155 } catch (Exception e) { 156 e.printStackTrace(); 157 } 158 159 return list; 160 161 } 162 163 /** 164 * 创建返回对象(高亮) 165 */ 
166 167 private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) { 168 169 Gk_infoSub gk = new Gk_infoSub(); 170 try { 171 172 if (doc != null) { 173 gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 174 gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 175 String title = StringUtil.null2String(doc.get("title")); 176 gk.setTitle(title); 177 if (!"".equals(title)) { 178 highlighter.setTextFragmenter(new SimpleFragmenter(title.length())); 179 TokenStream tk = analyzer.tokenStream("title", new StringReader(title)); 180 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title)); 181 if (!"".equals(htext)) { 182 gk.setTitle(htext); 183 } 184 } 185 String keywords = StringUtil.null2String(doc.get("keywords")); 186 gk.setKeywords(keywords); 187 if (!"".equals(keywords)) { 188 highlighter.setTextFragmenter(new SimpleFragmenter(keywords.length())); 189 TokenStream tk = analyzer.tokenStream("keywords", new StringReader(keywords)); 190 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords)); 191 if (!"".equals(htext)) { 192 gk.setKeywords(htext); 193 } 194 } 195 String describes = StringUtil.null2String(doc.get("describes")); 196 gk.setDescribes(describes); 197 if (!"".equals(describes)) { 198 highlighter.setTextFragmenter(new SimpleFragmenter(describes.length())); 199 TokenStream tk = analyzer.tokenStream("keywords", new StringReader(describes)); 200 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes)); 201 if (!"".equals(htext)) { 202 gk.setDescribes(htext); 203 } 204 } 205 206 } 207 return gk; 208 } 209 catch (Exception e) { 210 211 e.printStackTrace(); 212 return null; 213 } 214 finally { 215 gk = null; 216 } 217 218 } 219 220 private synchronized static Object createObj(Document doc) { 221 222 Gk_infoSub gk = new Gk_infoSub(); 223 try { 224 225 if (doc != null) { 226 gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 227 
gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 228 gk.setTitle(StringUtil.null2String(doc.get("title"))); 229 gk.setKeywords(StringUtil.null2String(doc.get("keywords"))); 230 gk.setDescribes(StringUtil.null2String(doc.get("describes"))); 231 } 232 return gk; 233 } 234 catch (Exception e) { 235 236 e.printStackTrace(); 237 return null; 238 } 239 finally { 240 gk = null; 241 } 242 243 } 244 }
单字段查询:
1 long a = System.currentTimeMillis(); 2 try { 3 int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 4 int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 5 String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title"))); 6 LuceneQuery lu = new LuceneQuery(); 7 form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize)); 8 form.addResult("curPage", lu.getCurrentPage()); 9 form.addResult("pageSize", lu.getPageSize()); 10 form.addResult("rowCount", lu.getRowCount()); 11 form.addResult("pageCount", lu.getPages()); 12 } catch (Exception e) { 13 e.printStackTrace(); 14 } 15 long b = System.currentTimeMillis(); 16 long c = b - a; 17 System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
1 long a = System.currentTimeMillis(); 2 try { 3 int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 4 int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 5 String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword"))); 6 String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword"))); 7 String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword"))); 8 LuceneQuery lu = new LuceneQuery(); 9 form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize)); 10 form.addResult("curPage", lu.getCurrentPage()); 11 form.addResult("pageSize", lu.getPageSize()); 12 form.addResult("rowCount", lu.getRowCount()); 13 form.addResult("pageCount", lu.getPages()); 14 } catch (Exception e) { 15 e.printStackTrace(); 16 } 17 long b = System.currentTimeMillis(); 18 long c = b - a; 19 System.out.println("[高级检索花费时间:" + c + "毫秒]");
4、Lucene通配符查询
1 BooleanQuery bQuery = new BooleanQuery(); //组合查询 2 if (!"".equals(title)) { 3 WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*")); 4 bQuery.add(w1, BooleanClause.Occur.MUST); //AND 5 } 6 int hm = start + pageSize; 7 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 8 searcher.search(bQuery, res);
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
1 BooleanQuery bQuery = new BooleanQuery(); 2 BooleanQuery b1 = new BooleanQuery(); 3 WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*")); 4 WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*")); 5 b1.add(w1, BooleanClause.Occur.MUST);//AND 6 b1.add(w2, BooleanClause.Occur.MUST);//AND 7 bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR 8 BooleanQuery b2 = new BooleanQuery(); 9 WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*")); 10 WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*")); 11 WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1")); 12 b2.add(w3, BooleanClause.Occur.MUST);//AND 13 b2.add(w4, BooleanClause.Occur.MUST);//AND 14 b2.add(w5, BooleanClause.Occur.MUST);//AND 15 bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR
6、Lucene先根据时间排序后分页
1 int hm = start + pageSize; 2 Sort sort = new Sort(new SortField("pdate", SortField.STRING, true)); 3 TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false); 4 searcher.search(bQuery, res); 5 this.rowCount = res.getTotalHits(); 6 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 7 TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize); 8 ScoreDoc[] sd = tds.scoreDocs; 9 System.out.println("rowCount:" + rowCount); 10 int i=0; 11 for (ScoreDoc scoreDoc : sd) { 12 i++; 13 if(i<start){ 14 continue; 15 } 16 if(i>hm){ 17 break; 18 } 19 Document doc = searcher.doc(scoreDoc.doc); 20 list.add(createObj(doc)); 21 }
注意:上面这种写法效率不高——每次请求都要执行两次查询,并且第二次查询会取回全部命中结果再在内存里跳页。更合理的做法是只执行一次带 Sort 参数的查询并截取所需区间,或者在创建索引时就按时间字段有序写入,之后直接使用分页方法,避免这样的二次查询。