Lucene开发实例:Lucene中文分词(转载)
1、准备工作
下载lucene 3.6.1 : http://lucene.apache.org/
下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug)
下载solr 3.6.1: http://lucene.apache.org/solr/(编译IK Analyzer时需引用包)
OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。
2、从Oracle数据库中取数据创建索引(使用IK分词)
1 package lucene.util; 2 3 import org.apache.lucene.index.IndexWriter; 4 import org.apache.lucene.index.IndexWriterConfig; 5 import org.apache.lucene.index.CorruptIndexException; 6 import org.apache.lucene.store.FSDirectory; 7 import org.apache.lucene.store.Directory; 8 import org.apache.lucene.analysis.Analyzer; 9 import org.apache.lucene.analysis.standard.StandardAnalyzer; 10 import org.apache.lucene.util.Version; 11 import org.apache.lucene.document.Document; 12 import org.apache.lucene.document.Field; 13 import org.wltea.analyzer.lucene.IKAnalyzer; 14 15 import java.sql.Connection; 16 import java.io.File; 17 import java.io.IOException; 18 import java.util.ArrayList; 19 import java.util.Date; 20 21 import modules.gk.Gk_info; 22 import modules.gk.Gk_infoSub; 23 import web.sys.Globals; 24 import web.db.DBConnector; 25 import web.db.ObjectCtl; 26 import web.util.StringUtil; 27 //Wizzer.cn 28 public class LuceneIndex { 29 IndexWriter writer = null; 30 FSDirectory dir = null; 31 boolean create = true; 32 33 public void init() { 34 long a1 = System.currentTimeMillis(); 35 System.out.println("[Lucene 开始执行:" + new Date() + "]"); 36 Connection con = DBConnector.getconecttion(); //取得一个数据库连接 37 try { 38 final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString());//E:\lucene 39 if (!docDir.exists()) { 40 docDir.mkdirs(); 41 } 42 String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString();//true or false 43 if ("false".equals(cr.toLowerCase())) { 44 create = false; 45 } 46 Directory dir = FSDirectory.open(docDir); 47 // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); 48 Analyzer analyzer = new IKAnalyzer(true); 49 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); 50 if (create) { 51 // Create a new index in the directory, removing any 52 // previously indexed documents: 53 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 54 } else { 55 // Add new documents to an existing index: 56 
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); 57 } 58 IndexWriter writer = new IndexWriter(dir, iwc); 59 String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 "; 60 int rowCount = ObjectCtl.getRowCount(con, sql); 61 int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString()); //每页记录数 62 int pages = (rowCount - 1) / pageSize + 1; //计算总页数 63 ArrayList list = null; 64 Gk_infoSub gk = null; 65 for (int i = 1; i < pages+1; i++) { 66 long a = System.currentTimeMillis(); 67 list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub()); 68 for (int j = 0; j < list.size(); j++) { 69 gk = (Gk_infoSub) list.get(j); 70 Document doc = new Document(); 71 doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//主键不分词 72 doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED)); 73 doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED)); 74 doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));//日期不分词 75 doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED)); 76 writer.addDocument(doc); 77 ObjectCtl.executeUpdateBySql(con,"UPDATE TABLEA SET SSTAG=1 WHERE indexno='"+gk.getIndexno()+"'");//更新已索引状态 78 } 79 80 long b = System.currentTimeMillis(); 81 long c = b - a; 82 System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + c + "毫秒]"); 83 } 84 writer.commit(); 85 86 } catch (Exception e) { 87 e.printStackTrace(); 88 } finally { 89 DBConnector.freecon(con); //释放数据库连接 90 try { 91 if (writer != null) { 92 writer.close(); 93 } 94 } catch (CorruptIndexException e) { 95 e.printStackTrace(); 96 } catch (IOException e) { 97 e.printStackTrace(); 98 } finally { 99 try { 100 if (dir 
!= null && IndexWriter.isLocked(dir)) { 101 IndexWriter.unlock(dir);//注意解锁 102 } 103 } catch (IOException e) { 104 e.printStackTrace(); 105 } 106 } 107 } 108 long b1 = System.currentTimeMillis(); 109 long c1 = b1 - a1; 110 System.out.println("[Lucene 执行完毕,花费时间:" + c1 + "毫秒,完成时间:" + new Date() + "]"); 111 } 112 }
3、单字段查询以及多字段分页查询高亮显示
1 package lucene.util; 2 3 import org.apache.lucene.store.FSDirectory; 4 import org.apache.lucene.store.Directory; 5 import org.apache.lucene.search.*; 6 import org.apache.lucene.search.highlight.SimpleHTMLFormatter; 7 import org.apache.lucene.search.highlight.Highlighter; 8 import org.apache.lucene.search.highlight.SimpleFragmenter; 9 import org.apache.lucene.search.highlight.QueryScorer; 10 import org.apache.lucene.queryParser.QueryParser; 11 import org.apache.lucene.queryParser.MultiFieldQueryParser; 12 import org.apache.lucene.analysis.TokenStream; 13 import org.apache.lucene.analysis.Analyzer; 14 import org.apache.lucene.analysis.KeywordAnalyzer; 15 import org.apache.lucene.document.Document; 16 import org.apache.lucene.index.IndexReader; 17 import org.apache.lucene.index.Term; 18 import org.apache.lucene.util.Version; 19 import modules.gk.Gk_infoSub; 20 21 import java.util.ArrayList; 22 import java.io.File; 23 import java.io.StringReader; 24 import java.lang.reflect.Constructor; 25 26 import web.util.StringUtil; 27 import web.sys.Globals; 28 import org.wltea.analyzer.lucene.IKAnalyzer; 29 //Wizzer.cn 30 public class LuceneQuery { 31 private static String indexPath;// 索引生成的目录 32 private int rowCount;// 记录数 33 private int pages;// 总页数 34 private int currentPage;// 当前页数 35 private int pageSize; //每页记录数 36 37 public LuceneQuery() { 38 this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString(); 39 } 40 41 public int getRowCount() { 42 return rowCount; 43 } 44 45 public int getPages() { 46 return pages; 47 } 48 49 public int getPageSize() { 50 return pageSize; 51 } 52 53 public int getCurrentPage() { 54 return currentPage; 55 } 56 57 /** 58 * 函数功能:根据字段查询索引 59 */ 60 public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) { 61 ArrayList list = new ArrayList(); 62 try { 63 if (curpage <= 0) { 64 curpage = 1; 65 } 66 if (pageSize <= 0) { 67 pageSize = 20; 68 } 69 this.pageSize = pageSize; //每页记录数 70 this.currentPage = curpage; //当前页 71 
int start = (curpage - 1) * pageSize; 72 Directory dir = FSDirectory.open(new File(indexPath)); 73 IndexReader reader = IndexReader.open(dir); 74 IndexSearcher searcher = new IndexSearcher(reader); 75 Analyzer analyzer = new IKAnalyzer(true); 76 QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer); 77 queryParser.setDefaultOperator(QueryParser.AND_OPERATOR); 78 Query query = queryParser.parse(keyWord); 79 int hm = start + pageSize; 80 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 81 searcher.search(query, res); 82 83 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 84 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); 85 this.rowCount = res.getTotalHits(); 86 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 87 TopDocs tds = res.topDocs(start, pageSize); 88 ScoreDoc[] sd = tds.scoreDocs; 89 for (int i = 0; i < sd.length; i++) { 90 Document hitDoc = reader.document(sd[i].doc); 91 list.add(createObj(hitDoc, analyzer, highlighter)); 92 } 93 94 } catch (Exception e) { 95 e.printStackTrace(); 96 } 97 98 return list; 99 100 } 101 /** 102 * 函数功能:根据字段查询索引 103 */ 104 public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) { 105 ArrayList list = new ArrayList(); 106 try { 107 if (curpage <= 0) { 108 curpage = 1; 109 } 110 if (pageSize <= 0) { 111 pageSize = 20; 112 } 113 this.pageSize = pageSize; //每页记录数 114 this.currentPage = curpage; //当前页 115 int start = (curpage - 1) * pageSize; 116 Directory dir = FSDirectory.open(new File(indexPath)); 117 IndexReader reader = IndexReader.open(dir); 118 IndexSearcher searcher = new IndexSearcher(reader); 119 BooleanQuery bQuery = new BooleanQuery(); //组合查询 120 if (!"".equals(allkeyword)) {//包含全部关键词 121 KeywordAnalyzer analyzer = new KeywordAnalyzer(); 122 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, 
BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//AND 123 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 124 bQuery.add(query, BooleanClause.Occur.MUST); //AND 125 } 126 if (!"".equals(onekeyword)) { //包含任意关键词 127 Analyzer analyzer = new IKAnalyzer(true); 128 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//OR 129 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 130 bQuery.add(query, BooleanClause.Occur.MUST); //AND 131 } 132 if (!"".equals(nokeyword)) { //排除关键词 133 Analyzer analyzer = new IKAnalyzer(true); 134 BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD};//NOT 135 Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer); 136 bQuery.add(query, BooleanClause.Occur.MUST_NOT); //AND 137 138 } 139 int hm = start + pageSize; 140 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 141 searcher.search(bQuery, res); 142 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>"); 143 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery)); 144 this.rowCount = res.getTotalHits(); 145 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 146 System.out.println("rowCount:" + rowCount); 147 TopDocs tds = res.topDocs(start, pageSize); 148 ScoreDoc[] sd = tds.scoreDocs; 149 Analyzer analyzer = new IKAnalyzer(); 150 for (int i = 0; i < sd.length; i++) { 151 Document hitDoc = reader.document(sd[i].doc); 152 list.add(createObj(hitDoc, analyzer, highlighter)); 153 } 154 155 } catch (Exception e) { 156 e.printStackTrace(); 157 } 158 159 return list; 160 161 } 162 163 /** 164 * 创建返回对象(高亮) 165 */ 
166 167 private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) { 168 169 Gk_infoSub gk = new Gk_infoSub(); 170 try { 171 172 if (doc != null) { 173 gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 174 gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 175 String title = StringUtil.null2String(doc.get("title")); 176 gk.setTitle(title); 177 if (!"".equals(title)) { 178 highlighter.setTextFragmenter(new SimpleFragmenter(title.length())); 179 TokenStream tk = analyzer.tokenStream("title", new StringReader(title)); 180 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title)); 181 if (!"".equals(htext)) { 182 gk.setTitle(htext); 183 } 184 } 185 String keywords = StringUtil.null2String(doc.get("keywords")); 186 gk.setKeywords(keywords); 187 if (!"".equals(keywords)) { 188 highlighter.setTextFragmenter(new SimpleFragmenter(keywords.length())); 189 TokenStream tk = analyzer.tokenStream("keywords", new StringReader(keywords)); 190 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords)); 191 if (!"".equals(htext)) { 192 gk.setKeywords(htext); 193 } 194 } 195 String describes = StringUtil.null2String(doc.get("describes")); 196 gk.setDescribes(describes); 197 if (!"".equals(describes)) { 198 highlighter.setTextFragmenter(new SimpleFragmenter(describes.length())); 199 TokenStream tk = analyzer.tokenStream("keywords", new StringReader(describes)); 200 String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes)); 201 if (!"".equals(htext)) { 202 gk.setDescribes(htext); 203 } 204 } 205 206 } 207 return gk; 208 } 209 catch (Exception e) { 210 211 e.printStackTrace(); 212 return null; 213 } 214 finally { 215 gk = null; 216 } 217 218 } 219 220 private synchronized static Object createObj(Document doc) { 221 222 Gk_infoSub gk = new Gk_infoSub(); 223 try { 224 225 if (doc != null) { 226 gk.setIndexno(StringUtil.null2String(doc.get("indexno"))); 227 
gk.setPdate(StringUtil.null2String(doc.get("pdate"))); 228 gk.setTitle(StringUtil.null2String(doc.get("title"))); 229 gk.setKeywords(StringUtil.null2String(doc.get("keywords"))); 230 gk.setDescribes(StringUtil.null2String(doc.get("describes"))); 231 } 232 return gk; 233 } 234 catch (Exception e) { 235 236 e.printStackTrace(); 237 return null; 238 } 239 finally { 240 gk = null; 241 } 242 243 } 244 }
单字段查询:
1 long a = System.currentTimeMillis(); 2 try { 3 int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 4 int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 5 String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title"))); 6 LuceneQuery lu = new LuceneQuery(); 7 form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize)); 8 form.addResult("curPage", lu.getCurrentPage()); 9 form.addResult("pageSize", lu.getPageSize()); 10 form.addResult("rowCount", lu.getRowCount()); 11 form.addResult("pageCount", lu.getPages()); 12 } catch (Exception e) { 13 e.printStackTrace(); 14 } 15 long b = System.currentTimeMillis(); 16 long c = b - a; 17 System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
1 long a = System.currentTimeMillis(); 2 try { 3 int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage"))); 4 int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize"))); 5 String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword"))); 6 String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword"))); 7 String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword"))); 8 LuceneQuery lu = new LuceneQuery(); 9 form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize)); 10 form.addResult("curPage", lu.getCurrentPage()); 11 form.addResult("pageSize", lu.getPageSize()); 12 form.addResult("rowCount", lu.getRowCount()); 13 form.addResult("pageCount", lu.getPages()); 14 } catch (Exception e) { 15 e.printStackTrace(); 16 } 17 long b = System.currentTimeMillis(); 18 long c = b - a; 19 System.out.println("[高级检索花费时间:" + c + "毫秒]");
4、Lucene通配符查询
1 BooleanQuery bQuery = new BooleanQuery(); //组合查询 2 if (!"".equals(title)) { 3 WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*")); 4 bQuery.add(w1, BooleanClause.Occur.MUST); //AND 5 } 6 int hm = start + pageSize; 7 TopScoreDocCollector res = TopScoreDocCollector.create(hm, false); 8 searcher.search(bQuery, res);
5、Lucene嵌套查询
实现SQL:(unitid like 'unitid%' and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
1 BooleanQuery bQuery = new BooleanQuery(); 2 BooleanQuery b1 = new BooleanQuery(); 3 WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*")); 4 WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*")); 5 b1.add(w1, BooleanClause.Occur.MUST);//AND 6 b1.add(w2, BooleanClause.Occur.MUST);//AND 7 bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR 8 BooleanQuery b2 = new BooleanQuery(); 9 WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*")); 10 WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*")); 11 WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1")); 12 b2.add(w3, BooleanClause.Occur.MUST);//AND 13 b2.add(w4, BooleanClause.Occur.MUST);//AND 14 b2.add(w5, BooleanClause.Occur.MUST);//AND 15 bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR
6、Lucene先根据时间排序后分页
1 int hm = start + pageSize; 2 Sort sort = new Sort(new SortField("pdate", SortField.STRING, true)); 3 TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false); 4 searcher.search(bQuery, res); 5 this.rowCount = res.getTotalHits(); 6 this.pages = (rowCount - 1) / pageSize + 1; //计算总页数 7 TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize); 8 ScoreDoc[] sd = tds.scoreDocs; 9 System.out.println("rowCount:" + rowCount); 10 int i=0; 11 for (ScoreDoc scoreDoc : sd) { 12 i++; 13 if(i<start){ 14 continue; 15 } 16 if(i>hm){ 17 break; 18 } 19 Document doc = searcher.doc(scoreDoc.doc); 20 list.add(createObj(doc)); 21 }
注意:上面这种写法效率不高——每次请求都要执行两次查询,并且第二次查询会取回全部命中结果再在内存里跳页。更合理的做法是只执行一次带 Sort 参数的查询并截取所需区间,或者在创建索引时就按时间字段有序写入,之后直接使用分页方法,避免这样的二次查询。