Lucene 3.0.2 Search
1. Lucene term frequency
Reposted from: http://mxdxm.iteye.com/blog/989031
Lucene in Action, true to the "in Action" series, is a genuinely practical book. It devotes most of its pages to query parsing, result tuning, and Lucene applications, which makes it a good fit for anyone learning to build full-text search. But Lucene's capabilities are by no means limited to search engines. If I had not recently come across an article on using Lucene for term-frequency and document statistics, I might still be struggling to find a tool suitable for research work. In fact, Lucene easily covers the requirements from an information-retrieval course, for example:

**Statistics: implement the following functions**
(1) count a term's document frequency (DF) over the whole collection;
(2) count a term's total number of occurrences over the whole collection (collection term frequency);
(3) count a term's frequency within a given document (term frequency, TF);
(4) list the positions at which a term occurs in a given document;
(5) count the number of documents in the collection.
Another reference: http://www.360doc.com/content/11/0427/03/1947337_112596569.shtml
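All five statistics reduce to a handful of `IndexReader` calls in the Lucene 3.x API. Here is a minimal sketch of that mapping (the index path "index" and the field/term "contents"/"lucene" are placeholders for illustration, not taken from the program below); the full listing that follows prints the same information for every term in the index:

```java
import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.FSDirectory;

public class TermStats {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("index")), true); // read-only
        Term term = new Term("contents", "lucene"); // placeholder field and term

        // (1) document frequency: how many documents contain the term
        System.out.println("DF = " + reader.docFreq(term));

        // (2) + (3) collection term frequency, summed from per-document frequencies
        long ctf = 0;
        TermDocs td = reader.termDocs(term);
        while (td.next()) {
            ctf += td.freq(); // (3) TF of the term in document td.doc()
        }
        System.out.println("collection TF = " + ctf);

        // (4) positions of the term inside each document that contains it
        TermPositions tp = reader.termPositions(term);
        while (tp.next()) {
            for (int i = 0; i < tp.freq(); i++) {
                System.out.println("doc " + tp.doc() + " position " + tp.nextPosition());
            }
        }

        // (5) number of documents in the collection
        System.out.println("numDocs = " + reader.numDocs());
        reader.close();
    }
}
```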
```java
package lia.meetlucene;

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnum {

    public static void printIndex(IndexReader reader) throws Exception {
        // Enumerate all terms, printing <document, term freq, position*> for each
        TermEnum termEnum = reader.terms();
        while (termEnum.next()) {
            System.out.println("\nTerm in field " + termEnum.term().field()
                    + ": " + termEnum.term().text());
            System.out.println("  document frequency = " + termEnum.docFreq());
            TermPositions termPositions = reader.termPositions(termEnum.term());
            int i = 0;
            while (termPositions.next()) {
                System.out.println((i++) + "-> doc id: " + termPositions.doc()
                        + ", occurrences: " + termPositions.freq() + ", positions:");
                for (int j = 0; j < termPositions.freq(); j++) {
                    System.out.println("[" + termPositions.nextPosition() + "]");
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            throw new IllegalArgumentException("Usage: java "
                    + Searchnum.class.getName() + " <index dir>");
        }
        String indexDir = args[0];                             // 1: index path
        Directory dir = FSDirectory.open(new File(indexDir));  // 3: open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);
    }
}
```
Result:

```
Term in field contents: 精神
  document frequency = 1
0-> doc id: 0, occurrences: 1, positions:
[388]

Term in field contents: 繁荣
  document frequency = 1
0-> doc id: 0, occurrences: 3, positions:
[254]
[353]
[450]

Term in field contents: 给予
  document frequency = 1
0-> doc id: 0, occurrences: 1, positions:
[85]
```
Reference: http://hanyuanbo.iteye.com/blog/812847
2. Counting occurrences of a term (which depends on the analyzer used at indexing time)
```java
package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumber {
    public static void main(String[] args) throws CorruptIndexException, IOException {
        String indexDir = args[0]; // 1: index path
        String q = args[1];        // 2: query keyword
        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            IndexReader reader = is.getIndexReader();
            // Print every stored document
            for (int i = 0; i < reader.numDocs(); i++) {
                System.out.println(reader.document(i));
            }
            // Walk the postings for the keyword and print its frequency per document
            Term term = new Term("contents", keyword);
            TermDocs docs = reader.termDocs(term);
            while (docs.next()) {
                System.out.print("doc num\t" + docs.doc() + "\t");
                System.out.println("frequency:\t" + docs.freq());
            }
            reader.close();
            is.close();
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
Result:

```
Document<stored,indexed<filename:commentbyme.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\commentbyme.txt>>
Document<stored,indexed<filename:gettrendweek.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\gettrendweek.txt>>
Document<stored,indexed<filename:no.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\no.txt>>
Document<stored,indexed<filename:showuser.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\showuser.txt>>
Document<stored,indexed<filename:suggestionusermayinst.txt> stored,indexed<fullpath:C:\Users\Administrator\Desktop\xdj\data\suggestionusermayinst.txt>>
doc num 0	frequency: 15
doc num 2	frequency: 2
doc num 3	frequency: 1
doc num 4	frequency: 30
```
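If only the number of documents containing the term is needed (its DF) rather than the per-document frequencies, `IndexReader.docFreq(Term)` returns it directly without walking the postings. A small fragment that could drop into the `search` method above, reusing its `reader` and `keyword` variables:

```java
// Document frequency only: how many documents contain the term (Lucene 3.x)
Term term = new Term("contents", keyword);
int df = reader.docFreq(term);
System.out.println("document frequency = " + df);
```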
3. Counting how many documents contain a keyword
```java
package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searchnumbers {
    public static void main(String[] args) throws CorruptIndexException, IOException {
        String indexDir = args[0]; // 1: index path
        String q = args[1];        // 2: query string
        search(indexDir, q);
    }

    public static void search(String indexDir, String keyword) {
        try {
            Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
            IndexSearcher is = new IndexSearcher(dir, true);
            // The QueryParser targets the "contents" field of each Document
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                    new StandardAnalyzer(Version.LUCENE_30));
            Query query = parser.parse(keyword);
            TopScoreDocCollector collector = TopScoreDocCollector.create(100, false);
            long start = new Date().getTime();
            // Run the query; the hits accumulate in the TopScoreDocCollector
            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc);
                // Print the "filename" field of each matching document
                System.out.println(doc.getField("filename") + "\t" + hits[i].toString());
            }
            is.close();
            long end = new Date().getTime();
            System.out.println("Found " + collector.getTotalHits()
                    + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query '" + keyword + "'");
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
```
4. Counting occurrences of a keyword among the indexed terms (tokens). (Same as section 2)
The code is identical to `Searchnumber` in section 2, except that a separator line is printed before each hit:

```java
while (docs.next()) {
    System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
    System.out.print("doc num\t" + docs.doc() + "\t");
    System.out.println("frequency:\t" + docs.freq());
}
```
```
Document<stored,indexed<filename:texthz.txt> stored,indexed<fullpath:E:\xdj\weibodata\text\texthz.txt>>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc num 0	frequency: 27254
```
5. Checking whether documents contain a keyword and printing the matching file names
```java
package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

// From chapter 1 of Lucene in Action
/**
 * This code was originally written for Erik's Lucene intro java.net article
 */
public class Searcher {
    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName() + " <index dir> <query>");
        }
        String indexDir = args[0]; // 1: index path
        String q = args[1];        // 2: query string
        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException, ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir);
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "contents",
                new SmartChineseAnalyzer(Version.LUCENE_30));
        Query query = parser.parse(q);
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits +           // 6: report search stats
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" + q + "':");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);                 // 7: retrieve matching document
            System.out.println(doc.get("fullpath") + " " + scoreDoc.doc); // 8: display file path
        }
        is.close(); // 9: close the IndexSearcher
    }
}
```
```
Found 1 document(s) (in 16 milliseconds) that matched query '雨钝':
E:\xdj\weibodata\text\texthz.txt 0
```
6. Sorted output of query results
```java
package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class Searchnumbers {

    /**
     * Build the index.<br>
     * Four documents in total; each has two fields: "text" holds the content,
     * "size" is used for sorting.
     */
    private static void build() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("index")),
                new StandardAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text", "google", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "1", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "google earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "2", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "3", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        document = new Document();
        document.add(new Field("text", "baidu earth apache", Store.YES, Index.ANALYZED));
        document.add(new Field("size", "4", Store.YES, Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }

    /**
     * Lucene 3.0 no longer has methods returning Hits; search with TopDocs instead.
     *
     * @param keyword the keyword to search for
     */
    private static void searchWithTopDocs(String keyword)
            throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text",
                new StandardAnalyzer(Version.LUCENE_30));
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        // Take the top 10 results, or fewer if there aren't that many
        TopDocs topDocs = searcher.search(parser.parse(keyword), 10);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            int docNum = scoreDoc.doc;    // document id
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // relevance score
            System.out.println(text + " " + size + " " + score);
        }
    }

    /**
     * Sorted search over the hits; this too returns TopFieldDocs rather than Hits.
     *
     * @param keyword the keyword to search for
     */
    private static void searchWithSort(String keyword)
            throws CorruptIndexException, IOException, ParseException {
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text",
                new StandardAnalyzer(Version.LUCENE_30));
        Searcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        SortField sortField = new SortField("size", SortField.INT, true); // the field to sort on
        // The second argument is a filter, not needed in this example
        TopFieldDocs topFieldDocs = searcher.search(query, null, 10, new Sort(sortField));
        ScoreDoc[] scoreDocs = topFieldDocs.scoreDocs;
        System.out.println("hits:" + topFieldDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            int docNum = scoreDoc.doc;
            Document doc = searcher.doc(docNum);
            String text = doc.get("text");
            String size = doc.get("size");
            float score = scoreDoc.score; // unusable here: sorted searches leave every score as NaN
            System.out.println(text + " " + size + " " + score);
        }
    }

    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {
        build();
        String keyword = "google";
        searchWithTopDocs(keyword);
        System.out.println("---------");
        searchWithSort(keyword);
    }
}
```
```
hits:2
google 1 1.287682
google earth apache 2 0.643841
---------
hits:2
google earth apache 2 NaN
google 1 NaN
```
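The NaN scores in the sorted output are expected: a sorted search does not compute relevance scores by default. If both the sort order and the scores are needed, a `TopFieldCollector` can be created with score tracking enabled. A minimal sketch, written as a drop-in addition to the `Searchnumbers` class above (the method name is my own):

```java
// Sorted search that also tracks relevance scores (Lucene 3.x)
private static void searchWithSortAndScores(String keyword)
        throws CorruptIndexException, IOException, ParseException {
    QueryParser parser = new QueryParser(Version.LUCENE_30, "text",
            new StandardAnalyzer(Version.LUCENE_30));
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
    Query query = parser.parse(keyword);
    Sort sort = new Sort(new SortField("size", SortField.INT, true));
    org.apache.lucene.search.TopFieldCollector collector =
            org.apache.lucene.search.TopFieldCollector.create(
                    sort, 10,
                    true,   // fillFields: record the sort-field values per hit
                    true,   // trackDocScores: compute a relevance score per hit
                    true,   // trackMaxScore: compute the maximum score
                    false); // docsScoredInOrder
    searcher.search(query, collector);
    for (ScoreDoc scoreDoc : collector.topDocs().scoreDocs) {
        Document doc = searcher.doc(scoreDoc.doc);
        // score is no longer NaN
        System.out.println(doc.get("text") + " " + doc.get("size") + " " + scoreDoc.score);
    }
    searcher.close();
}
```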
7. Highlighting keywords with Lucene
```java
package lia.meetlucene;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class Searchnum {

    /**
     * Lucene 3.0 replaced the old tokenization API with a new one.<br>
     * This method uses SmartChineseAnalyzer to demonstrate how to tokenize
     * text and read the resulting terms.
     */
    public static void analysis() throws Exception {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        String string = "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?";
        StringReader reader = new StringReader(string);
        TokenStream ts = analyzer.tokenStream("", reader);
        TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print(termAttribute.term() + " ");
        }
        System.out.println();
    }

    /**
     * Build the index.<br>
     * The IndexWriter constructor now requires a Directory argument.
     */
    private static void build() throws CorruptIndexException,
            LockObtainFailedException, IOException {
        String path = "index";
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)),
                new SmartChineseAnalyzer(Version.LUCENE_30), true, MaxFieldLength.LIMITED);
        Document document = new Document();
        document.add(new Field("text",
                "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?",
                Store.YES, Index.ANALYZED));
        writer.addDocument(document);
        writer.optimize();
        writer.close();
    }

    /**
     * Search; there is no longer any method that returns a Hits result either.
     */
    private static void search(String keyword) throws CorruptIndexException,
            IOException, ParseException, InvalidTokenOffsetsException {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));
        Query query = parser.parse(keyword);
        System.out.println(query);
        TopDocs topDocs = searcher.search(query, 10);
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        System.out.println("hits:" + topDocs.totalHits);
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            String text = doc.get("text");
            System.out.println(highlight(text, query, analyzer));
        }
    }

    /**
     * Highlight the keywords.
     *
     * @param content  the text to highlight
     * @param query    the Query used for the search
     * @param analyzer the analyzer
     * @return the highlighted text
     */
    private static String highlight(String content, Query query, Analyzer analyzer)
            throws IOException, InvalidTokenOffsetsException {
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(25));
        String resultString = highlighter.getBestFragment(
                analyzer.tokenStream("", new StringReader(content)), content);
        return resultString + "...";
    }

    public static void main(String[] args) throws Exception {
        analysis();
        build();
        search("人民币");
    }
}
```
```
中国 人民 银行 采取 了 一 系列 措施 防止 人民币 升值 但是 很 遗憾 这些 措施 在 今天 看来 其 作用 是 微乎其微 的 难道 真 的 就 没有 什么 别的 措施 防止 人民币 再次 疯狂 升值 了 吗
text:人民币
hits:1
中国人民银行采取了一系列措施防止<b>人民币</b>升值,但是...
```
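When the document is longer than a single fragment, `Highlighter.getBestFragments` can return several of the best-scoring fragments joined by a separator, instead of only the single best one. A small variation on the `highlight` method above (the method name and the fragment count of 3 are my own choices for illustration):

```java
// Return up to three best fragments instead of one, joined by "..."
private static String highlightFragments(String content, Query query, Analyzer analyzer)
        throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(25)); // 25-character fragments
    return highlighter.getBestFragments(
            analyzer.tokenStream("", new StringReader(content)), content, 3, "...");
}
```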
8. Printing the 1000 most frequent terms after tokenization
```java
package lia.meetlucene;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Searchnumbers {

    public static void printIndex(IndexReader reader) throws Exception {
        // Enumerate all terms, recording each term's total frequency
        TermEnum termEnum = reader.terms();
        List<Person_Term> listA = new ArrayList<Person_Term>();
        while (termEnum.next()) {
            Person_Term pa = new Person_Term();
            pa.setterm(termEnum.term().text());
            // Total frequency: sum the term's freq over every document containing it
            TermDocs termDocs = reader.termDocs(termEnum.term());
            int freq = 0;
            while (termDocs.next()) {
                freq += termDocs.freq();
            }
            pa.setfreq(freq);
            listA.add(pa);
        }
        // Sort descending by frequency
        Collections.sort(listA, new Comparator<Person_Term>() {
            public int compare(Person_Term arg0, Person_Term arg1) {
                return arg1.getfreq().compareTo(arg0.getfreq());
            }
        });
        // Print the top 1000 terms
        int i = 0;
        for (Person_Term p : listA) {
            i++;
            System.out.println(p.getterm() + "\t" + p.getfreq());
            if (i > 1000)
                break;
        }
    }

    public static void main(String[] args) throws Exception {
        String indexDir = "C:/Users/Administrator/Desktop/xdj/suoyin"; // 1: index path
        Directory dir = FSDirectory.open(new File(indexDir));          // 3: open the index
        IndexSearcher search = new IndexSearcher(dir);
        IndexReader reader = search.getIndexReader();
        printIndex(reader);
    }
}
```
The code for Person_Term.java:
```java
package lia.meetlucene;

public class Person_Term implements Comparable<Person_Term> {
    private String term;
    private Integer freq;

    /** @return the term */
    public String getterm() {
        return term;
    }

    /** @param term the term to set */
    public void setterm(String term) {
        this.term = term;
    }

    /** @return the freq */
    public Integer getfreq() {
        return freq;
    }

    /** @param freq the freq to set */
    public void setfreq(Integer freq) {
        this.freq = freq;
    }

    @Override
    public int compareTo(Person_Term arg0) {
        return this.getfreq().compareTo(arg0.getfreq());
    }
}
```
Output (partial):
```
文化	19
的	16
和	9
刘	8
中国	7
云	7
在	7
先达	3
好	3
委员	3
家中	3
就	3
C:\Users\Administrator\Desktop\xdj\weibohanzi\weibo.txt	1
```
9. Multi-condition search
```java
package lia.meetlucene;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

public class Searcherw {
    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {
        String indexDir = "E:/xdj/tengxunsuoying"; // 1: index path
        String q = "雨天";                          // 2: query string
        search(indexDir, q);
    }

    public static void search(String indexDir, String q) throws IOException, ParseException {
        Directory dir = FSDirectory.open(new File(indexDir)); // 3: open the index
        IndexSearcher is = new IndexSearcher(dir);
        QueryParser parser = new QueryParser(Version.LUCENE_30, // 4: parse the query string
                "context",
                new SmartChineseAnalyzer(Version.LUCENE_30));
        // Restrict hits to documents whose "time" field falls in October 2014
        Filter filter = new RangeFilter("time", "20141001", "20141031", true, true);
        Query query = parser.parse(q);
        query = new FilteredQuery(query, filter); // search with the filter applied
        long start = System.currentTimeMillis();
        // Sorted variant:
        // TopDocs hits = is.search(query, 10,
        //         new Sort(new SortField("time", SortField.STRING, true)));
        TopDocs hits = is.search(query, 10); // 5: search the index
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits +           // 6: report search stats
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" + q + "':");
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);                 // 7: retrieve matching document
            System.out.println(doc.get("time") + " " + doc.get("context"));
        }
        is.close();
    }
}

/*
 * Alternative: a numeric range can be expressed as a query rather than a filter:
 *   NumericRangeQuery rangeQuery =
 *       NumericRangeQuery.newDoubleRange("carPrice", 20.0, 40.0, true, true);
 *   TopDocs td = search.search(rangeQuery, 10000);
 */
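The FilteredQuery above combines one parsed query with one range restriction. When several conditions must all hold, they can also be combined with a BooleanQuery. A minimal sketch (the "context" and "time" field names and the date range follow the example above; the class name is my own):

```java
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;

public class BooleanSearch {
    public static TopDocs search(IndexSearcher is) throws IOException {
        BooleanQuery bq = new BooleanQuery();
        // Both clauses must match: the keyword and the date range
        bq.add(new TermQuery(new Term("context", "雨天")), Occur.MUST);
        bq.add(new TermRangeQuery("time", "20141001", "20141031", true, true), Occur.MUST);
        return is.search(bq, 10);
    }
}
```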