lucene之中文分词及其高亮显示(五)
中文分词:即换个分词器
Analyzer analyzer = new StandardAnalyzer();// 标准分词器 换成 SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();//要加入lucene-analyzers-common-5.3.1.jar
package com.wp.lucene; import java.nio.file.Paths; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class Indexer { private Integer ids[] = { 1, 2, 3 }; private String citys[] = { "青岛", "南京", "上海" }; private String descs[] = { "青岛是一个美丽的城市。", "南京是一个有文化的城市。南京是一个文化的城市南京,简称宁,是江苏省会,地处中国东部地区,长江下游,濒江近海。全市下辖11个区,总面积6597平方公里,2013年建成区面积752.83平方公里,常住人口818.78万,其中城镇人口659.1万人。[1-4] “江南佳丽地,金陵帝王州”,南京拥有着6000多年文明史、近2600年建城史和近500年的建都史,是中国四大古都之一,有“六朝古都”、“十朝都会”之称,是中华文明的重要发祥地,历史上曾数次庇佑华夏之正朔,长期是中国南方的政治、经济、文化中心,拥有厚重的文化底蕴和丰富的历史遗存。[5-7] 南京是国家重要的科教中心,自古以来就是一座崇文重教的城市,有“天下文枢”、“东南第一学”的美誉。截至2013年,南京有高等院校75所,其中211高校8所,仅次于北京上海;国家重点实验室25所、国家重点学科169个、两院院士83人,均居中国第三。[8-10] 。", "上海是一个繁华的城市。" }; private Directory dir; /** * 获取IndexWriter实例 * * @return * @throws Exception */ private IndexWriter getWriter() throws Exception { // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器 SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); IndexWriter writer = new IndexWriter(dir, iwc); return writer; } /** * 生成索引 * * @param indexDir * @throws Exception */ private void index(String indexDir) throws Exception { dir = FSDirectory.open(Paths.get(indexDir)); IndexWriter writer = getWriter(); for (int i = 0; i < ids.length; i++) { Document doc = new Document(); doc.add(new IntField("id", ids[i], Field.Store.YES)); doc.add(new StringField("city", citys[i], Field.Store.YES)); doc.add(new TextField("desc", descs[i], Field.Store.YES)); writer.addDocument(doc); // 添加文档 } writer.close(); } public static void main(String[] args) throws Exception { new Indexer().index("D:\\lucene\\luceneIndex"); } }
高亮部分:在得到搜索结果后
QueryScorer scorer = new QueryScorer(query);// 查询得分
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);// 得到得分的片段,就是得到一段包含所查询的关键字的摘要
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(
"<b><font color='red'>", "</font></b>");// 对查询的数据格式化;无参构造器的默认是将关键字加粗
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);// 根据得分和格式化
highlighter.setTextFragmenter(fragmenter);// 设置成高亮
package com.wp.lucene; import java.io.StringReader; import java.nio.file.Paths; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Fragmenter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; public class Searcher { public static void search(String indexDir, String q) throws Exception { Directory dir = FSDirectory.open(Paths.get(indexDir)); IndexReader reader = DirectoryReader.open(dir); IndexSearcher is = new IndexSearcher(reader); // Analyzer analyzer=new StandardAnalyzer(); // 标准分词器 SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); QueryParser parser = new QueryParser("desc", analyzer); Query query = parser.parse(q); long start = System.currentTimeMillis(); TopDocs hits = is.search(query, 10); long end = System.currentTimeMillis(); System.out.println("匹配 " + q + " ,总共花费" + (end - start) + "毫秒" + "查询到" + hits.totalHits + "个记录"); QueryScorer scorer = new QueryScorer(query);// 查询得分 Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);// 得到得分的片段,就是得到一段包含所查询的关键字的摘要 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter( "<b><font color='red'>", "</font></b>");// 对查询的数据格式化;无参构造器的默认是将关键字加粗 Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);// 根据得分和格式化 highlighter.setTextFragmenter(fragmenter);// 设置成高亮 for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = is.doc(scoreDoc.doc); System.out.println(doc.get("city")); System.out.println(doc.get("desc")); String desc = doc.get("desc"); if (desc != null) { TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));// TokenStream将查询出来的搞成片段,得到的是整个内容 System.out.println(highlighter.getBestFragment(tokenStream, desc));// 将权重高的摘要显示出来,得到的是关键字内容 } } reader.close(); } public static void main(String[] args) { String indexDir = "D:\\lucene6"; String q = "南京文明"; try { search(indexDir, q); } catch (Exception e) { e.printStackTrace(); } } }
Java小生店铺:
Pc端:http://shop125970977.taobao.com/index.htm
手机端:搜索 java小生店铺
希望店铺的资料能帮助到你!!!