Full-Text Search Demo
This demo uses Lucene 6.3.0 with the IKAnalyzer tokenizer, which has good support for Chinese. A build of the IKAnalyzer jar compatible with Lucene 6.x and above can be downloaded here: https://pan.baidu.com/s/1i5DreTZ
Password: 2frx
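Before the full demo, here is a minimal sketch of what the tokenizer does on a Chinese sentence. It assumes the IK jar from the link above is on the classpath; the class name and sample sentence are illustrative only:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IkTokenizeDemo {
    public static void main(String[] args) throws IOException {
        // true enables smart segmentation, as in the search code below
        try (IKAnalyzer analyzer = new IKAnalyzer(true)) {
            TokenStream ts = analyzer.tokenStream("content", "全文检索的Demo");
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // print each token
            }
            ts.end();
            ts.close();
        }
    }
}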
package com.cn.shupu.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class LuceneUtils {

    // Directory where the index files are stored
    private static String indexPath = "D://index/path";

    // The file to index (this demo assumes a .txt file)
    private static String tarPath = "D://font//管理学(第四版).txt";

    // Build the index
    public static void createIndex() {
        File srcFile = new File(tarPath);
        File indexDir = new File(indexPath);
        boolean create = true;
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }
        try {
            // File-system index directory
            Directory directory = FSDirectory.open(Paths.get(indexPath));
            // In-memory index directory
            RAMDirectory ramDirectory = new RAMDirectory();

            // Create the analyzer. Alternatives:
            // 1. SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
            // 2. StandardAnalyzer analyzer = new StandardAnalyzer();
            // 3. IKAnalyzer (used here):
            IKAnalyzer analyzer = new IKAnalyzer();

            // IKAnalyzer initializes lazily on the first tokenization. To add extra
            // dictionary entries before that first use, initialize it manually:
            /*
             * Configuration cfg = DefaultConfig.getInstance(); // load the dictionaries
             * cfg.setUseSmart(true); // enable smart segmentation
             * org.wltea.analyzer.dic.Dictionary.initial(cfg);
             *
             * org.wltea.analyzer.dic.Dictionary dictionary =
             *         org.wltea.analyzer.dic.Dictionary.getSingleton();
             *
             * List<String> newWords = new ArrayList<>();
             * for (BaseLib book : books) {
             *     newWords.add(book.getName());
             * }
             * dictionary.addWords(newWords); // add custom words
             */

            // Writer configuration
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            IndexWriterConfig rwc = new IndexWriterConfig(analyzer);

            // The merge factor controls how often segments are merged: it determines
            // how many documents an index block holds before on-disk blocks are merged
            // into a larger one. A larger value makes indexing faster; the default is
            // 10, so it is worth raising it before building a large index.
            LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
            mergePolicy.setMergeFactor(100);
            iwc.setMergePolicy(mergePolicy);
            iwc.setRAMBufferSizeMB(2048);
            iwc.setMaxBufferedDocs(1000);

            if (create) {
                // Create a new index in the directory, removing any
                // previously indexed documents
                rwc.setOpenMode(OpenMode.CREATE);
            } else {
                // Add new documents to an existing index
                rwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }

            // Writer for the file-system index
            IndexWriter fileWriter = new IndexWriter(directory, iwc);
            // Writer for the in-memory index
            IndexWriter ramWriter = new IndexWriter(ramDirectory, rwc);

            // First add the document to the in-memory index. A field is the smallest
            // unit of an index, like a column in a database table.
            Document document = new Document();
            String content = getTxt(srcFile);
            // Store.YES stores the field value in the index; Store.NO indexes it
            // without storing
            document.add(new TextField("path", srcFile.getAbsolutePath(), Store.YES));
            document.add(new TextField("content", content, Store.YES));

            // Add or update depending on the writer's OpenMode
            if (ramWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
                System.out.println("adding " + srcFile.getAbsolutePath());
                ramWriter.addDocument(document);
            } else {
                System.out.println("updating " + srcFile.getAbsolutePath());
                // Note: "path" is a tokenized TextField, so this exact-match Term may
                // not find the old document; a StringField would be safer for updates
                ramWriter.updateDocument(new Term("path", srcFile.toString()), document);
            }
            ramWriter.close();

            // Copy the in-memory index into the file index for permanent storage;
            // the in-memory index disappears when the program exits
            fileWriter.addIndexes(new Directory[] { ramDirectory });
            fileWriter.forceMerge(1000);
            fileWriter.maybeMerge();
            fileWriter.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Read the content of a txt file.
     *
     * @param file the file to read
     * @return the file content
     */
    public static String getTxt(File file) {
        StringBuilder result = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "GBK"))) {
            String s;
            while ((s = br.readLine()) != null) { // read one line at a time
                result.append("\n").append(s);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result.toString();
    }

    public static void searchByKeyWord(String keyWord) {
        try {
            Directory directory = FSDirectory.open(Paths.get(indexPath));
            // IKAnalyzer; true enables smart segmentation
            IKAnalyzer analyzer = new IKAnalyzer(true);
            // Index reader
            IndexReader ireader = DirectoryReader.open(directory);
            // Index searcher
            IndexSearcher isearcher = new IndexSearcher(ireader);

            String[] stringQuery = { keyWord, keyWord }; // one query string per field
            String[] fields = { "path", "content" }; // fields of the multi-field query
            // Occur.MUST: the field must match; Occur.MUST_NOT: the field must not
            // match; Occur.SHOULD: the field should match (but need not)
            Occur[] occ = { Occur.SHOULD, Occur.SHOULD };
            Query query = null;
            try {
                query = MultiFieldQueryParser.parse(stringQuery, fields, occ, analyzer);
            } catch (ParseException e) {
                e.printStackTrace();
                return;
            }

            /*
             * // To search only the content field:
             * QueryParser parser = new QueryParser("content", analyzer);
             * Query query = parser.parse(text);
             */

            // Sort by relevance score (the original code passed the keyword as the
            // sort-field name, which is not a valid sortable field)
            Sort sort = new Sort(SortField.FIELD_SCORE);

            // Fetch all hits
            TopDocs docs = isearcher.search(query, Integer.MAX_VALUE, sort);
            ScoreDoc[] hits = docs.scoreDocs;
            // For paging, use
            // isearcher.searchAfter(results.scoreDocs[page.getAfterDocId()],
            //         query, page.getPagesize(), sort);

            for (int i = 0; i < hits.length; i++) {
                Document doc = isearcher.doc(hits[i].doc);
                System.out.println("File path: " + doc.get("path"));
                String content = doc.get("content");
                // Highlight the matched terms; the fragment length (290 here) can be
                // adjusted
                String s = null;
                try {
                    s = displayHtmlHighlight(query, analyzer, "content", content, 290);
                } catch (InvalidTokenOffsetsException e) {
                    e.printStackTrace();
                }
                System.out.println("Content: " + s);
            }
            ireader.close();
            directory.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Build the HTML snippet that highlights the query terms in the result.
     *
     * @param query        the query
     * @param analyzer     the analyzer
     * @param fieldName    the field name
     * @param fieldContent the field content
     * @param fragmentSize fragment length (excluding HTML tag length)
     * @return a fragment of HTML
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName,
            String fieldContent, int fragmentSize)
            throws IOException, InvalidTokenOffsetsException {
        // Create a highlighter that wraps matches in a red <font> tag
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<font color='red'>", "</font>"),
                new QueryScorer(query));
        Fragmenter fragmenter = new SimpleFragmenter(fragmentSize);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter.getBestFragment(analyzer, fieldName, fieldContent);
    }

    public static void main(String[] args) {
        // This demo only indexes .txt files
        createIndex();
        searchByKeyWord("管理");
    }
}
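The commented-out line in searchByKeyWord hints at paging. Below is a minimal sketch of how IndexSearcher.searchAfter can page through results; the class and helper method are hypothetical, not part of the demo above:

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class PagingSketch {
    // Returns the hits of the page that follows lastDocOfPreviousPage;
    // pass null to fetch the first page.
    public static ScoreDoc[] nextPage(IndexSearcher searcher, Query query,
            ScoreDoc lastDocOfPreviousPage, int pageSize) throws IOException {
        // searchAfter resumes the search after the given hit, so the caller
        // only needs to remember the last ScoreDoc of each page.
        TopDocs docs = searcher.searchAfter(lastDocOfPreviousPage, query, pageSize);
        return docs.scoreDocs;
    }
}

When combining paging with a custom Sort, use the searchAfter(after, query, n, sort) overload and pass the FieldDoc returned by the previous sorted search.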