lucene+IKAnalyzer实现中文纯文本检索系统
首先在IntelliJ IDEA中搭建Maven项目(web):Spring+SpringMVC+Lucene+IKAnalyzer
spring+SpringMVC搭建项目可以参考我的博客
整合Lucene 4.9.0
pom.xml添加lucene依赖
properties标签添加<lucene.version>4.9.0</lucene.version>
dependencies添加:
<!-- lucene start -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>${lucene.version}</version>
</dependency>
<!-- lucene end -->
整合IKAnalyzer 2012FF_hf1,中文分词器的版本要和Lucene的版本对应,Lucene 4.X对应IKAnalyzer 2012FF版本
maven依赖配置参考我的博客
将IKAnalyzer的配置文件拷贝到resources目录下,如图:
IKAnalyzer.cfg.xml可以配置扩展词典以及停用词典,其它文件为自定义的停用词典
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!--用户可以在这里配置自己的扩展字典
    <entry key="ext_dict">ext.dic;</entry>
    -->
    <!--用户可以在这里配置自己的扩展停止词字典-->
    <entry key="ext_stopwords">classpath:stopword.dic;classpath:x-noise-charactor.dic;classpath:x-noise-word.dic;</entry>

</properties>
好了,到此项目就配置好了,接下来进行Lucene的核心操作——建索引和检索
IndexService.java
package com.ssm.demo.web.service;

import com.ssm.demo.core.dto.DocDto;
import org.springframework.ui.Model;

import java.util.List;

/**
 * Service contract for building the full-text index and querying it.
 * Author: ouym
 * Created Date: 2016/11/30.
 */
public interface IndexService {

    /**
     * Builds the index from the documents found under the given path.
     *
     * @param path path of the documents to index
     * @return result flag of the build; its exact meaning is defined by the implementation
     */
    boolean createIndex(String path);

    /**
     * Searches the index with the given query.
     *
     * @param query the query string
     * @param model Spring MVC model handed through to the implementation
     * @return the documents matching the query
     */
    List<DocDto> searchIndex(String query, Model model);
}
IndexServiceImpl.java
package com.ssm.demo.web.service.impl;

import com.ssm.demo.core.constants.MyConstant;
import com.ssm.demo.core.dto.DocDto;
import com.ssm.demo.core.util.MyFileUtil;
import com.ssm.demo.web.service.IndexService;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.stereotype.Service;
import org.springframework.ui.Model;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import static com.ssm.demo.core.util.MyFileUtil.*;

/**
 * Lucene-backed implementation of {@link IndexService}: builds an index over
 * .txt/.doc files with the IKAnalyzer Chinese tokenizer and runs highlighted
 * searches against the "content" field.
 * Author: ouym
 * Created Date: 2016/11/30.
 */
@Service("indexService")
public class IndexServiceImpl implements IndexService {

    /** Pinned explicitly; Version.LUCENE_CURRENT is deprecated and may change behavior on upgrade. */
    private static final Version LUCENE_VERSION = Version.LUCENE_4_9;

    private static final String FIELD_FILENAME = "filename";
    private static final String FIELD_CONTENT = "content";
    private static final String FIELD_PATH = "path";

    /** Maximum number of hits returned by a search. */
    private static final int MAX_HITS = 10;

    /** Length of the plain-text abstract used when the highlighter yields no fragment. */
    private static final int FALLBACK_ABSTRACT_LENGTH = 100;

    private static final String HIGHLIGHT_PREFIX = "<font color='red'>";
    private static final String HIGHLIGHT_SUFFIX = "</font>";
    private static final String ABSTRACT_SUFFIX = "...";

    /**
     * Rebuilds the index at {@code MyConstant.INDEX_PATH} from the files under {@code path}.
     * Only .txt and .doc files are indexed; other files are skipped.
     *
     * @param path directory containing the documents to index
     * @return {@code true} if the index was written and every supported file was added,
     *         {@code false} if opening the index or adding any document failed
     */
    @Override
    public boolean createIndex(String path) {
        long startMillis = System.currentTimeMillis();

        List<File> fileList = getFileList(path);
        if (fileList == null) {
            fileList = new ArrayList<>();
        }

        // Avoid duplicate entries: start from a clean index directory.
        // NOTE(review): assumes MyFileUtil.deleteDir removes the existing index files - confirm.
        File indexDir = new File(MyConstant.INDEX_PATH);
        if (indexDir.exists()) {
            MyFileUtil.deleteDir(indexDir);
        } else {
            indexDir.mkdirs();
        }

        boolean allIndexed = true;
        // One writer for the whole batch; try-with-resources releases the write lock
        // even when indexing fails part-way through.
        try (Analyzer analyzer = new IKAnalyzer(true);
             Directory directory = FSDirectory.open(indexDir);
             IndexWriter indexWriter = new IndexWriter(directory, newWriterConfig(analyzer))) {

            for (File file : fileList) {
                String content = readSupportedContent(file);
                if (content == null) {
                    continue; // not a .txt/.doc file
                }

                Document document = new Document();
                document.add(new TextField(FIELD_FILENAME, file.getName(), Field.Store.YES));
                document.add(new TextField(FIELD_CONTENT, content, Field.Store.YES));
                document.add(new TextField(FIELD_PATH, file.getPath(), Field.Store.YES));
                try {
                    indexWriter.addDocument(document);
                } catch (IOException e) {
                    // Best effort: keep indexing the remaining files, but report the failure.
                    e.printStackTrace();
                    allIndexed = false;
                }
            }
            indexWriter.commit();
        } catch (IOException e) {
            e.printStackTrace();
            allIndexed = false;
        }

        System.out.println("创建索引-----耗时:" + (System.currentTimeMillis() - startMillis) + "ms\n");
        return allIndexed;
    }

    /**
     * Searches the "content" field and returns up to {@value #MAX_HITS} hits, each with a
     * highlighted abstract. The elapsed time in milliseconds is stored in the model
     * under "spendTimes".
     *
     * @param queryStr query in Lucene classic query-parser syntax
     * @param model    model that receives the "spendTimes" attribute
     * @return matching documents; empty if the query is blank, cannot be parsed,
     *         or the index cannot be read
     */
    @Override
    public List<DocDto> searchIndex(String queryStr, Model model) {
        long startMillis = System.currentTimeMillis();
        List<DocDto> docDtoList = new ArrayList<>();

        if (queryStr != null && !queryStr.trim().isEmpty()) {
            try (Analyzer analyzer = new IKAnalyzer(true);
                 Directory directory = FSDirectory.open(new File(MyConstant.INDEX_PATH));
                 DirectoryReader reader = DirectoryReader.open(directory)) {

                IndexSearcher searcher = new IndexSearcher(reader);
                Query query = new QueryParser(LUCENE_VERSION, FIELD_CONTENT, analyzer).parse(queryStr);

                // The highlighter depends only on the query, so build it once for all hits.
                Highlighter highlighter = new Highlighter(
                        new SimpleHTMLFormatter(HIGHLIGHT_PREFIX, HIGHLIGHT_SUFFIX),
                        new QueryScorer(query));

                for (ScoreDoc hit : searcher.search(query, MAX_HITS).scoreDocs) {
                    Document hitDoc = searcher.doc(hit.doc);
                    String content = hitDoc.get(FIELD_CONTENT);
                    if (content == null) {
                        content = "";
                    }
                    String fragment = highlighter.getBestFragment(analyzer, FIELD_CONTENT, content);

                    DocDto docDto = new DocDto();
                    docDto.setDocName(hitDoc.get(FIELD_FILENAME));
                    String path = hitDoc.get(FIELD_PATH);
                    // Normalize Windows separators so the path can be used in a URL.
                    docDto.setDocPath(path == null ? null : path.replace('\\', '/'));
                    docDto.setDocAbstract(buildAbstract(fragment, content));
                    docDtoList.add(docDto);
                }
            } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
                e.printStackTrace();
            }
        }

        if (model != null) {
            model.addAttribute("spendTimes", System.currentTimeMillis() - startMillis);
        }
        return docDtoList;
    }

    /** Creates a writer configuration that always replaces any existing index. */
    private static IndexWriterConfig newWriterConfig(Analyzer analyzer) {
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        return config;
    }

    /** Returns the text of a .txt/.doc file, or {@code null} if the file type is not supported. */
    private static String readSupportedContent(File file) {
        String name = file.getName();
        String type = name.substring(name.lastIndexOf('.') + 1);
        String content;
        if ("txt".equalsIgnoreCase(type)) {
            content = txt2String(file);
        } else if ("doc".equalsIgnoreCase(type)) {
            content = doc2String(file);
        } else {
            return null;
        }
        // NOTE(review): guards against the helper returning null - confirm its contract.
        return content == null ? "" : content;
    }

    /**
     * Builds the abstract shown for a hit: the highlighted fragment when available,
     * otherwise the leading part of the content (the highlighter returns null when
     * no query term is found in the text).
     */
    private static String buildAbstract(String fragment, String content) {
        if (fragment != null) {
            return fragment + ABSTRACT_SUFFIX;
        }
        if (content.length() <= FALLBACK_ABSTRACT_LENGTH) {
            return content;
        }
        return content.substring(0, FALLBACK_ABSTRACT_LENGTH) + ABSTRACT_SUFFIX;
    }
}
省略了一些常量类和自定义的工具类,接下来只需要在controller里面调用service就行了
1 @RequestMapping("/index") 2 public String index(@RequestParam("wd")String wd, Model model){ 3 4 //建立索引 5 indexService.createIndex(MyConstant.DATA_PATH); 6 if (wd.trim().equals("")){ 7 return "redirect:/index/index"; 8 } 9 10 List<DocDto> docDtoList = indexService.searchIndex(wd,model); 11 if (!StringUtils.isEmpty(wd)) { 12 model.addAttribute("query",wd); 13 } 14 model.addAttribute("docDtoList",docDtoList); 15 model.addAttribute("listSize",docDtoList.size()); 16 return "result"; 17 }
我的测试文档集是30篇doc文档,然后自己简单仿百度首页做了一个界面,效果图如下:
首页:
检索结果:
结果按相关度排好序了~
实现过程中注意事项:
1.中文分词器的版本要和Lucene的版本对应,Lucene 4.X对应IKAnalyzer 2012FF版本
2.Maven仓库中没有IKAnalyzer 的jar包依赖,需要自己手动添加本地jar包
3. IKAnalyzer 分词器有自己的智能切词优化,声明时参数为true即可开启:analyzer = new IKAnalyzer(true);
若要添加自己的词典和停用词典,将true改为false效果可能更好(有待确认)。analyzer = new IKAnalyzer(false);