Implementing a Chinese plain-text retrieval system with Lucene + IKAnalyzer

First, set up a Maven web project in IntelliJ IDEA: Spring + SpringMVC + Lucene + IKAnalyzer.

For setting up the Spring + SpringMVC project itself, see my earlier blog post.

Integrating Lucene 4.9.0

Add the Lucene dependencies to pom.xml.

Add <lucene.version>4.9.0</lucene.version> under the <properties> tag.
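For reference, a minimal sketch of the <properties> block (assuming your pom.xml does not already define one):

<properties>
    <lucene.version>4.9.0</lucene.version>
</properties>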

Add to <dependencies>:
<!-- lucene start -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>${lucene.version}</version>
</dependency>

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>${lucene.version}</version>
</dependency>
<!-- lucene end -->

Integrating IKAnalyzer 2012FF_hf1. The Chinese analyzer version must match the Lucene version: Lucene 4.x pairs with the IKAnalyzer 2012FF release.

For the IKAnalyzer Maven dependency configuration, see my earlier blog post; one common approach is sketched below.
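Since IKAnalyzer is not hosted in the central Maven repository (see note 2 at the end), a common approach is to install the downloaded jar into your local repository and then declare it as a regular dependency. The jar file name and the groupId/artifactId/version coordinates below are placeholders for illustration, not official coordinates:

<!-- First install the downloaded jar into the local repository (run in a shell):
     mvn install:install-file -Dfile=IKAnalyzer2012FF_hf1.jar -DgroupId=org.wltea
         -DartifactId=ikanalyzer -Dversion=2012FF_hf1 -Dpackaging=jar -->
<dependency>
    <groupId>org.wltea</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012FF_hf1</version>
</dependency>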

Copy the IKAnalyzer configuration files into the resources directory, as shown in the screenshot:

IKAnalyzer.cfg.xml configures extension dictionaries and stop-word dictionaries; the other files are custom stop-word dictionaries.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- Users can configure their own extension dictionaries here
    <entry key="ext_dict">ext.dic;</entry>
    -->
    <!-- Users can configure their own extension stop-word dictionaries here -->
    <entry key="ext_stopwords">classpath:stopword.dic;classpath:x-noise-charactor.dic;classpath:x-noise-word.dic;</entry>

</properties>
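For reference, the dictionary files themselves (stopword.dic and the two noise-word files) are plain UTF-8 text files with one entry per line; a minimal illustration (these entries are examples, not the files' actual contents):

的
了
是
a
an
the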

 

That completes the project setup. Next come the core Lucene operations: building the index and searching it.

IndexService.java

package com.ssm.demo.web.service;

import com.ssm.demo.core.dto.DocDto;
import org.springframework.ui.Model;

import java.util.List;

/**
 * Describe: service interface
 * Author: ouym
 * Created Date: 2016/11/30.
 */
public interface IndexService {

    /**
     * Build the index.
     * @param path directory containing the documents to index
     * @return true if the index was built successfully
     */
    public boolean createIndex(String path);


    /**
     * Search the index with a query string.
     * @param query the user's query
     */
    public List<DocDto> searchIndex(String query, Model model);
}

IndexServiceImpl.java

package com.ssm.demo.web.service.impl;

import com.ssm.demo.core.constants.MyConstant;
import com.ssm.demo.core.dto.DocDto;
import com.ssm.demo.core.util.MyFileUtil;
import com.ssm.demo.web.service.IndexService;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.stereotype.Service;
import org.springframework.ui.Model;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import static com.ssm.demo.core.util.MyFileUtil.*;

/**
 * Describe: builds and searches the Lucene index
 * Author: ouym
 * Created Date: 2016/11/30.
 */
@Service("indexService")
public class IndexServiceImpl implements IndexService {

    public boolean createIndex(String path) {
        Date date1 = new Date();
        List<File> fileList = getFileList(path);
        File indexFile = new File(MyConstant.INDEX_PATH);
        // avoid duplicate indexing: wipe any existing index directory
        if (indexFile.exists()) {
            MyFileUtil.deleteDir(indexFile);
        } else {
            indexFile.mkdirs();
        }
        String content = "";
        Analyzer analyzer = null;
        Directory directory = null;
        IndexWriter indexWriter = null;

        for (File file : fileList) {
            content = "";
            // get the file extension; only .doc and .txt files are indexed
            String type = file.getName().substring(file.getName().lastIndexOf(".") + 1);
            if ("txt".equalsIgnoreCase(type)) {
                content += txt2String(file);
            } else if ("doc".equalsIgnoreCase(type)) {
                content += doc2String(file);
            }

            try {
                // use the third-party Chinese analyzer IKAnalyzer (smart mode)
                analyzer = new IKAnalyzer(true);
                directory = FSDirectory.open(new File(MyConstant.INDEX_PATH));
                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
                // note: opening and closing an IndexWriter per file works (CREATE_OR_APPEND),
                // but it would be cheaper to open it once outside the loop
                indexWriter = new IndexWriter(directory, config);

                Document document = new Document();
                document.add(new TextField("filename", file.getName(), Field.Store.YES));
                document.add(new TextField("content", content, Field.Store.YES));
                document.add(new TextField("path", file.getPath(), Field.Store.YES));
                indexWriter.addDocument(document);
                indexWriter.commit();
                indexWriter.close();

            } catch (Exception e) {
                e.printStackTrace();
            }
            content = "";
        }
        Date date2 = new Date();
        System.out.println("Index creation took: " + (date2.getTime() - date1.getTime()) + "ms\n");
        return true;
    }

    public List<DocDto> searchIndex(String queryStr, Model model) {

        Date date1 = new Date();
        Analyzer analyzer = null;
        Directory directory = null;
        String prefixHTML = "<font color='red'>";
        String suffixHTML = "</font>";
        List<DocDto> docDtoList = new ArrayList<>();
        try {
            directory = FSDirectory.open(new File(MyConstant.INDEX_PATH));
            //analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            analyzer = new IKAnalyzer(true);
            DirectoryReader ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);
            Query query = parser.parse(queryStr);

            ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
            //ScoreDoc[] hits = isearcher.search(query, 10).scoreDocs;

            for (int i = 0; i < hits.length; i++) {
                DocDto docDto = new DocDto();
                Document hitDoc = isearcher.doc(hits[i].doc);
                // build an abstract with the query terms highlighted
                SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(prefixHTML, suffixHTML);
                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                String highLightText = highlighter.getBestFragment(analyzer, "content", hitDoc.get("content"));
                if (highLightText == null) {
                    // no highlightable fragment for this document
                    highLightText = "";
                }

                docDto.setDocName(hitDoc.get("filename"));
                String path = hitDoc.get("path");
                path = path.replaceAll("\\\\", "/");
                docDto.setDocPath(path);
                docDto.setDocAbstract(highLightText + "...");
                docDtoList.add(docDto);
            }
            ireader.close();
            directory.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        Date date2 = new Date();
        //System.out.println("Search took: " + (date2.getTime() - date1.getTime()) + "ms\n");
        model.addAttribute("spendTimes", (date2.getTime() - date1.getTime()));
        return docDtoList;
    }
}

Some constant classes and custom utility classes are omitted from the listing above; a rough sketch of what they might look like follows. After that, all that remains is to call the service from a controller.
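For completeness, here is a minimal sketch of the omitted MyConstant and MyFileUtil classes. The paths, the use of Apache POI's WordExtractor for .doc files, and the method bodies are assumptions for illustration, not the original code:

// MyConstant.java (paths are placeholders, adjust to your environment)
public final class MyConstant {
    public static final String INDEX_PATH = "D:/lucene/index"; // where the Lucene index lives (assumed)
    public static final String DATA_PATH  = "D:/lucene/data";  // directory holding the .doc/.txt corpus (assumed)
}

// MyFileUtil.java
import org.apache.poi.hwpf.extractor.WordExtractor; // assumes Apache POI (poi + poi-scratchpad) on the classpath

import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

public final class MyFileUtil {

    /** Recursively collect all files under the given directory. */
    public static List<File> getFileList(String path) {
        List<File> result = new ArrayList<>();
        File[] files = new File(path).listFiles();
        if (files == null) {
            return result;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                result.addAll(getFileList(f.getPath()));
            } else {
                result.add(f);
            }
        }
        return result;
    }

    /** Read a .txt file into a String (assumes UTF-8 encoding). */
    public static String txt2String(File file) {
        try {
            return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /** Extract the text of a .doc file, here via Apache POI's WordExtractor (an assumption). */
    public static String doc2String(File file) {
        try {
            FileInputStream in = new FileInputStream(file);
            WordExtractor extractor = new WordExtractor(in);
            String text = extractor.getText();
            in.close();
            return text;
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /** Recursively delete a directory and its contents. */
    public static void deleteDir(File dir) {
        File[] files = dir.listFiles();
        if (files != null) {
            for (File f : files) {
                if (f.isDirectory()) {
                    deleteDir(f);
                } else {
                    f.delete();
                }
            }
        }
        dir.delete();
    }
}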

@RequestMapping("/index")
public String index(@RequestParam("wd") String wd, Model model) {

    // build the index (rebuilt on every request here for simplicity; in practice build it once)
    indexService.createIndex(MyConstant.DATA_PATH);
    if (wd.trim().equals("")) {
        return "redirect:/index/index";
    }

    List<DocDto> docDtoList = indexService.searchIndex(wd, model);
    if (!StringUtils.isEmpty(wd)) {
        model.addAttribute("query", wd);
    }
    model.addAttribute("docDtoList", docDtoList);
    model.addAttribute("listSize", docDtoList.size());
    return "result";
}

My test corpus is 30 .doc documents. I built a simple UI loosely modeled on the Baidu home page; the screenshots are below:

Home page:

Search results:

The results come back sorted by relevance.

 

Notes from the implementation:

1. The Chinese analyzer version must match the Lucene version: Lucene 4.x pairs with the IKAnalyzer 2012FF release.

2. The Maven repository does not host an IKAnalyzer jar, so you have to add the local jar manually (see the dependency sketch earlier).

3. IKAnalyzer has its own smart segmentation optimization; pass true to the constructor to enable it: analyzer = new IKAnalyzer(true);

If you add your own dictionaries and stop-word dictionaries, changing true to false (fine-grained mode) may give better results, though I have not verified this: analyzer = new IKAnalyzer(false); A quick way to compare the two modes is sketched below.
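To see the difference between the two modes, a small standalone snippet like the following can print the tokens IKAnalyzer produces for a sample sentence. This is a rough sketch using the standard Lucene 4.x TokenStream API; the class name and sample text are arbitrary:

// Sketch: print the tokens produced by IKAnalyzer in smart (true) vs fine-grained (false) mode.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.StringReader;

public class IKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        String text = "lucene+IKAnalyzer实现中文纯文本检索系统";
        printTokens(new IKAnalyzer(true), text);   // smart mode
        printTokens(new IKAnalyzer(false), text);  // fine-grained mode
    }

    private static void printTokens(Analyzer analyzer, String text) throws Exception {
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.print(term.toString() + " | ");
        }
        ts.end();
        ts.close();
        System.out.println();
    }
}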
