solr 自聚类实现
参考官网:https://lucene.apache.org/solr/guide/6_6/result-clustering.html
最近用到了 Solr 的自聚类(结果聚类)功能,先简单介绍如下:
1、配置文件
主配置文件 solrconfig.xml 中必须包含如下内容:
<!-- Load the clustering contrib's library jars and the solr-clustering jar.
     Both paths are resolved from solr.install.dir, defaulting to ../../.. when the property is not set. -->
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
<!-- Search component named "clustering"; enabled unless solr.clustering.enabled is set otherwise.
     Three engines are declared (lingo, stc, kmeans), all reading resources from clustering/carrot2. -->
<searchComponent name="clustering" enable="${solr.clustering.enabled:true}" class="solr.clustering.ClusteringComponent">
  <!-- Lingo clustering algorithm -->
  <lst name="engine">
    <str name="name">lingo</str>
    <!--<bool name="optional">true</bool>-->
    <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
    <str name="carrot.resourcesDir">clustering/carrot2</str>
  </lst>
  <!-- An example definition for the STC clustering algorithm. -->
  <lst name="engine">
    <str name="name">stc</str>
    <bool name="optional">true</bool>
    <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    <str name="carrot.resourcesDir">clustering/carrot2</str>
  </lst>
  <!-- Bisecting k-means engine; like lingo, its "optional" flag is left commented out. -->
  <lst name="engine">
    <str name="name">kmeans</str>
    <!--<bool name="optional">true</bool>-->
    <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
    <str name="carrot.resourcesDir">clustering/carrot2</str>
  </lst>
</searchComponent>
下面的配置文件根据自己的实际情况进行修改:
<!-- Request handler exposed at /clustering: a SearchHandler with clustering switched on by default
     and the "clustering" search component appended as a last component. -->
<requestHandler name="/clustering" startup="lazy" class="solr.SearchHandler">
  <lst name="defaults">
    <bool name="clustering">true</bool>
    <bool name="clustering.results">true</bool>
    <!-- Field name with the logical "title" of each document (optional) -->
    <str name="carrot.title">keyword</str>
    <!-- Logical field to physical field mapping. -->
    <str name="carrot.url">id</str>
    <!-- Field name with the logical "content" of each document (optional) -->
    <str name="carrot.snippet">summary</str>
    <!-- Apply highlighter to the title/ content and use this for clustering. -->
    <bool name="carrot.produceSummary">true</bool>
    <!-- the maximum number of labels per cluster -->
    <!--<int name="carrot.numDescriptions">5</int>-->
    <!-- produce sub clusters -->
    <bool name="carrot.outputSubClusters">false</bool>
    <!-- Configure any other request handler parameters. We will cluster the top 100 search results so bump up the 'rows' parameter. -->
    <!--<str name="defType">edismax</str> <str name="qf"> text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 </str> <str name="q.alt">*:*</str>-->
    <str name="defType">edismax</str>
    <!--<str name="qf"> summary^0.5 category^1.2 id^10.0 </str>-->
    <str name="qf">keyword^0.5 title^1.2 id^10.0</str>
    <str name="rows">100</str>
    <str name="fl">*,score</str>
  </lst>
  <!-- Append clustering at the end of the list of search components. -->
  <arr name="last-components">
    <str>clustering</str>
  </arr>
</requestHandler>
managed-schema 配置文件包含以下内容:
<!-- Text field type that uses IKAnalyzer for both index-time and query-time analysis. -->
<fieldType name="text_ik" class="solr.TextField">
  <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
  <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
</fieldType>
<!-- Document fields: id and path are plain strings; every other field is analyzed with text_ik.
     All fields are single-valued, indexed and stored; only "text" also enables term vectors. -->
<field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="text" type="text_ik" multiValued="false" indexed="true" stored="true" termVectors ="true"/>
<field name="title" type="text_ik" multiValued="false" indexed="true" stored="true" />
<field name="snippet" type="text_ik" multiValued="false" indexed="true" stored="true" />
<field name="keyword" type="text_ik" multiValued="false" indexed="true" stored="true" />
<field name="category" type="text_ik" multiValued="false" indexed="true" stored="true" />
<field name="summary" type="text_ik" multiValued="false" indexed="true" stored="true"/>
<field name="path" type="string" multiValued="false" indexed="true" stored="true"/>
注意:text_ik对应的分词组件,要引用对应的jar包,具体参见:http://www.cnblogs.com/shaosks/p/8204615.html
2、测试索引的文件
启动solr服务,在浏览器输入:http://localhost:8983/solr/mycore/clustering?q=*:*&rows=10
结果如下:
3、java查询代码
import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.Cluster; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.ClusteringResponse; import org.apache.solr.common.SolrDocument; import java.io.IOException; import java.util.List; /** * @Author:sks * @Description: * @Date:Created in 9:41 2018/1/18 * @Modified by: **/ public class AutoCluster { private static SolrClient solr; /** * @Author:sks * @Description:初始化solr客户端 * @Date: */ public static void Init(String urlString){ solr = new HttpSolrClient.Builder(urlString).build(); } public static void main(String[] args) throws SolrServerException,IOException { String urlString = "http://localhost:8983/solr/mycore"; String path = "D:/work/Solr/ImportData"; Init(urlString); getAutoClusterInfo(); System.exit(0); } /** * @Author:sks * @Description:获取聚类数据 * @Date: */ private static void getAutoClusterInfo() throws SolrServerException,IOException { //使用这个对象做查询 SolrQuery params = new SolrQuery(); //查询所有数据 params.set("qt", "/clustering"); params.setQuery("*:*"); params.setStart(0); params.setRows(30); QueryResponse queryResponse = solr.query(params); ClusteringResponse clr = queryResponse.getClusteringResponse(); List<Cluster> list = clr.getClusters(); //拿到聚类数据集合,返回查询结果 String txt = ""; for(Cluster c :list){ //类别标签 List<String> lblist = c.getLabels(); for(String lb:lblist){ System.out.println(lb); } //聚类文档ID List<String> doclist = c.getDocs(); for(String doc:doclist){ System.out.println(" " + doc); } } } }
查询结果如下: