solr 自聚类实现

　　参考官网：https://lucene.apache.org/solr/guide/6_6/result-clustering.html

　　最近用到了 Solr 的结果自动聚类功能，先简单介绍如下：

　　1、配置文件

　　　　主配置文件 solrconfig.xml 中必须包含如下内容：

<!-- Load the clustering contrib jars. The dots are escaped and \d matches the
     first version digit, e.g. solr-clustering-6.6.0.jar. -->
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />

<!-- Registers the "clustering" search component with three named engines:
     lingo, stc and kmeans. The enable attribute reads the
     solr.clustering.enabled property and falls back to true. -->
<searchComponent name="clustering" enable="${solr.clustering.enabled:true}" class="solr.clustering.ClusteringComponent">
    <!-- Engine "lingo": Carrot2 Lingo clustering algorithm. -->
    <lst name="engine">
      <str name="name">lingo</str>
      <!--<bool name="optional">true</bool>-->
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
    </lst>

    <!-- Engine "stc": example definition for the STC clustering algorithm.
         This is the only engine here that is flagged as optional. -->
    <lst name="engine">
      <str name="name">stc</str>
      <bool name="optional">true</bool>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
    </lst>

    <!-- Engine "kmeans": Carrot2 bisecting k-means clustering algorithm. -->
    <lst name="engine">
      <str name="name">kmeans</str>
      <!--<bool name="optional">true</bool>-->
      <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
    </lst>
  </searchComponent>

　　　　下面的配置文件根据自己的实际情况进行修改：

 <!-- Search handler exposed at /clustering. It runs a normal search and then
      the "clustering" component (see last-components below) on the results. -->
 <requestHandler name="/clustering"
                  startup="lazy"
                  class="solr.SearchHandler">
    <lst name="defaults">
      <bool name="clustering">true</bool>
      <bool name="clustering.results">true</bool>

      <!-- Schema field used as the logical "title" of each document (optional). -->
      <str name="carrot.title">keyword</str>
      <!-- Schema field used as the logical "url" (document identifier) of each document. -->
      <str name="carrot.url">id</str>
      <!-- Schema field used as the logical "content" of each document (optional). -->
      <str name="carrot.snippet">summary</str>
      <!-- Apply the highlighter to the title/content and cluster on its output. -->
      <bool name="carrot.produceSummary">true</bool>
      <!-- The maximum number of labels per cluster (currently disabled). -->
      <!--<int name="carrot.numDescriptions">5</int>-->
      <!-- Whether to produce sub-clusters; turned off here. -->
      <bool name="carrot.outputSubClusters">false</bool>

      <!-- Configure any other request handler parameters. We will cluster the
         top 100 search results so bump up the 'rows' parameter. -->
      <!--<str name="defType">edismax</str>
      <str name="qf">
        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
      </str>
      <str name="q.alt">*:*</str>-->
      <!-- Active query settings: edismax over keyword/title/id, 100 rows,
           returning all fields plus the score. -->
      <str name="defType">edismax</str>
      <!--<str name="qf">
        summary^0.5 category^1.2  id^10.0
      </str>-->
      <str name="qf">keyword^0.5 title^1.2  id^10.0</str>
      <str name="rows">100</str>
      <str name="fl">*,score</str>
    </lst>

    <!-- Append clustering at the end of the list of search components. -->
    <arr name="last-components">
      <str>clustering</str>
    </arr>
  </requestHandler>

　　　　managed-schema 配置文件需包含以下内容：

 <!-- Text field type analyzed with IKAnalyzer at both index and query time. -->
 <fieldType name="text_ik" class="solr.TextField">
    <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
    <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
  </fieldType>
  <!-- Required, single-valued string identifier. -->
  <field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
  <!-- The only field declared here with term vectors enabled. -->
  <field name="text" type="text_ik" multiValued="false" indexed="true" stored="true" termVectors ="true"/>
  <!-- Single-valued, indexed and stored text_ik fields. -->
  <field name="title" type="text_ik" multiValued="false" indexed="true" stored="true" />
  <field name="snippet" type="text_ik" multiValued="false" indexed="true" stored="true" />
  <field name="keyword" type="text_ik" multiValued="false" indexed="true" stored="true" />
  <field name="category" type="text_ik" multiValued="false" indexed="true" stored="true" />
  <field name="summary" type="text_ik" multiValued="false" indexed="true" stored="true"/>
  <!-- Single-valued string field, stored without analysis. -->
  <field name="path" type="string" multiValued="false" indexed="true" stored="true"/>

　　　　注意：text_ik对应的分词组件，要引用对应的jar包，具体参见：http://www.cnblogs.com/shaosks/p/8204615.html

　　2、测试索引的文件

　　　　启动solr服务，在浏览器输入：http://localhost:8983/solr/mycore/clustering?q=*:*&rows=10

　　　　结果如下：

　　3、java查询代码

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.Cluster;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.ClusteringResponse;
import org.apache.solr.common.SolrDocument;

import java.io.IOException;
import java.util.List;

/**
 * Minimal SolrJ example that queries the {@code /clustering} request handler
 * and prints every returned cluster: its labels followed by the ids of the
 * documents assigned to it.
 *
 * <p>Created 2018/1/18.
 *
 * @author sks
 */
public class AutoCluster {

    /** Shared SolrJ client; assigned by {@link #Init(String)}. */
    private static SolrClient solr;

    /**
     * Initializes the shared Solr client.
     *
     * @param urlString base URL of the Solr core to query
     */
    public static void Init(String urlString) {
        solr = new HttpSolrClient.Builder(urlString).build();
    }

    /**
     * Connects to the local core, prints the clustering result and releases
     * the client.
     *
     * @param args unused
     * @throws SolrServerException if Solr reports an error for the query
     * @throws IOException if communication with Solr fails
     */
    public static void main(String[] args) throws SolrServerException, IOException {
        String urlString = "http://localhost:8983/solr/mycore";

        Init(urlString);
        try {
            getAutoClusterInfo();
        } finally {
            // Release the HTTP resources held by the client so the JVM can
            // terminate on its own instead of relying on System.exit.
            solr.close();
        }
    }

    /**
     * Runs a match-all query against the {@code /clustering} handler and
     * prints each cluster's labels, followed by the indented ids of its
     * documents.
     *
     * @throws SolrServerException if Solr reports an error for the query
     * @throws IOException if communication with Solr fails
     */
    private static void getAutoClusterInfo() throws SolrServerException, IOException {
        SolrQuery params = new SolrQuery();
        // Route the request to the clustering handler instead of /select.
        params.set("qt", "/clustering");
        // Match every document.
        params.setQuery("*:*");
        params.setStart(0);
        params.setRows(30);

        QueryResponse queryResponse = solr.query(params);
        ClusteringResponse clr = queryResponse.getClusteringResponse();
        if (clr == null || clr.getClusters() == null) {
            // The response carries no cluster section, e.g. because the
            // clustering component is disabled or not attached to the handler.
            System.err.println("No clustering information in the Solr response.");
            return;
        }

        for (Cluster c : clr.getClusters()) {
            // Cluster labels.
            for (String lb : c.getLabels()) {
                System.out.println(lb);
            }
            // Ids of the documents assigned to this cluster.
            for (String doc : c.getDocs()) {
                System.out.println("        " + doc);
            }
        }
    }

}

　　　　查询结果如下：