solr6.6 solrJ索引富文本(word/pdf)文件

  1、文件配置

    在core下面新建lib文件夹,存放相关的jar包,如图所示:

    

    

    修改solrconfig.xml

   

<!-- Extraction contrib libraries and the solr-cell jar (used by /update/extract) -->
<lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-cell-\d.*\.jar" />

  <!-- Clustering contrib -->
  <lib dir="${solr.install.dir:../../../..}/contrib/clustering/lib/" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-clustering-\d.*\.jar" />

  <!-- Language identification contrib -->
  <lib dir="${solr.install.dir:../../../..}/contrib/langid/lib/" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-langid-\d.*\.jar" />

  <!-- Velocity contrib -->
  <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />

  <!-- Data import handler jars (this directive was previously listed twice; once is enough) -->
  <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />

  <!-- Jars placed in the core's own lib folder -->
  <lib dir="./lib" regex=".*\.jar"/>

 

 

 

    增加配置,如果有则不用添加:

    

 <!-- Request handler registered at /update/extract. The SolrJ client shown
      later in this article posts files to this path and sends its own
      fmap.content=attr_content parameter with each request.
      NOTE(review): the defaults below reference the targets "text",
      "ignored_" and the "attr_" prefix; confirm that matching fields or
      dynamic fields exist in managed-schema. -->
 <requestHandler name="/update/extract"
                  startup="lazy"
                  class="solr.extraction.ExtractingRequestHandler" >
    <lst name="defaults">
      <str name="fmap.content">text</str>
      <str name="fmap.meta">ignored_</str>
      <str name="lowernames">true</str>
      <str name="uprefix">attr_</str>
      <str name="captureAttr">true</str>
    </lst>
  </requestHandler>

 

 

   配置managed-schema文件:

   

  

  修改managed-schema文件,增加字段:

  <!-- Fields filled by the SolrJ client in this article:
       path, pathftype and pathuploaddate come from its literal.* request
       parameters; attr_content is the target of its fmap.content mapping.
       pathsummary is declared here but is not set by the Java code shown. -->
  <field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" />
  <field name="pathftype"      type="string"   indexed="true"  stored="true"  multiValued="false" />
  <field name="pathuploaddate"      type="string"   indexed="true"  stored="true"  multiValued="false" />
  <field name="pathsummary"      type="string"   indexed="true"  stored="true"  multiValued="false" />
  <field name="attr_content"      type="text_general"   indexed="true"  stored="true"  multiValued="false" />

 

 

  2、Java代码solrj操作(6.6.0版本) 

 

 

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;

/**
 * Indexes rich-text files (PDF, Word, Excel, PowerPoint, plain text) into Solr
 * by posting them to the {@code /update/extract} request handler with SolrJ.
 *
 * <p>Created 2017/12/13.
 *
 * @author sks
 */
public class solr_pdf {

    /** Base URL of the Solr core that receives the documents. */
    private static final String SOLR_CORE_URL = "http://localhost:8983/solr/test";

    /** Value returned by {@link #getFileContentType(String)} for unrecognised extensions. */
    private static final String UNKNOWN_CONTENT_TYPE = "othertype";

    /**
     * Demo entry point: indexes one hard-coded PDF and prints any failure to stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String fileName = "D:/work/Solr/ImportData/20160229001cn.pdf";
        String solrId = "20160229001cn.pdf";

        try {
            indexFilesSolrCell(solrId, solrId, fileName);
        } catch (IOException | SolrServerException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns today's date formatted as {@code yyyy-MM-dd}.
     *
     * @return the current date in the JVM's default time zone
     */
    private static String getCurrentDate() {
        // A new formatter per call: SimpleDateFormat is not thread-safe, so it is not shared.
        return new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    /**
     * Returns the extension of a file name without the leading dot.
     *
     * <p>Only the last path segment is inspected, so a dot inside a directory
     * name is not mistaken for an extension separator.
     *
     * @param fileName file name or path; must not be {@code null}
     * @return the text after the last dot of the last path segment, in its
     *         original case, or an empty string when there is no dot
     */
    private static String extensionOf(String fileName) {
        int lastSeparator = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
        int lastDot = fileName.lastIndexOf('.');
        return lastDot > lastSeparator ? fileName.substring(lastDot + 1) : "";
    }

    /**
     * Sends one file to the {@code /update/extract} handler and commits it.
     *
     * <p>The request carries the literal fields {@code id}, {@code path},
     * {@code pathuploaddate} and {@code pathftype}, and maps the extracted
     * content to the {@code attr_content} field.
     *
     * @param fileName file name including its extension; used to derive the
     *                 content type and the {@code pathftype} value
     * @param solrId   value for the document's {@code id} field; when
     *                 {@code null} or empty, {@code fileName} is used instead
     * @param path     location of the file on disk
     * @throws IOException         if the file cannot be read or the client cannot be closed
     * @throws SolrServerException if Solr reports an error while handling the request
     */
    public static void indexFilesSolrCell(String fileName, String solrId, String path)
            throws IOException, SolrServerException {
        // Previously solrId was ignored and fileName was always used as the id.
        String documentId = (solrId == null || solrId.isEmpty()) ? fileName : solrId;

        ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
        up.addFile(new File(path), getFileContentType(fileName));
        up.setParam("literal.id", documentId);
        up.setParam("literal.path", path);                         // file path
        up.setParam("literal.pathuploaddate", getCurrentDate());   // upload date
        up.setParam("literal.pathftype", extensionOf(fileName));   // file type, e.g. doc, pdf
        up.setParam("fmap.content", "attr_content");               // extracted file content
        up.setAction(ACTION.COMMIT, true, true);

        // The client holds HTTP resources; close it even when the request fails.
        try (SolrClient solr = new HttpSolrClient.Builder(SOLR_CORE_URL).build()) {
            solr.request(up);
        }
    }

    /**
     * Maps a file name to a MIME content type based on its extension.
     *
     * <p>The comparison ignores case, so {@code report.PDF} and
     * {@code report.pdf} yield the same result.
     *
     * @param filename file name or path; must not be {@code null}
     * @return the MIME type for xlsx, xls, pdf, doc, docx, ppt, pptx and txt
     *         files, or {@code "othertype"} for anything else
     */
    public static String getFileContentType(String filename) {
        switch (extensionOf(filename).toLowerCase(Locale.ROOT)) {
            case "xlsx":
                return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
            case "pdf":
                return "application/pdf";
            case "doc":
                return "application/msword";
            case "txt":
                return "text/plain";
            case "xls":
                return "application/vnd.ms-excel";
            case "docx":
                return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
            case "ppt":
                return "application/vnd.ms-powerpoint";
            case "pptx":
                return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
            default:
                return UNKNOWN_CONTENT_TYPE;
        }
    }
}

 

posted on 2017-12-13 16:04  shaomine  阅读(3611)  评论(1)  编辑  收藏  举报