Solr 6.6:使用 SolrJ 索引富文本(Word/PDF)文件
1、文件配置
在core下面新建lib文件夹,存放相关的jar包,如图所示:
修改solrconfig.xml
<!--
  Extra classpath entries for this core (solrconfig.xml).
  Each <lib> adds every jar in "dir" whose file name matches "regex":
  the extraction contrib libs and solr-cell, clustering, langid, velocity,
  the dataimporthandler jars, and finally the core-local ./lib folder.
  ${solr.install.dir:../../../..} falls back to ../../../.. when the
  solr.install.dir property is not set.
  NOTE(review): the solr-dataimporthandler entry is listed twice; the second
  occurrence looks redundant. Confirm before removing it.
-->
<lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-cell-\d.*\.jar" /> <lib dir="${solr.install.dir:../../../..}/contrib/clustering/lib/" regex=".*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-clustering-\d.*\.jar" /> <lib dir="${solr.install.dir:../../../..}/contrib/langid/lib/" regex=".*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-langid-\d.*\.jar" /> <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" /> <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" /> <lib dir="./lib" regex=".*\.jar"/>
增加 /update/extract 请求处理器配置(如果 solrconfig.xml 中已存在则不用添加):
<!--
  Registers solr.extraction.ExtractingRequestHandler at /update/extract,
  loaded lazily (startup="lazy").
  Defaults applied to every request unless the request overrides them:
    fmap.content = text      fmap.meta = ignored_
    lowernames   = true      uprefix   = attr_
    captureAttr  = true
  NOTE(review): per the Solr Cell documentation, fmap.X renames extracted
  field X and uprefix is prepended to fields not defined in the schema.
  Confirm against the Solr version in use.
-->
<requestHandler name="/update/extract" startup="lazy" class="solr.extraction.ExtractingRequestHandler" > <lst name="defaults"> <str name="fmap.content">text</str> <str name="fmap.meta">ignored_</str> <str name="lowernames">true</str> <str name="uprefix">attr_</str> <str name="captureAttr">true</str> </lst> </requestHandler>
配置managed-schema文件:
修改managed-schema文件,增加字段:
<!--
  Fields added to managed-schema for the file-indexing example.
  path, pathftype, pathuploaddate and pathsummary are single-valued,
  indexed and stored "string" fields; attr_content is a single-valued,
  indexed and stored "text_general" field.
  The Java example in this article fills path, pathftype and pathuploaddate
  through literal.* parameters and maps the extracted content to attr_content;
  pathsummary is declared here but not set by that example.
-->
<field name="path" type="string" indexed="true" stored="true" multiValued="false" /> <field name="pathftype" type="string" indexed="true" stored="true" multiValued="false" /> <field name="pathuploaddate" type="string" indexed="true" stored="true" multiValued="false" /> <field name="pathsummary" type="string" indexed="true" stored="true" multiValued="false" /> <field name="attr_content" type="text_general" indexed="true" stored="true" multiValued="false" />
2、Java代码solrj操作(6.6.0版本)
import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; /** * @Author:sks * @Description:索引pdf等富文本文件 * @Date:Created in 15:16 2017/12/13 * @Modified by: **/ public class solr_pdf { public static void main(String[] args) { String fileName = "D:/work/Solr/ImportData/20160229001cn.pdf"; String solrId = "20160229001cn.pdf"; try { indexFilesSolrCell(solrId, solrId,fileName); } catch (IOException e) { e.printStackTrace(); } catch (SolrServerException e) { e.printStackTrace(); } } /** * @Author:sks * @Description:获取系统当天日期yyyy-mm-dd * @Date: */ private static String GetCurrentDate(){ Date dt = new Date(); //最后的aa表示“上午”或“下午” HH表示24小时制 如果换成hh表示12小时制 // SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss aa"); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); String day =sdf.format(dt); return day; } public static void indexFilesSolrCell(String fileName, String solrId, String path) throws IOException, SolrServerException { String urlString = "http://localhost:8983/solr/test"; SolrClient solr = new HttpSolrClient.Builder(urlString).build(); ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract"); String contentType = getFileContentType(fileName); up.addFile(new File(path), contentType); String fileType = fileName.substring(fileName.lastIndexOf(".")+1); up.setParam("literal.id", fileName); up.setParam("literal.path", path);//文件路径 up.setParam("literal.pathuploaddate", GetCurrentDate());//文件上传时间 up.setParam("literal.pathftype", fileType);//文件类型,doc,pdf 
up.setParam("fmap.content", "attr_content");//文件内容 up.setAction(ACTION.COMMIT, true, true); solr.request(up); } /** * @Author:sks * @Description:根据文件名获取文件的ContentType类型 * @Date: */ public static String getFileContentType(String filename) { String contentType = ""; String prefix = filename.substring(filename.lastIndexOf(".") + 1); if (prefix.equals("xlsx")) { contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; } else if (prefix.equals("pdf")) { contentType = "application/pdf"; } else if (prefix.equals("doc")) { contentType = "application/msword"; } else if (prefix.equals("txt")) { contentType = "text/plain"; } else if (prefix.equals("xls")) { contentType = "application/vnd.ms-excel"; } else if (prefix.equals("docx")) { contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; } else if (prefix.equals("ppt")) { contentType = "application/vnd.ms-powerpoint"; } else if (prefix.equals("pptx")) { contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; } else { contentType = "othertype"; } return contentType; } }