1. 创建本地目录
$ mkdir /usr/local/contentplatform/solr/solr/core1/file1
$ ls -lh
total 88M
-rw-r--r-- 1 tnuser appuser  14M May 14 20:11 apache_hbase_reference_guide.pdf
-rw-r--r-- 1 tnuser appuser 7.4M Apr 28 23:00 Architecting_HBase_Applications.pdf
-rw-r--r-- 1 tnuser appuser  14M Jan 15  2014 Cloudera_Hadoop_Test_Cases.docx
-rw-r--r-- 1 tnuser appuser 6.6M Apr 21 21:01 HBase_Administration_Cookbook.pdf
-rw-r--r-- 1 tnuser appuser 2.1M Apr 28 22:58 HBase_Essentials.pdf
-rw-r--r-- 1 tnuser appuser  25M Apr  9 16:16 Hbase-HBase实战.pdf
-rw-r--r-- 1 tnuser appuser 7.9M Nov 13  2015 HBase.in.Action.pdf
-rw-r--r-- 1 tnuser appuser  13M Apr 28 22:44 HBase:The_Definitive_Guide.pdf
2. 在core的conf目录修改配置文件solrconfig.xml配置dataimport请求处理器
<!-- Registers the Data Import Handler at the /dataimport endpoint of this core. -->
<requestHandler name="/dataimport"
                class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <!-- Name of the DIH definition file handed to the handler by default. -->
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>
3. 在conf目录新建data-config.xml文件并添加数据源的引用
<dataConfig>
  <!-- The type attribute is a DIH class name and is case-sensitive:
       FileDataSource / BinFileDataSource. -->
  <dataSource name="fileDataSource" type="FileDataSource" />
  <dataSource name="binFileDataSource" type="BinFileDataSource" />
  <document>
    <!-- Outer entity: lists the files under baseDir (including subdirectories,
         recursive="true"). rootEntity="false" means the Solr documents are
         produced by the nested entity, one per listed file.
         The attribute that binds an entity to a data source is spelled
         dataSource (camelCase); XML attribute names are case-sensitive. -->
    <entity name="file1"
            dataSource="fileDataSource"
            processor="FileListEntityProcessor"
            baseDir="/usr/local/contentplatform/solr/solr/core1/file1"
            fileName=".*\.(pdf|doc|docx|ppt|pptx|xls|xlsx|odf|txt|rtf|html|htm|jpg|csv)$"
            onError="skip"
            recursive="true"
            rootEntity="false">
      <!-- fileName: all extensions sit in ONE group after the literal dot and
           the pattern is anchored with $, so only files that end in one of
           these extensions are picked up. -->
      <field column="file" name="id" />
      <field column="fileSize" name="size" />
      <field column="fileAbsolutePath" name="filepath" />
      <field column="fileLastModified" name="lastModified" />
      <!-- Inner entity: reads each listed file as a binary stream and extracts
           its text and metadata with Tika. Fields with meta="true" are taken
           from the document metadata. -->
      <entity name="documentImport1"
              processor="TikaEntityProcessor"
              url="${file1.fileAbsolutePath}"
              format="text"
              dataSource="binFileDataSource"
              onError="skip"
              recursive="true">
        <field column="Author" name="author" meta="true"/>
        <field column="title" name="title" meta="true"/>
        <field column="text" name="text"/>
      </entity>
    </entity>
  </document>
</dataConfig>
4. 修改conf目录下的schema.xml文件，添加以下内容
<!-- Schema field names must equal the name="..." targets of the field mappings
     in data-config.xml (lastModified, filepath), not the DIH column names
     (fileLastModified, fileAbsolutePath). The fileSize column is mapped to
     "size" there; a field with that name is needed as well if the file size
     should be indexed. -->
<field name="lastModified" type="date" indexed="true" stored="true"/>
<field name="filepath" type="string" indexed="true" stored="true"/>
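5. 重新加载配置文件（在Solr管理界面的Core Admin页面对该core点击Reload，或调用CoreAdmin API，例如 /solr/admin/cores?action=RELOAD&core=core1）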
6. 通过DIH导入本地的文件（在Solr管理界面该core的Dataimport页面执行full-import，或请求该core的 /dataimport?command=full-import）
7. 查看导入的文档
{ "responseHeader": { "status": 0, "QTime": 1, "params": { "indent": "true", "q": "*:*", "_": "1564127787808", "wt": "json" } }, "response": { "numFound": 8, "start": 0, "docs": [ { "id": "Hbase-HBase实战.pdf", "title": [ "HBASE 实战=HBASE IN ACTION" ], "author": "(美)NICK DIMIDUK著;谢磊译", "author_s": "(美)NICK DIMIDUK著;谢磊译", "_version_": 1640106408929132500 }, { "id": "apache_hbase_reference_guide.pdf", "title": [ "Apache HBase ™ Reference Guide" ], "author": "Apache HBase Team", "author_s": "Apache HBase Team", "_version_": 1640106415302377500 }, { "id": "Architecting_HBase_Applications.pdf", "title": [ "Architecting HBase Applications" ], "author": "Jean-Marc Spaggiari & Kevin O'Dell", "author_s": "Jean-Marc Spaggiari & Kevin O'Dell", "_version_": 1640106423153066000 }, { "id": "HBase_Administration_Cookbook.pdf", "_version_": 1640106425323618300 }, { "id": "HBase_Essentials.pdf", "title": [ "" ], "author": "", "author_s": "", "_version_": 1640106427129266200 }, { "id": "HBase.in.Action.pdf", "title": [ "HBase in Action" ], "author": "Nick Dimiduk, Amandeep Khurana", "author_s": "Nick Dimiduk, Amandeep Khurana", "_version_": 1640106439293796400 }, { "id": "HBase:The_Definitive_Guide.pdf", "title": [ "HBase: The Definitive Guide" ], "author": "Lars George", "author_s": "Lars George", "_version_": 1640106444193792000 }, { "id": "Cloudera_Hadoop_Test_Cases.docx", "author": "FeiLong, Li [DBA]", "author_s": "FeiLong, Li [DBA]", "_version_": 1640106445801259000 } ] } }
===================来自一泽涟漪的博客,转载请标明出处 www.cnblogs.com/ilifeilong===================