Solr 4.4.0利用dataimporthandler导入本地pdf、word等文档

 1. 创建本地目录

# Create the directory that DIH will scan; -p creates missing parents and is a no-op if it already exists.
$ mkdir -p /usr/local/contentplatform/solr/solr/core1/file1
# Copy the documents to be indexed into that directory, then list it explicitly
# so the output does not depend on the current working directory.
$ ls -lh /usr/local/contentplatform/solr/solr/core1/file1
total 88M
-rw-r--r-- 1 tnuser appuser  14M May 14 20:11 apache_hbase_reference_guide.pdf
-rw-r--r-- 1 tnuser appuser 7.4M Apr 28 23:00 Architecting_HBase_Applications.pdf
-rw-r--r-- 1 tnuser appuser  14M Jan 15  2014 Cloudera_Hadoop_Test_Cases.docx
-rw-r--r-- 1 tnuser appuser 6.6M Apr 21 21:01 HBase_Administration_Cookbook.pdf
-rw-r--r-- 1 tnuser appuser 2.1M Apr 28 22:58 HBase_Essentials.pdf
-rw-r--r-- 1 tnuser appuser  25M Apr  9 16:16 Hbase-HBase实战.pdf
-rw-r--r-- 1 tnuser appuser 7.9M Nov 13  2015 HBase.in.Action.pdf
-rw-r--r-- 1 tnuser appuser  13M Apr 28 22:44 HBase:The_Definitive_Guide.pdf

2. 在core的conf目录修改配置文件solrconfig.xml配置dataimport请求处理器

<!-- Exposes the DataImportHandler at /dataimport; the import definition is read from data-config.xml. -->
<!-- NOTE(review): the DIH and Tika extraction jars must be on the core's classpath (e.g. via <lib>) - confirm for this install. -->
<requestHandler
    name="/dataimport"
    class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>

 3. 在conf目录新建data-config.xml文件并添加数据源的引用

<dataConfig>
    <!-- "type" is a DIH class name and is case-sensitive: FileDataSource, not fileDataSource. -->
    <dataSource name="fileDataSource" type="FileDataSource" />
    <!-- Binary stream source; the Tika entity below uses it to open each file. -->
    <dataSource name="binFileDataSource" type="BinFileDataSource" />
    <document>
        <!--
          Outer entity: lists the files under baseDir, descending into subdirectories.
          - dataSource must be spelled in camelCase; a lowercase "datasource" attribute is not recognized.
          - fileName is a regular expression. The extensions are grouped in one alternation behind an
            escaped dot and anchored at the end, so only names ending in a listed extension match.
          - rootEntity="false": Solr documents are produced by the nested entity, one per file.
        -->
        <entity
            name="file1"
            dataSource="fileDataSource"
            processor="FileListEntityProcessor"
            baseDir="/usr/local/contentplatform/solr/solr/core1/file1"
            fileName=".*\.(pdf|doc|docx|ppt|pptx|xls|xlsx|odf|txt|rtf|html|htm|jpg|csv)$"
            onError="skip"
            recursive="true"
            rootEntity="false">
            <!-- column = value emitted by FileListEntityProcessor, name = target field in schema.xml. -->
            <field column="file" name="id" />
            <field column="fileSize" name="size" />
            <field column="fileAbsolutePath" name="filepath" />
            <field column="fileLastModified" name="lastModified" />

            <!--
              Inner entity: extracts metadata and plain text from the file found by the outer entity.
              "recursive" is a FileListEntityProcessor option and is therefore not set here.
            -->
            <entity
                name="documentImport1"
                processor="TikaEntityProcessor"
                url="${file1.fileAbsolutePath}"
                format="text"
                dataSource="binFileDataSource"
                onError="skip">
                <!-- meta="true" reads the value from the document metadata rather than from the body. -->
                <field column="Author" name="author" meta="true" />
                <field column="title" name="title" meta="true" />
                <field column="text" name="text" />
            </entity>
        </entity>
    </document>
</dataConfig>

 4. 修改conf目录下的schema.xml文件,添加以下内容

   <!-- Target fields of the explicit mappings in data-config.xml (size, filepath, lastModified). -->
   <!-- A mapped field that is missing from the schema is not indexed. -->
   <field name="size" type="long" indexed="true" stored="true"/>
   <field name="filepath" type="string" indexed="true" stored="true"/>
   <field name="lastModified" type="date" indexed="true" stored="true"/>
   <!-- Kept for backward compatibility: these match the raw FileListEntityProcessor column names and -->
   <!-- are only filled when the corresponding explicit <field> mappings are removed from data-config.xml. -->
   <field name="fileLastModified" type="date" indexed="true" stored="true"/>
   <field name="fileAbsolutePath" type="string" indexed="true" stored="true"/>

 5. 重新加载配置文件(例如通过Core Admin的RELOAD操作:/solr/admin/cores?action=RELOAD&core=core1)

 6. 通过DIH导入本地的文件(例如请求:/solr/core1/dataimport?command=full-import)

 7. 查看导入的文档

{
  "responseHeader": {
    "status": 0,
    "QTime": 1,
    "params": {
      "indent": "true",
      "q": "*:*",
      "_": "1564127787808",
      "wt": "json"
    }
  },
  "response": {
    "numFound": 8,
    "start": 0,
    "docs": [
      {
        "id": "Hbase-HBase实战.pdf",
        "title": [
          "HBASE 实战=HBASE IN ACTION"
        ],
        "author": "(美)NICK DIMIDUK著;谢磊译",
        "author_s": "(美)NICK DIMIDUK著;谢磊译",
        "_version_": 1640106408929132500
      },
      {
        "id": "apache_hbase_reference_guide.pdf",
        "title": [
          "Apache HBase ™ Reference Guide"
        ],
        "author": "Apache HBase Team",
        "author_s": "Apache HBase Team",
        "_version_": 1640106415302377500
      },
      {
        "id": "Architecting_HBase_Applications.pdf",
        "title": [
          "Architecting HBase Applications"
        ],
        "author": "Jean-Marc Spaggiari & Kevin O'Dell",
        "author_s": "Jean-Marc Spaggiari & Kevin O'Dell",
        "_version_": 1640106423153066000
      },
      {
        "id": "HBase_Administration_Cookbook.pdf",
        "_version_": 1640106425323618300
      },
      {
        "id": "HBase_Essentials.pdf",
        "title": [
          ""
        ],
        "author": "",
        "author_s": "",
        "_version_": 1640106427129266200
      },
      {
        "id": "HBase.in.Action.pdf",
        "title": [
          "HBase in Action"
        ],
        "author": "Nick Dimiduk, Amandeep Khurana",
        "author_s": "Nick Dimiduk, Amandeep Khurana",
        "_version_": 1640106439293796400
      },
      {
        "id": "HBase:The_Definitive_Guide.pdf",
        "title": [
          "HBase: The Definitive Guide"
        ],
        "author": "Lars George",
        "author_s": "Lars George",
        "_version_": 1640106444193792000
      },
      {
        "id": "Cloudera_Hadoop_Test_Cases.docx",
        "author": "FeiLong, Li [DBA]",
        "author_s": "FeiLong, Li [DBA]",
        "_version_": 1640106445801259000
      }
    ]
  }
}
原文地址:https://www.cnblogs.com/ilifeilong/p/11250902.html