Elasticsearch 中文分词器IK

1、安装说明

https://github.com/medcl/elasticsearch-analysis-ik

2、release版本

https://github.com/medcl/elasticsearch-analysis-ik/releases

3、安装插件

bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.5.1/elasticsearch-analysis-ik-6.5.1.zip

[es@bigdata-senior01 elasticsearch-6.5.1]$ ll plugins/analysis-ik/
总用量 1428
-rw-r--r-- 1 es es 263965 12月 12 10:30 commons-codec-1.9.jar
-rw-r--r-- 1 es es  61829 12月 12 10:30 commons-logging-1.2.jar
-rw-r--r-- 1 es es  54693 12月 12 10:30 elasticsearch-analysis-ik-6.5.1.jar
-rw-r--r-- 1 es es 736658 12月 12 10:30 httpclient-4.5.2.jar
-rw-r--r-- 1 es es 326724 12月 12 10:30 httpcore-4.4.4.jar
-rw-r--r-- 1 es es   1805 12月 12 10:30 plugin-descriptor.properties

也可以自己下载与 Elasticsearch 版本一致的 zip 包，解压后复制到 plugins/analysis-ik 目录下即可。

4、扩展词库

在es目录下config/analysis-ik/中

新建自己的词库，utf8编码

mkdir mydic
vi mydic/myword001.dic
魔兽世界
李云龙
嫦娥

修改配置文件 IKAnalyzer.cfg.xml，在 ext_dict 中填入新建词库的相对路径：

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
	<comment>IK Analyzer 扩展配置</comment>
	<!--用户可以在这里配置自己的扩展字典 -->
	<entry key="ext_dict">mydic/myword001.dic</entry>
	 <!--用户可以在这里配置自己的扩展停止词字典-->
	<entry key="ext_stopwords"></entry>
	<!--用户可以在这里配置远程扩展字典 -->
	<!-- <entry key="remote_ext_dict">words_location</entry> -->
	<!--用户可以在这里配置远程扩展停止词字典-->
	<!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>

官网说明：

IKAnalyzer.cfg.xml can be located at {conf}/analysis-ik/config/IKAnalyzer.cfg.xml or {plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
	<comment>IK Analyzer 扩展配置</comment>
	<!--用户可以在这里配置自己的扩展字典 -->
	<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
	 <!--用户可以在这里配置自己的扩展停止词字典-->
	<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
 	<!--用户可以在这里配置远程扩展字典 -->
	<entry key="remote_ext_dict">location</entry>
 	<!--用户可以在这里配置远程扩展停止词字典-->
	<entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>
</properties>

测试：

GET _analyze
{
  "analyzer": "ik_smart",
  "text": "魔兽世界"
}

{
  "tokens" : [
    {
      "token" : "魔兽世界",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    }
  ]
}

GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "魔兽世界"
}

{
  "tokens" : [
    {
      "token" : "魔兽世界",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "魔兽",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "世界",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 2
    }
  ]
}

ik_smart 是粗粒度分词，分过的词不再参与分词。
ik_max_word 是细粒度分词，会根据可能的词进行各种组合。

5、使用分词
5.1直接在settings里设置缺省的分词器

PUT user
{
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index": {
      "analysis.analyzer.default.type": "ik_smart"
    }
  }
}

PUT bus3
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "index": {
      "analysis.analyzer.default.type": "ik_max_word",
      "analysis.search_analyzer.default.type": "ik_smart"
    }
  }
}

GET /bus3/_settings
返回：
{
  "bus3" : {
    "settings" : {
      "index" : {
        "number_of_shards" : "1",
        "provided_name" : "bus3",
        "creation_date" : "1545318988048",
        "analysis" : {
          "analyzer" : {
            "default" : {
              "type" : "ik_max_word"
            }
          },
          "search_analyzer" : {
            "default" : {
              "type" : "ik_smart"
            }
          }
        },
        "number_of_replicas" : "0",
        "uuid" : "dOU8yi5pRdi-0Akq_zCWtw",
        "version" : {
          "created" : "6050199"
        }
      }
    }
  }
}

5.2 在mapping里对每个字段设置

PUT bus
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "product":{
       "properties": {
      "name":{
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_max_word"
      }
    }
    }
   
  }
}

GET bus/_mapping

{
  "bus" : {
    "mappings" : {
      "product" : {
        "properties" : {
          "name" : {
            "type" : "text",
            "analyzer" : "ik_max_word"
          }
        }
      }
    }
  }
}

查询测试1：查询使用分词器ik_smart

GET /bus/_search
{
  "query": {
    "match": {
      "name": {
        "query": "公交车"
        , "analyzer": "ik_smart"
      }
    }
  },
  "highlight": {
    "fields": {"name": {}}
  }
}

返回：
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 5,
    "max_score" : 1.8566245,
    "hits" : [
      {
        "_index" : "bus",
        "_type" : "product",
        "_id" : "1",
        "_score" : 1.8566245,
        "_source" : {
          "name" : "公交车1路",
          "desc" : "从东站到西站",
          "price" : 10,
          "producer" : "东部公交",
          "tags" : [
            "普通",
            "单层"
          ],
          "memo" : "a test"
        },
        "highlight" : {
          "name" : [
            "<em>公交车</em>1路"
          ]
        }
      }
    ]
  }
}

查询测试2：查询使用分词器ik_max_word

GET /bus/_search
{
  "from": 0, "size": 1, 
  "query": {
    "match": {
      "name": {
        "query": "公交车"
        , "analyzer": "ik_max_word"
      }
    }
  },
  "highlight": {
    "fields": {"name": {}}
  }
}
返回：
{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 5,
    "max_score" : 7.426498,
    "hits" : [
      {
        "_index" : "bus",
        "_type" : "product",
        "_id" : "1",
        "_score" : 7.426498,
        "_source" : {
          "name" : "公交车1路",
          "desc" : "从东站到西站",
          "price" : 10,
          "producer" : "东部公交",
          "tags" : [
            "普通",
            "单层"
          ],
          "memo" : "a test"
        },
        "highlight" : {
          "name" : [
            "<em>公交</em><em>车</em>1路"
          ]
        }
      }
    ]
  }
}

可以看到两次查询的高亮部分是不一样的：ik_smart 把“公交车”作为一个词匹配，ik_max_word 则拆成了“公交”和“车”。一般情况下，建立索引时分词用 ik_max_word，查询时分词用 ik_smart。

posted @ 2018-12-12 13:27 我是属车的阅读(982) 评论(0) 编辑收藏举报

刷新页面返回顶部

我是属车的

Elasticsearch 中文分词器IK

公告