Elasticsearch Chinese Analyzer: IK
1. Installation documentation
https://github.com/medcl/elasticsearch-analysis-ik
2. Releases
https://github.com/medcl/elasticsearch-analysis-ik/releases
3. Installing the plugin
Note: the plugin version must match your Elasticsearch version (6.5.1 here).

bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.5.1/elasticsearch-analysis-ik-6.5.1.zip
[es@bigdata-senior01 elasticsearch-6.5.1]$ ll plugins/analysis-ik/
total 1428
-rw-r--r-- 1 es es 263965 Dec 12 10:30 commons-codec-1.9.jar
-rw-r--r-- 1 es es  61829 Dec 12 10:30 commons-logging-1.2.jar
-rw-r--r-- 1 es es  54693 Dec 12 10:30 elasticsearch-analysis-ik-6.5.1.jar
-rw-r--r-- 1 es es 736658 Dec 12 10:30 httpclient-4.5.2.jar
-rw-r--r-- 1 es es 326724 Dec 12 10:30 httpcore-4.4.4.jar
-rw-r--r-- 1 es es   1805 Dec 12 10:30 plugin-descriptor.properties
Alternatively, you can download the zip yourself, unpack it, and copy the directory into plugins/.
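Either way, restart Elasticsearch so the plugin is picked up. To confirm it loaded, you can list the plugins on each node (a quick check; node names in the output will differ):

GET _cat/plugins?v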
4. Extending the dictionary
Under the ES directory, in config/analysis-ik/:
Create your own dictionary file (UTF-8 encoded, one word per line):
mkdir mydic
vi mydic/myword001.dic

Contents of myword001.dic:
魔兽世界
李云龙
嫦娥
Then edit the configuration file IKAnalyzer.cfg.xml to point at it:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- user extension dictionaries go here -->
    <entry key="ext_dict">mydic/myword001.dic</entry>
    <!-- user extension stopword dictionaries go here -->
    <entry key="ext_stopwords"></entry>
    <!-- remote extension dictionaries go here -->
    <!-- <entry key="remote_ext_dict">words_location</entry> -->
    <!-- remote extension stopword dictionaries go here -->
    <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>
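Restart Elasticsearch so the extension dictionary is loaded, then verify that a custom word now comes back as a single token (a quick check; without the dictionary, a name like 李云龙 would likely be split into smaller pieces):

GET _analyze
{
  "analyzer": "ik_smart",
  "text": "李云龙"
}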
The official README says:
IKAnalyzer.cfg.xml can be located at {conf}/analysis-ik/config/IKAnalyzer.cfg.xml or {plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- user extension dictionaries go here -->
    <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
    <!-- user extension stopword dictionaries go here -->
    <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
    <!-- remote extension dictionaries go here -->
    <entry key="remote_ext_dict">location</entry>
    <!-- remote extension stopword dictionaries go here -->
    <entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>
</properties>
Testing:
GET _analyze
{
  "analyzer": "ik_smart",
  "text": "魔兽世界"
}

Response:

{
  "tokens" : [
    {
      "token" : "魔兽世界",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    }
  ]
}
GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "魔兽世界"
}

Response:

{
  "tokens" : [
    {
      "token" : "魔兽世界",
      "start_offset" : 0,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "魔兽",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "世界",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 2
    }
  ]
}
ik_smart performs coarse-grained segmentation: text that has already been consumed by a token does not take part in further segmentation.
ik_max_word performs fine-grained segmentation: it emits every plausible word combination, including overlapping sub-words.
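The difference shows more clearly on a longer phrase. Comparing the two analyzers on the same input, ik_smart should return just a few coarse tokens, while ik_max_word should return a much longer list that includes the overlapping sub-words:

GET _analyze
{
  "analyzer": "ik_smart",
  "text": "中华人民共和国国歌"
}

GET _analyze
{
  "analyzer": "ik_max_word",
  "text": "中华人民共和国国歌"
}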
5. Using the analyzers
5.1 Setting the default analyzer directly in the index settings
PUT user
{
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "index" : {
      "analysis.analyzer.default.type": "ik_smart"
    }
  }
}
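Because the default analyzer is now set at the index level, calling _analyze against this index without naming an analyzer should use ik_smart (a quick check):

GET user/_analyze
{
  "text": "魔兽世界"
}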
PUT bus3
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "index" : {
      "analysis.analyzer.default.type": "ik_max_word",
      "analysis.search_analyzer.default.type": "ik_smart"
    }
  }
}
GET /bus3/_settings
Response:
{
  "bus3" : {
    "settings" : {
      "index" : {
        "number_of_shards" : "1",
        "provided_name" : "bus3",
        "creation_date" : "1545318988048",
        "analysis" : {
          "analyzer" : {
            "default" : {
              "type" : "ik_max_word"
            }
          },
          "search_analyzer" : {
            "default" : {
              "type" : "ik_smart"
            }
          }
        },
        "number_of_replicas" : "0",
        "uuid" : "dOU8yi5pRdi-0Akq_zCWtw",
        "version" : {
          "created" : "6050199"
        }
      }
    }
  }
}
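Note: the setting name Elasticsearch documents for an index-wide default search analyzer is analysis.analyzer.default_search; the analysis.search_analyzer.* key used above is stored in the settings (as the response shows), but as far as I can tell it is not a setting Elasticsearch interprets. A sketch using the documented name (the index name bus4 is only illustrative):

PUT bus4
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "index" : {
      "analysis.analyzer.default.type": "ik_max_word",
      "analysis.analyzer.default_search.type": "ik_smart"
    }
  }
}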
5.2 Setting the analyzer per field in the mapping
PUT bus
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "product": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_max_word"
        }
      }
    }
  }
}
GET bus/_mapping

Response:

{
  "bus" : {
    "mappings" : {
      "product" : {
        "properties" : {
          "name" : {
            "type" : "text",
            "analyzer" : "ik_max_word"
          }
        }
      }
    }
  }
}

(search_analyzer does not appear in the output because it is identical to analyzer, in which case Elasticsearch omits it.)
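The query tests below assume some test documents have already been indexed. Document 1, reconstructed from the _source shown in the responses, looks like this (the remaining documents are similar):

PUT bus/product/1
{
  "name": "公交车1路",
  "desc": "从东站到西站",
  "price": 10,
  "producer": "东部公交",
  "tags": [ "普通", "单层" ],
  "memo": "a test"
}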
Query test 1: querying with the ik_smart analyzer
GET /bus/_search
{
  "query": {
    "match": {
      "name": {
        "query": "公交车",
        "analyzer": "ik_smart"
      }
    }
  },
  "highlight": {
    "fields": { "name": {} }
  }
}

Response:

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 5,
    "max_score" : 1.8566245,
    "hits" : [
      {
        "_index" : "bus",
        "_type" : "product",
        "_id" : "1",
        "_score" : 1.8566245,
        "_source" : {
          "name" : "公交车1路",
          "desc" : "从东站到西站",
          "price" : 10,
          "producer" : "东部公交",
          "tags" : [ "普通", "单层" ],
          "memo" : "a test"
        },
        "highlight" : {
          "name" : [ "<em>公交车</em>1路" ]
        }
      }
    ]
  }
}
Query test 2: querying with the ik_max_word analyzer
GET /bus/_search
{
  "from": 0,
  "size": 1,
  "query": {
    "match": {
      "name": {
        "query": "公交车",
        "analyzer": "ik_max_word"
      }
    }
  },
  "highlight": {
    "fields": { "name": {} }
  }
}

Response:

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 5,
    "max_score" : 7.426498,
    "hits" : [
      {
        "_index" : "bus",
        "_type" : "product",
        "_id" : "1",
        "_score" : 7.426498,
        "_source" : {
          "name" : "公交车1路",
          "desc" : "从东站到西站",
          "price" : 10,
          "producer" : "东部公交",
          "tags" : [ "普通", "单层" ],
          "memo" : "a test"
        },
        "highlight" : {
          "name" : [ "<em>公交</em><em>车</em>1路" ]
        }
      }
    ]
  }
}
Note that the highlighted fragments differ: the ik_smart query matched the whole indexed token 公交车, while the ik_max_word query also matched the sub-word tokens 公交 and 车 (hence the higher score). As a general rule, use ik_max_word for indexing and ik_smart for search.
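Applying that rule to the per-field mapping from section 5.2 gives the following (a sketch; the index name bus5 is only illustrative):

PUT bus5
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "product": {
      "properties": {
        "name": {
          "type": "text",
          "analyzer": "ik_max_word",
          "search_analyzer": "ik_smart"
        }
      }
    }
  }
}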