ik中文分词器及拼音分词器试用

安装

./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip

安装后需要重启elasticsearch服务

查看当前已安装插件

GET _cat/plugins

结果
node01 analysis-ik     5.6.4
node01 analysis-pinyin 5.6.4

测试中文分词器，支持ik_max_word和ik_smart两种方式

GET _analyze
{
  "analyzer":"ik_max_word",
  "text":"中华人民共和国国歌"
}

结果
{
  "tokens": [
    {
      "token": "中华人民共和国",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "中华人民",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "中华",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "华人",
      "start_offset": 1,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "人民共和国",
      "start_offset": 2,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "人民",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "共和国",
      "start_offset": 4,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "共和",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "国",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 8
    },
    {
      "token": "国歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 9
    }
  ]
}

使用ik_smart，则会尽可能少地返回词语：
{
  "tokens": [
    {
      "token": "中华人民共和国",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "国歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 1
    }
  ]
}

ik分词器支持自定义词库

vi config/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">zhouls.dic</entry>
         <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords"></entry>
        <!--用户可以在这里配置远程扩展字典 -->
        <!-- <entry key="remote_ext_dict">words_location</entry> -->
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>

#配置完成需要重启服务

简单测试拼音分词

PUT test08
{
  "index": {
    "analysis": {
      "analyzer": {
        "pinyin_analyzer": {
          "tokenizer": "my_pinyin",
          "filter": "word_delimiter"
        }
      },
      "tokenizer": {
        "my_pinyin": {
          "type": "pinyin",
          "first_letter": "none",
          "padding_char": " "
        }
      }
    }
  }
}

GET test08/_analyze
{
  "text":"刘德华",
  "analyzer":"pinyin_analyzer"
}

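结果（注意：结果中仍包含首字母词项 ldh。first_letter、padding_char 应为旧版插件的配置项，5.x 版本似乎不再识别；如需去掉首字母，请改用 "keep_first_letter": false）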
{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    }
  ]
}

同时支持中文和拼音的分词器

PUT test06
{
  "settings":{
    "number_of_shards":"1",
    "index.refresh_interval":"15s",
    "index":{
      "analysis":{
        "analyzer":{
           "ik_pinyin_analyzer":{
            "type":"custom",
            "tokenizer":"ik_smart",
            "filter":"pinyin_filter"
          }
        },
        "filter":{
          "pinyin_filter":{
            "type":"pinyin",
            "keep_first_letter": false
          }
        }
      }
    }
  },
  "mappings": {
    "doc":{
      "properties": {
        "name":{
          "type": "text",
          "analyzer": "ik_pinyin_analyzer"
        }
      }
    }
  }
}

POST test06/_analyze
{
  "analyzer": "ik_pinyin_analyzer",
  "text":"中华人民共和国国歌"
}

结果
{
  "tokens": [
    {
      "token": "zhong",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "gong",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "he",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "guo",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "guo",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "ge",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    }
  ]
}

拼音分词配置项

keep_first_letter启用此选项时，例如：刘德华> ldh，默认值：true
keep_separate_first_letter 启用该选项时，将单独保留每个字的首字母，例如：刘德华> l，d，h，默认值：false，注意：由于词项出现频率过高，查询结果可能过于模糊
limit_first_letter_length 设置first_letter结果的最大长度，默认值：16
keep_full_pinyin当启用该选项，例如：刘德华> [ liu，de，hua]，默认值：true
keep_joined_full_pinyin当启用此选项时，例如：刘德华> [ liudehua]，默认值：false
keep_none_chinese 在结果中保留非中文字母或数字，默认值：true
keep_none_chinese_together 将非中文字母保持在一起，默认值：true，如：DJ音乐家- > DJ，yin，yue，jia，当设置为false时，例如：DJ音乐家- > D，J，yin，yue，jia，注意：必须先启用keep_none_chinese
keep_none_chinese_in_first_letter 在首字母结果中保留非中文字母，例如：刘德华AT2016- > ldhat2016，默认值：true
keep_none_chinese_in_joined_full_pinyin 在连接后的完整拼音中保留非中文字母，例如：刘德华2016- > liudehua2016，默认值：false
none_chinese_pinyin_tokenize 如果非中文字母本身是拼音，则将其拆分为单独的拼音词项，默认值：true，如：liudehuaalibaba13zhuanghan- > liu，de，hua，a，li，ba，ba，13，zhuang，han，注意：应先启用keep_none_chinese和keep_none_chinese_together
keep_original 当启用此选项时，也会保留原始输入，默认值：false
lowercase 小写非中文字母，默认值：true
trim_whitespace 默认值：true
remove_duplicated_term 当启用此选项时，将删除重复词项以节省索引空间，例如：de的> de，默认值：false，注意：位置相关的查询可能会受到影响

参考文档：

https://blog.csdn.net/u013905744/article/details/80935846

https://www.cnblogs.com/xing901022/p/5910139.html

https://blog.csdn.net/qq_28018283/article/details/80396937

posted @ 2019-03-08 18:52 粒子先生阅读(3044) 评论(0) 收藏举报

刷新页面返回顶部

AI晓