ES - 自动补全

拼音分词器

效果:

要实现根据字母做补全,就必须对文档按照拼音分词,在github 上已经有elasticsearch 的拼音分词插件:

  1. 拼音分词器下载地址:[https://github.com/medcl/elasticsearch-analysis-pinyin](https://github.com/medcl/elasticsearch-analysis-pinyin)

  2. 下载解压好后上传到es 插件目录:/var/lib/docker/volumes/es-plugins/_data

  3. 重启es

  4. 测试拼音分词器

POST /_analyze
{
  "analyzer": "pinyin",
  "text": "如家酒店真不错"
}

结果:

{
  "tokens" : [
    {
      "token" : "ru",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "rjjdzbc",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "jia",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "jiu",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "dian",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "zhen",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "bu",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "cuo",
      "start_offset" : 0,
      "end_offset" : 0,
      "type" : "word",
      "position" : 6
    }
  ]
}

自定义分词器

拼音分词器存在的问题:

1.不会进行分词
2.汉字被丢弃了
3.每一个字都转为拼音没有用

es中分词器(analyzer) 的组成包含三部分:

  • character filters:在tokenizer 之前对文本进行处理。例如删除字符、替换字符
  • tokenizer:将文本按照一定的规则切割成词条(term).例如keyword,就是不分词;还有ik_smart
  • token filter(词条过滤器): 将tokenizer 输出的词条做进一步处理。例如大小写转换、同义词处理、拼音处理等

如下:

我们可以在创建索引库时,通过settings 来配置自定义的analyzer(分词器):

// Create the `test` index with a custom analyzer that segments Chinese text with IK
// and then adds pinyin forms of every resulting term.
PUT /test
{
  "settings": {   
    "analysis": { // custom analysis components for this index
      "analyzer": { 
        "my_analyzer": {   // name of the custom analyzer (referenced from the mapping below)
          "tokenizer": "ik_max_word",  // IK tokenizer mode used to split the text into terms
          "filter": "py"   // token filter applied to the tokenizer output; "py" must match the filter name defined below
        } 
      },
      "filter": {
        "py": {   // option meanings are documented in the plugin's GitHub README
          "type": "pinyin",  
          "keep_full_pinyin": false,   // do not emit pinyin for each single character
          "keep_joined_full_pinyin": true, // emit the joined pinyin of the whole term (e.g. "rujia")
          "keep_original": true, // keep the original Chinese term alongside its pinyin
          "limit_first_letter_length": 16, // maximum length of the first-letter abbreviation token
          "remove_duplicated_term": true, // drop duplicated terms
          "none_chinese_pinyin_tokenize": false // do not split non-Chinese letters into pinyin terms
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "name":{
        "type": "text",
        // No search_analyzer is set, so my_analyzer is also applied to query text at search time.
        "analyzer": "my_analyzer"
      }
    }
  }
}

测试:

POST /test/_analyze
{
  "analyzer": "my_analyzer",
  "text": "如家酒店真不错"
}

返回结果:

{
  "tokens" : [
    {
      "token" : "如家",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "rujia",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "rj",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "酒店",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "jiudian",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "jd",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "真不错",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "zhenbucuo",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "zbc",
      "start_offset" : 4,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "真不",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "zhenbu",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "zb",
      "start_offset" : 4,
      "end_offset" : 6,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "不错",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "bucuo",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "bc",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 4
    }
  ]
}

以上创建的自定义分词器还存在问题:

POST /test/_doc/1
{
  "id": 1,
  "name": "狮子"
}
POST /test/_doc/2
{
  "id": 2,
  "name": "虱子"
}

GET /test/_search
{
  "query": {
    "match": {
      "name": "掉入狮子笼咋办"
    }
  }
}

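为什么搜索"狮子"会返回"虱子"?

因为字段没有单独指定搜索分词器时,搜索文本同样会用 my_analyzer 分词:"狮子"被转成拼音 shizi、sz,而"虱子"在倒排索引中也有相同的拼音词条,于是同音词被一并匹配。拼音分词器适合在创建倒排索引的时候使用,但不能在搜索的时候使用

因此字段在创建倒排索引时应该使用my_analyzer分词器;字段在搜索时应该使用ik_smart分词器(在mapping 中通过 search_analyzer 指定);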

自动补全查询

es 提供了 Completion Suggester 查询来实现自动补全功能。这个查询会匹配以用户输入内容开头的词条并返回。为了提高补全查询的效率,对于文档中字段的类型有一些约束:

  • 参与补全查询的字段类型必须是completion 类型
  • 字段的内容一般是用来补全的多个词条形成的数组

// Completion-suggester (autocomplete) query
// NOTE(review): the sample response shown for this request reports "_index": "test2", and the
// /test mapping defined earlier has no completion-typed "title" field — confirm the target index.
POST /test/_search
{
  "suggest": {
    "title_suggest": { // caller-chosen name; the response lists the suggestions under this key
      "text": "w", // prefix typed by the user
      "completion": {
        "field": "title", // field to take completions from
        "skip_duplicates": true, // skip duplicate suggestions
        "size": 10 // return at most the first 10 suggestions
      }
    }
  }
}

如上搜索关键字 w,会返回以 w 开头的补全词条,同时在 _source 中带出该文档完整的 title 数组:

{
  "took" : 8,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "title_suggest" : [
      {
        "text" : "w",
        "offset" : 0,
        "length" : 1,
        "options" : [
          {
            "text" : "WH-1000XM3",
            "_index" : "test2",
            "_type" : "_doc",
            "_id" : "gPiB74oBghB_R-1edDlI",
            "_score" : 1.0,
            "_source" : {
              "title" : [
                "Sony",
                "WH-1000XM3"
              ]
            }
          }
        ]
      }
    ]
  }
}

酒店数据修改

修改酒店的索引结构:

// Hotel index: full-text fields are indexed with IK + pinyin, and a dedicated
// completion field backs the autocomplete suggestions.
PUT /hotel
{
  "settings": {
    "analysis": {
      "analyzer": {
        // Index-time analyzer for full-text fields: IK segmentation followed by the pinyin filter.
        "text_anlyzer": {
          "tokenizer": "ik_max_word",
          "filter": "py"
        },
        // Analyzer for the completion field: the keyword tokenizer keeps each value as a
        // single term, which is then passed through the pinyin filter.
        "completion_analyzer": {
          "tokenizer": "keyword",
          "filter": "py"
        }
      },
      "filter": {
        // Pinyin token filter shared by both analyzers above.
        "py": {
          "type": "pinyin",
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "remove_duplicated_term": true,
          "none_chinese_pinyin_tokenize": false
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id":{
        "type": "keyword"
      },
      "name":{
        "type": "text",
        // Pinyin is produced only at index time; queries are analyzed with plain ik_smart
        // so that a Chinese query does not match homophones through their pinyin.
        "analyzer": "text_anlyzer",
        "search_analyzer": "ik_smart",
        "copy_to": "all"
      },
      "address":{
        "type": "keyword",
        "index": false
      },
      "price":{
        "type": "integer"
      },
      "score":{
        "type": "integer"
      },
      "brand":{
        "type": "keyword",
        "copy_to": "all"
      },
      "city":{
        "type": "keyword"
      },
      "starName":{
        "type": "keyword"
      },
      "business":{
        "type": "keyword",
        "copy_to": "all"
      },
      "location":{
        "type": "geo_point"
      },
      "pic":{
        "type": "keyword",
        "index": false
      },
      // Combined search field that receives name, brand and business via copy_to.
      "all":{
        "type": "text",
        "analyzer": "text_anlyzer",
        "search_analyzer": "ik_smart"
      },
      // Autocomplete field queried by the completion suggester.
      "suggestion":{
          "type": "completion",
          "analyzer": "completion_analyzer"
      }
    }
  }
}

修改索引库对应的java bean 新增 suggestion 字段,将品牌和商圈 作为自动补全的字段:

@Data
@NoArgsConstructor
public class HotelDoc {

    ...
    /** Autocomplete terms for this hotel: the brand followed by each business district. */
    private List<String> suggestion;

    /**
     * Builds the index document from a {@code Hotel} and derives the autocomplete terms.
     *
     * @param hotel source entity the document fields are taken from
     */
    public HotelDoc(Hotel hotel) {
        ...
        // The brand always comes first. The business value may hold several districts
        // separated by "/"; String.split returns the whole value as a single element
        // when no "/" is present, so one code path covers both cases.
        List<String> terms = new ArrayList<>();
        terms.add(this.brand);
        Collections.addAll(terms, this.business.split("/"));
        this.suggestion = terms;
    }
}

将mysql 中的数据批量导入 es:

测试DSL:

GET /hotel/_search
{
  "suggest": {
    "test_suggest": {
      "text": "h",
      "completion": {
        "field": "suggestion",
        "skip_duplicates": true,
        "size":10
      }
    }
  }
}

返回结果:

{
  "took" : 36,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "test_suggest" : [
      {
        "text" : "h",
        "offset" : 0,
        "length" : 1,
        "options" : [
          {
            "text" : "和颐",
            "_index" : "hotel",
            "_type" : "_doc",
            "_id" : "416268",
            "_score" : 1.0,
            "_source" : {
              "address" : "朝阳路高井176号",
              "brand" : "和颐",
              "business" : "国贸地区",
              "city" : "北京",
              "id" : 416268,
              "location" : "39.918277, 116.53015",
              "name" : "和颐酒店(北京传媒大学财满街店)",
              "pic" : "https://m.tuniucdn.com/fb2/t1/G6/M00/52/13/Cii-TF3eP5GIJIOLAAUwsIVCxdAAAGKXgK5a0IABTDI239_w200_h200_c1_t0.jpg",
              "price" : 524,
              "score" : 46,
              "starName" : "三钻",
              "suggestion" : [
                "和颐",
                "国贸地区"
              ]
            }
          },
          {
            "text" : "横沙岛",
            "_index" : "hotel",
            "_type" : "_doc",
            "_id" : "5872067",
            "_score" : 1.0,
            "_source" : {
              "address" : "陈家镇揽海路799弄",
              "brand" : "凯悦",
              "business" : "崇明岛/长兴岛/横沙岛",
              "city" : "上海",
              "id" : 5872067,
              "location" : "31.466563, 121.799671",
              "name" : "崇明金茂凯悦酒店",
              "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/fsKrbnNsmSsYnNLmhh3ZvVjZ5cA_w200_h200_c1_t0.jpg",
              "price" : 1024,
              "score" : 46,
              "starName" : "五钻",
              "suggestion" : [
                "凯悦",
                "崇明岛",
                "长兴岛",
                "横沙岛"
              ]
            }
          },
          {
            "text" : "汉庭",
            "_index" : "hotel",
            "_type" : "_doc",
            "_id" : "607915",
            "_score" : 1.0,
            "_source" : {
              "address" : "滨河大道6033号海滨广场国皇大厦3楼",
              "brand" : "汉庭",
              "business" : "皇岗口岸/福田口岸",
              "city" : "深圳",
              "id" : 607915,
              "location" : "22.528101, 114.064221",
              "name" : "汉庭酒店(深圳皇岗店)",
              "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/qMyCJVYuW21nsCeEBt8CMfmEhra_w200_h200_c1_t0.jpg",
              "price" : 313,
              "score" : 42,
              "starName" : "二钻",
              "suggestion" : [
                "汉庭",
                "皇岗口岸",
                "福田口岸"
              ]
            }
          },
          {
            "text" : "海岸城",
            "_index" : "hotel",
            "_type" : "_doc",
            "_id" : "1406627919",
            "_score" : 1.0,
            "_source" : {
              "address" : "海德一道88号中洲控股中心A座",
              "brand" : "万豪",
              "business" : "海岸城/后海",
              "city" : "深圳",
              "id" : 1406627919,
              "location" : "22.517293, 113.933785",
              "name" : "深圳中洲万豪酒店",
              "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/3wsinQAcuWtCdmv1yxauVG2PSYpC_w200_h200_c1_t0.jpg",
              "price" : 204,
              "score" : 47,
              "starName" : "五钻",
              "suggestion" : [
                "万豪",
                "海岸城",
                "后海"
              ]
            }
          },
          {
            "text" : "淮海路",
            "_index" : "hotel",
            "_type" : "_doc",
            "_id" : "60522",
            "_score" : 1.0,
            "_source" : {
              "address" : "汾阳路1号",
              "brand" : "豪生",
              "business" : "淮海路/新天地地区",
              "city" : "上海",
              "id" : 60522,
              "location" : "31.215497, 121.456297",
              "name" : "上海嘉豪淮海国际豪生酒店",
              "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/38UBi4QYuaF8jN94CxQ7tb7tjtmZ_w200_h200_c1_t0.jpg",
              "price" : 425,
              "score" : 45,
              "starName" : "四钻",
              "suggestion" : [
                "豪生",
                "淮海路",
                "新天地地区"
              ]
            }
          }
        ]
      }
    ]
  }
}

RestClient 实现自动补全

@Test // autocomplete (completion suggester) test
public void testSuggestion() throws IOException {
    SearchRequest request = new SearchRequest("hotel");

    // Register one completion suggestion named "mySuggestion" on the "suggestion" field:
    // prefix "hl", duplicates skipped, at most 10 options.
    SuggestBuilder suggestBuilder = new SuggestBuilder();
    suggestBuilder.addSuggestion(
            "mySuggestion",
            SuggestBuilders.completionSuggestion("suggestion")
                    .prefix("hl")
                    .skipDuplicates(true)
                    .size(10));
    request.source().suggest(suggestBuilder);

    SearchResponse response = client.search(request, RequestOptions.DEFAULT);

    // Look the suggestion up by the same name and print the text of every option.
    CompletionSuggestion completion = response.getSuggest().getSuggestion("mySuggestion");
    for (CompletionSuggestion.Entry.Option option : completion.getOptions()) {
        System.out.println(option.getText());
    }
}
posted @ 2023-10-02 11:20  chuangzhou  阅读(69)  评论(0)  编辑  收藏  举报