ES - Auto-completion
Pinyin Tokenizer
To implement completion based on typed letters, documents must be tokenized into pinyin. A pinyin analysis plugin for Elasticsearch is available on GitHub:
- Pinyin tokenizer download: https://github.com/medcl/elasticsearch-analysis-pinyin
- Download and extract it, then upload it to the ES plugin directory: /var/lib/docker/volumes/es-plugins/_data
- Restart ES
- Test the pinyin tokenizer:
POST /_analyze
{
"analyzer": "pinyin",
"text": "如家酒店真不错"
}
Result:
{
"tokens" : [
{
"token" : "ru",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 0
},
{
"token" : "rjjdzbc",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 0
},
{
"token" : "jia",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 1
},
{
"token" : "jiu",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 2
},
{
"token" : "dian",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 3
},
{
"token" : "zhen",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 4
},
{
"token" : "bu",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 5
},
{
"token" : "cuo",
"start_offset" : 0,
"end_offset" : 0,
"type" : "word",
"position" : 6
}
]
}
Custom Analyzer
Problems with using the pinyin tokenizer on its own:
1. It does not perform word segmentation.
2. The original Chinese characters are discarded.
3. Every single character is converted to pinyin, which is not useful.
An analyzer in ES is composed of three parts:
- character filters: process the text before the tokenizer, e.g. removing or replacing characters
- tokenizer: splits the text into terms according to certain rules, e.g. keyword (no splitting at all) or ik_smart
- tokenizer filter (token filter): further processes the terms produced by the tokenizer, e.g. lowercasing, synonym handling, pinyin conversion
The three parts run in that order. When creating an index, we can configure a custom analyzer through settings:
PUT /test
{
"settings": {
"analysis": { //自定义分词器
"analyzer": {
"my_analyzer": { //给自定义分词器起个名字
"tokenizer": "ik_max_word", //分词模式
"filter": "py" // tokenizer filter,对 tokenizer 输出的词条做进一步处理。对于py 是在下文指定需要保持一致
}
},
"filter": {
"py": { //里面的属性在github可找到
"type": "pinyin",
"keep_full_pinyin": false, //关闭每个字转拼音
"keep_joined_full_pinyin": true, //开启词转拼音
"keep_original": true,
"limit_first_letter_length": 16,
"remove_duplicated_term": true,
"none_chinese_pinyin_tokenize": false
}
}
}
},
"mappings": {
"properties": {
"name":{
"type": "text",
"analyzer": "my_analyzer"
}
}
}
}
Test it:
POST /test/_analyze
{
"analyzer": "my_analyzer",
"text": "如家酒店真不错"
}
Result:
{
"tokens" : [
{
"token" : "如家",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "rujia",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "rj",
"start_offset" : 0,
"end_offset" : 2,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "酒店",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "jiudian",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "jd",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "真不错",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "zhenbucuo",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "zbc",
"start_offset" : 4,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "真不",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "zhenbu",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "zb",
"start_offset" : 4,
"end_offset" : 6,
"type" : "CN_WORD",
"position" : 3
},
{
"token" : "不错",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "bucuo",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "bc",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
}
]
}
The custom analyzer created above still has a problem:
POST /test/_doc/1
{
"id": 1,
"name": "狮子"
}
POST /test/_doc/2
{
"id": 2,
"name": "虱子"
}
GET /test/_search
{
"query": {
"match": {
"name": "掉入狮子笼咋办"
}
}
}
Why does a search for 狮子 (lion) also return 虱子 (louse)? Both words are converted to the same pinyin terms (shizi / sz) when the inverted index is built, and because the search text is analyzed with the same my_analyzer, the query is also turned into those pinyin terms and therefore matches both documents.
The pinyin analyzer is suitable when building the inverted index, but it should not be used at search time.
So the field should be indexed with my_analyzer and searched with ik_smart, i.e. set "analyzer": "my_analyzer" together with "search_analyzer": "ik_smart" in the field mapping (this is exactly what the hotel index below does).
Auto-completion Query
ES provides the Completion Suggester query to implement auto-completion. It matches and returns terms that start with the text the user has typed. To keep completion queries efficient, there are some constraints on the field being queried (a setup sketch follows this list):
- the field used in the completion query must be of type completion
- the field content is usually an array of the terms to complete on
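The results further down show that this example uses an index named test2 with a title field of type completion. Below is a minimal sketch, assuming that index name and reusing the RestHighLevelClient client field from the test class at the end of this post, of how such an index could be created and populated; the method name and sample document are illustrative only:
import java.io.IOException;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;

@Test // hypothetical setup for the completion example below
public void testCreateCompletionIndex() throws IOException {
    // "title" must be mapped as type completion for suggest queries to work
    CreateIndexRequest createRequest = new CreateIndexRequest("test2");
    createRequest.source("{\"mappings\": {\"properties\": {\"title\": {\"type\": \"completion\"}}}}",
            XContentType.JSON);
    client.indices().create(createRequest, RequestOptions.DEFAULT);

    // the field content is an array of the terms that completion should match against
    IndexRequest indexRequest = new IndexRequest("test2")
            .source("{\"title\": [\"Sony\", \"WH-1000XM3\"]}", XContentType.JSON);
    client.index(indexRequest, RequestOptions.DEFAULT);
}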
// auto-completion query
POST /test2/_search
{
"suggest": {
"title_suggest": {
"text": "w", // 关键字
"completion": {
"field": "title", // 补全字段
"skip_duplicates": true, // 跳过重复的
"size": 10 // 获取前10条结果
}
}
}
}
As shown above, a search for the letter "w" matches the entry "WH-1000XM3", and the whole title array is returned in the option's _source:
{
"took" : 8,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"title_suggest" : [
{
"text" : "w",
"offset" : 0,
"length" : 1,
"options" : [
{
"text" : "WH-1000XM3",
"_index" : "test2",
"_type" : "_doc",
"_id" : "gPiB74oBghB_R-1edDlI",
"_score" : 1.0,
"_source" : {
"title" : [
"Sony",
"WH-1000XM3"
]
}
}
]
}
]
}
}
Modifying the Hotel Data
Modify the hotel index structure:
// hotel data index
PUT /hotel
{
"settings": {
"analysis": {
"analyzer": {
"text_anlyzer": {
"tokenizer": "ik_max_word",
"filter": "py"
},
"completion_analyzer": {
"tokenizer": "keyword",
"filter": "py"
}
},
"filter": {
"py": {
"type": "pinyin",
"keep_full_pinyin": false,
"keep_joined_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"remove_duplicated_term": true,
"none_chinese_pinyin_tokenize": false
}
}
}
},
"mappings": {
"properties": {
"id":{
"type": "keyword"
},
"name":{
"type": "text",
"analyzer": "text_anlyzer",
"search_analyzer": "ik_smart",
"copy_to": "all"
},
"address":{
"type": "keyword",
"index": false
},
"price":{
"type": "integer"
},
"score":{
"type": "integer"
},
"brand":{
"type": "keyword",
"copy_to": "all"
},
"city":{
"type": "keyword"
},
"starName":{
"type": "keyword"
},
"business":{
"type": "keyword",
"copy_to": "all"
},
"location":{
"type": "geo_point"
},
"pic":{
"type": "keyword",
"index": false
},
"all":{
"type": "text",
"analyzer": "text_anlyzer",
"search_analyzer": "ik_smart"
},
"suggestion":{
"type": "completion",
"analyzer": "completion_analyzer"
}
}
}
}
Modify the Java bean that maps to the index: add a suggestion field and use the brand and the business district as the auto-completion terms:
@Data
@NoArgsConstructor
public class HotelDoc {
...
private List<String> suggestion;
public HotelDoc(Hotel hotel) {
...
this.suggestion = new ArrayList<>();
if (this.business.contains("/")) {
// multiple business districts separated by "/" each become a suggestion term
this.suggestion.add(this.brand);
String[] split = this.business.split("/");
Collections.addAll(this.suggestion, split);
} else {
this.suggestion.add(this.brand);
this.suggestion.add(this.business);
}
}
}
Bulk-import the hotel data from MySQL into ES so that the new suggestion field is populated:
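A minimal sketch of how this could be done, assuming a MyBatis-Plus service named hotelService that reads the hotel table from MySQL and fastjson for JSON serialization (both assumptions), and reusing the RestHighLevelClient client field from the test class below:
import java.io.IOException;
import java.util.List;
import com.alibaba.fastjson.JSON;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.common.xcontent.XContentType;

@Test // re-import all hotels so the new suggestion field is written to ES
public void testBulkImport() throws IOException {
    // hotelService is an assumed MyBatis-Plus IService<Hotel>; list() reads every row from MySQL
    List<Hotel> hotels = hotelService.list();

    BulkRequest request = new BulkRequest();
    for (Hotel hotel : hotels) {
        // the HotelDoc constructor above builds the suggestion list from brand and business
        HotelDoc hotelDoc = new HotelDoc(hotel);
        request.add(new IndexRequest("hotel")
                .id(hotelDoc.getId().toString()) // assumes HotelDoc copies the numeric id from Hotel
                .source(JSON.toJSONString(hotelDoc), XContentType.JSON));
    }
    client.bulk(request, RequestOptions.DEFAULT);
}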
Test DSL:
GET /hotel/_search
{
"suggest": {
"test_suggest": {
"text": "h",
"completion": {
"field": "suggestion",
"skip_duplicates": true,
"size":10
}
}
}
}
Result:
{
"took" : 36,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"test_suggest" : [
{
"text" : "h",
"offset" : 0,
"length" : 1,
"options" : [
{
"text" : "和颐",
"_index" : "hotel",
"_type" : "_doc",
"_id" : "416268",
"_score" : 1.0,
"_source" : {
"address" : "朝阳路高井176号",
"brand" : "和颐",
"business" : "国贸地区",
"city" : "北京",
"id" : 416268,
"location" : "39.918277, 116.53015",
"name" : "和颐酒店(北京传媒大学财满街店)",
"pic" : "https://m.tuniucdn.com/fb2/t1/G6/M00/52/13/Cii-TF3eP5GIJIOLAAUwsIVCxdAAAGKXgK5a0IABTDI239_w200_h200_c1_t0.jpg",
"price" : 524,
"score" : 46,
"starName" : "三钻",
"suggestion" : [
"和颐",
"国贸地区"
]
}
},
{
"text" : "横沙岛",
"_index" : "hotel",
"_type" : "_doc",
"_id" : "5872067",
"_score" : 1.0,
"_source" : {
"address" : "陈家镇揽海路799弄",
"brand" : "凯悦",
"business" : "崇明岛/长兴岛/横沙岛",
"city" : "上海",
"id" : 5872067,
"location" : "31.466563, 121.799671",
"name" : "崇明金茂凯悦酒店",
"pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/fsKrbnNsmSsYnNLmhh3ZvVjZ5cA_w200_h200_c1_t0.jpg",
"price" : 1024,
"score" : 46,
"starName" : "五钻",
"suggestion" : [
"凯悦",
"崇明岛",
"长兴岛",
"横沙岛"
]
}
},
{
"text" : "汉庭",
"_index" : "hotel",
"_type" : "_doc",
"_id" : "607915",
"_score" : 1.0,
"_source" : {
"address" : "滨河大道6033号海滨广场国皇大厦3楼",
"brand" : "汉庭",
"business" : "皇岗口岸/福田口岸",
"city" : "深圳",
"id" : 607915,
"location" : "22.528101, 114.064221",
"name" : "汉庭酒店(深圳皇岗店)",
"pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/qMyCJVYuW21nsCeEBt8CMfmEhra_w200_h200_c1_t0.jpg",
"price" : 313,
"score" : 42,
"starName" : "二钻",
"suggestion" : [
"汉庭",
"皇岗口岸",
"福田口岸"
]
}
},
{
"text" : "海岸城",
"_index" : "hotel",
"_type" : "_doc",
"_id" : "1406627919",
"_score" : 1.0,
"_source" : {
"address" : "海德一道88号中洲控股中心A座",
"brand" : "万豪",
"business" : "海岸城/后海",
"city" : "深圳",
"id" : 1406627919,
"location" : "22.517293, 113.933785",
"name" : "深圳中洲万豪酒店",
"pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/3wsinQAcuWtCdmv1yxauVG2PSYpC_w200_h200_c1_t0.jpg",
"price" : 204,
"score" : 47,
"starName" : "五钻",
"suggestion" : [
"万豪",
"海岸城",
"后海"
]
}
},
{
"text" : "淮海路",
"_index" : "hotel",
"_type" : "_doc",
"_id" : "60522",
"_score" : 1.0,
"_source" : {
"address" : "汾阳路1号",
"brand" : "豪生",
"business" : "淮海路/新天地地区",
"city" : "上海",
"id" : 60522,
"location" : "31.215497, 121.456297",
"name" : "上海嘉豪淮海国际豪生酒店",
"pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/38UBi4QYuaF8jN94CxQ7tb7tjtmZ_w200_h200_c1_t0.jpg",
"price" : 425,
"score" : 45,
"starName" : "四钻",
"suggestion" : [
"豪生",
"淮海路",
"新天地地区"
]
}
}
]
}
]
}
}
Implementing Auto-completion with RestClient
@Test // auto-completion test
public void testSuggestion() throws IOException {
SearchRequest searchRequest = new SearchRequest("hotel");
searchRequest.source().suggest(new SuggestBuilder().addSuggestion("mySuggestion",
SuggestBuilders.completionSuggestion("suggestion")
.prefix("hl")
.skipDuplicates(true)
.size(10)));
SearchResponse response = client.search(searchRequest, RequestOptions.DEFAULT);
//parse the response: fetch the suggestion by the name given above and print each option's text
Suggest suggest = response.getSuggest();
CompletionSuggestion completionSuggestion = suggest.getSuggestion("mySuggestion");
List<CompletionSuggestion.Entry.Option> options = completionSuggestion.getOptions();
options.stream().map(CompletionSuggestion.Entry.Option::getText).forEach(System.out::println);
}
This article is from 博客园 (cnblogs). Author: chuangzhou. When reposting, please credit the original link: https://www.cnblogs.com/czzz/p/17739805.html