代码改变世界

搜索联想词(提示词)实现

2022-04-07 11:15  倪平凡  阅读(1367)  评论(0编辑  收藏  举报

1.应用场景

  作为购物网站,搜索功能包括:底纹搜索,即后台配置的搜索框默认搜索信息;输入搜索,即用户输入商品关键字进行模糊搜索;最近搜索,即保存用户搜索过的商品记录;热门搜索,即后台配置的商品搜索信息。对于用户来说,要搜索到想要的商品,需要尽可能输入完整的商品名称;而潜在的主流族裔用户很多时候只是模糊地记住品牌名称和商品类别,因此他们更依赖搜索联想词功能,来更快地搜索到想要的商品和推荐商品。

2.技术选型

  ElasticSearch

3.创建索引

PUT keywords_index_test
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "default": {
            "tokenizer": "ik_max_word"
          },
          "pinyin_analyzer": {
            "tokenizer": "shopmall_pinyin"
          },
          "first_py_letter_analyzer": {
            "tokenizer": "first_py_letter"
          },
          "full_pinyin_letter_analyzer": {
            "tokenizer": "full_pinyin_letter"
          }
        },
        "tokenizer": {
          "shopmall_pinyin": {
            "keep_joined_full_pinyin": "true",
            "keep_first_letter": "true",
            "keep_separate_first_letter": "false",
            "lowercase": "true",
            "type": "pinyin",
            "limit_first_letter_length": "16",
            "keep_original": "true",
            "keep_full_pinyin": "true",
            "keep_none_chinese_in_joined_full_pinyin": "true"
          },
          "first_py_letter": {
            "type": "pinyin",
            "keep_first_letter": true,
            "keep_full_pinyin": false,
            "keep_original": false,
            "limit_first_letter_length": 16,
            "lowercase": true,
            "trim_whitespace": true,
            "keep_none_chinese_in_first_letter": false,
            "none_chinese_pinyin_tokenize": false,
            "keep_none_chinese": true,
            "keep_none_chinese_in_joined_full_pinyin": true
          },
          "full_pinyin_letter": {
            "type": "pinyin",
            "keep_separate_first_letter": false,
            "keep_full_pinyin": false,
            "keep_original": false,
            "limit_first_letter_length": 16,
            "lowercase": true,
            "keep_first_letter": false,
            "keep_none_chinese_in_first_letter": false,
            "none_chinese_pinyin_tokenize": false,
            "keep_none_chinese": true,
            "keep_joined_full_pinyin": true,
            "keep_none_chinese_in_joined_full_pinyin": true
          }
        }
      }
    }
  },
  "mappings": {
    "doc": {
      "_all": {
        "enabled": false
      },
      "properties": {
        "keywords": {
          "type": "completion",
          "fields": {
            "pinyin": {
              "type": "completion",
              "analyzer": "pinyin_analyzer"
            },
            "keyword_pinyin": {
              "type": "completion",
              "analyzer": "full_pinyin_letter_analyzer"
            },
            "keyword_first_py": {
              "type": "completion",
              "analyzer": "first_py_letter_analyzer"
            }
          }
        }
      }
    }
  }
}

 

4.初始化词库

  取网站近三个月内每天搜索量排名前 4000、且搜索有结果的搜索词,作为初始词库。

5.服务端接口开发

  

/**
 * Endpoint that returns search suggestions for the text typed so far.
 *
 * @param token    value of the {@code token} request header; not read by this method
 * @param keywords optional {@code keywords} request parameter, defaulting to an empty string
 * @return the list produced by {@code searchService.getSearchSuggestions}, wrapped with
 *         {@code BaseResponse.send}
 */
@Override
public BaseResponse<List<String>> getSearchSug(
        @RequestHeader("token") String token,
        @RequestParam(required = false, value = "keywords", defaultValue = "") String keywords) {
    return BaseResponse.send(searchService.getSearchSuggestions(keywords));
}



/**
 * Looks up completion suggestions for the given user input.
 *
 * <p>Backslashes and double quotes are removed from the input and the remainder is trimmed.
 * Input made up solely of Chinese characters is matched against the {@code keywords} field;
 * everything else is matched against {@code keywords.keyword_pinyin}. When a letters-only
 * input yields nothing, a second lookup is made against {@code keywords.keyword_first_py}.
 *
 * @param keywords raw text typed by the user; may be {@code null}
 * @return a mutable list of suggestion texts; empty when the cleaned input is empty
 */
public List<String> getSearchSuggestions(String keywords) {
    // Strip the unwanted characters BEFORE trimming: input such as `"  "` or `a "` would
    // otherwise leave a whitespace-only or whitespace-padded query after cleaning.
    String text = keywords == null
            ? ""
            : keywords.replace("\\", "").replace("\"", "").trim();
    if (text.isEmpty()) {
        return new ArrayList<String>();
    }

    String index = "keywords_index_test";
    String type = "doc";
    QueryBuilder queryBuilder = QueryBuilders.matchAllQuery();

    boolean lettersOnly = checkLetter(text);
    // Only pure-Chinese input uses the base field; letters and mixed input use full pinyin.
    String field = checkChinese(text) ? "keywords" : "keywords.keyword_pinyin";

    Set<String> results = getSuggestWord(index, type, field, text, queryBuilder);
    // No hit for letters-only input: retry treating the input as pinyin first letters.
    if (results.isEmpty() && lettersOnly) {
        results = getSuggestWord(index, type, "keywords.keyword_first_py", text, queryBuilder);
    }

    return new ArrayList<String>(results);
}

/**
 * Runs a completion-suggester request against one field and returns the distinct
 * suggestion texts.
 *
 * <p>The caller chooses the field: in this file pure-Chinese input goes to
 * {@code keywords}, pinyin or mixed input goes to {@code keywords.keyword_pinyin}, and
 * {@code keywords.keyword_first_py} is the first-letter fallback.
 *
 * <p>The request size is set to 0 so that no search hits are returned, and
 * {@code _source} fetching is disabled so that only the suggestion texts are requested.
 *
 * <p>Results are collected into a {@link TreeSet}, so duplicates are dropped and the
 * returned texts are in natural string order rather than in the order the response
 * listed them.
 *
 * @param index        name of the index to query
 * @param type         mapping type to query
 * @param field        completion field to suggest on
 * @param text         prefix text to complete
 * @param queryBuilder query attached to the search request
 * @return the distinct suggestion texts; empty when the response carries no suggestion
 */
public Set<String> getSuggestWord(String index, String type, String field, String text, QueryBuilder queryBuilder) {
    // De-duplicate on the client side: per the original author's note, ES 5.2 cannot filter
    // duplicate suggestions itself, while ES 6.1+ offers skip_duplicates (default false).
    Set<String> results = new TreeSet<String>();

    CompletionSuggestionBuilder suggestionBuilder = new CompletionSuggestionBuilder(field);
    suggestionBuilder.text(text);
    suggestionBuilder.size(20);

    SuggestBuilder suggestBuilder = new SuggestBuilder();
    suggestBuilder.addSuggestion("my-suggest-1", suggestionBuilder);

    SearchRequestBuilder searchRequestBuilder = client.prepareSearch(index).setTypes(type);
    searchRequestBuilder.setExplain(false);
    searchRequestBuilder.setSize(0);
    searchRequestBuilder.setQuery(queryBuilder);
    searchRequestBuilder.suggest(suggestBuilder);
    searchRequestBuilder.setFetchSource(false);

    SearchResponse resp = searchRequestBuilder.execute().actionGet();

    // Guard against a response without a suggest section or without our named suggestion,
    // instead of failing with a NullPointerException.
    Suggest sugg = resp.getSuggest();
    if (sugg == null) {
        return results;
    }
    CompletionSuggestion suggestion = sugg.getSuggestion("my-suggest-1");
    if (suggestion == null) {
        return results;
    }

    for (CompletionSuggestion.Entry entry : suggestion.getEntries()) {
        for (Suggest.Suggestion.Entry.Option option : entry.getOptions()) {
            results.add(option.getText().toString());
        }
    }
    return results;
}

/** Matches a non-empty string made up only of ASCII letters; compiled once and reused. */
private static final Pattern LETTERS_ONLY_PATTERN = Pattern.compile("^[A-Za-z]+$");

/**
 * Checks whether the input consists only of ASCII letters ({@code A-Z}, {@code a-z}).
 *
 * @param cardNum text to check; may be {@code null}
 * @return {@code true} if the text is non-empty and contains only ASCII letters;
 *         {@code false} otherwise, including for {@code null} or empty input
 */
public static boolean checkLetter(String cardNum) {
    // A null input is simply "not letters" rather than a NullPointerException.
    return cardNum != null && LETTERS_ONLY_PATTERN.matcher(cardNum).matches();
}

/** Matches a non-empty string of characters in U+4E00..U+9FA5; compiled once and reused. */
private static final Pattern CHINESE_ONLY_PATTERN = Pattern.compile("^[\u4E00-\u9FA5]+$");

/**
 * Checks whether the input consists only of Chinese characters in the range
 * U+4E00 to U+9FA5.
 *
 * @param chinese text to check; may be {@code null}
 * @return {@code true} if the text is non-empty and every character is in that range;
 *         {@code false} otherwise, including for {@code null} or empty input
 */
public static boolean checkChinese(String chinese) {
    // A null input is simply "not Chinese" rather than a NullPointerException.
    return chinese != null && CHINESE_ONLY_PATTERN.matcher(chinese).matches();
}