Elasticsearch实现搜索推荐词
本篇介绍的是基于Elasticsearch实现搜索推荐词,其中需要用到Elasticsearch的pinyin插件以及ik分词插件,代码的实现这里提供了java跟C#的版本方便大家参考。
1.实现的结果
①当搜索【qiy】的时候,能匹配企业、祈愿等
②当搜索【qi业】的时候,只能匹配到企业;如果没有企业,将使用模糊查询,匹配祈愿。
③当搜索【q业】的时候结果同②。
④当搜索【企y】或【企ye】的时候结果同②。
⑤当搜索【qy】的时候,能匹配企业、祈愿等。
2.实现的逻辑
中文匹配前缀==》全拼匹配前缀==》拼音首字母匹配前缀==》拼音模糊匹配前缀
优先级从左到右,当前面三个有结果的时候不建议用模糊匹配,这样结果更加精确。比如需要获取8个推荐词,先获取中文的,如果足够8个将不再获取之后的匹配结果。但是当模糊匹配之前已经存在匹配结果了,即使数量没有达到8个,也不再继续获取模糊匹配结果。
3.插件准备
ik分词插件安装相对简单,网上教程也多,这里不做介绍。这里讲解下pinyin插件,官方版本的拼音插件不支持中文,处理结果只有拼音的,这样会出现同音字匹配,结果不准确。
这里感谢小伙伴分享的拼音插件修改方法:https://www.cnblogs.com/danvid/p/10691547.html。
按照里面的操作处理后的插件将实现:
企业画报:{"qi","企","ye","业","hua","画","bao","报"}
拼音插件的各项具体属性参考:https://blog.csdn.net/a1148233614/article/details/80280024,里面有详细介绍。
4.Elasticsearch创建index
这里使用的ES版本为7.0.1,该版本创建索引时不再需要指定mapping type(映射类型),创建代码如下:
PUT /suggest_tset
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "prefix_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "prefix_pinyin"]
        },
        "full_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "full_pinyin"]
        },
        "like_pinyin_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "like_pinyin"]
        }
      },
      "filter": {
        "_pattern": {
          "type": "pattern_capture",
          "preserve_original": true,
          "patterns": ["([0-9])", "([a-z])"]
        },
        "prefix_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": false,
          "none_chinese_pinyin_tokenize": false,
          "keep_separate_chinese": true,
          "keep_original": false
        },
        "full_pinyin": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": false,
          "keep_separate_chinese": true,
          "keep_none_chinese_in_first_letter": false
        },
        "like_pinyin": {
          "type": "pinyin",
          "keep_first_letter": true,
          "keep_full_pinyin": true,
          "keep_joined_full_pinyin": false,
          "keep_original": false,
          "keep_separate_chinese": false,
          "keep_none_chinese_in_first_letter": false
        }
      }
    }
  },
  "mappings": {
    "dynamic": false,
    "properties": {
      "kwsuggest": {
        "type": "text",
        "fields": {
          "suggestText": {
            "type": "completion",
            "analyzer": "standard",
            "preserve_separators": false,
            "preserve_position_increments": true,
            "max_input_length": 50
          },
          "prefix_pinyin": {
            "type": "completion",
            "analyzer": "prefix_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "full_pinyin": {
            "type": "completion",
            "analyzer": "full_pinyin_analyzer",
            "search_analyzer": "standard",
            "preserve_separators": false
          },
          "like_pinyin": {
            "type": "completion",
            "analyzer": "like_pinyin_analyzer",
            "preserve_separators": false
          }
        }
      }
    }
  }
}
这里插入几条测试数据
POST /suggest_tset/_bulk?refresh=true
{ "index": {} }
{ "kwsuggest": "企业规划" }
{ "index": {} }
{ "kwsuggest": "祈愿设计 完美无瑕" }
{ "index": {} }
{ "kwsuggest": "悬崖的图片 美景" }
{ "index": {} }
{ "kwsuggest": "县衙地址 那里呢" }
{ "index": {} }
{ "kwsuggest": "悬崖风景图" }
{ "index": {} }
{ "kwsuggest": "起夜的风光 真的美" }
{ "index": {} }
{ "kwsuggest": "起夜第二个词 测试使用" }
{ "index": {} }
{ "kwsuggest": "需要一半留下一半打一字谜" }
{ "index": {} }
{ "kwsuggest": "许亚为" }
{ "index": {} }
{ "kwsuggest": "许雅非测试" }
{ "index": {} }
{ "kwsuggest": "徐杨是谁" }
下面为测试的查询语句
GET /suggest_tset/_search
{
  "suggest": {
    "suggestText": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.suggestText",
        "skip_duplicates": true
      }
    },
    "full_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.full_pinyin",
        "skip_duplicates": true
      }
    },
    "prefix_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.prefix_pinyin",
        "skip_duplicates": true
      }
    },
    "like_pinyin": {
      "prefix": "qi业",
      "completion": {
        "field": "kwsuggest.like_pinyin",
        "skip_duplicates": true,
        "fuzzy": {
          "fuzziness": 1
        }
      }
    }
  }
}
当输入查询条件为【qiy】的时候,结果为:
{ "took" : 17, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null, "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qiy", "offset" : 0, "length" : 3, "options" : [ { "text" : "起夜的风光 真的美", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "-jgnlHMBSEyTxFiDO4lU", "_score" : 1.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "aDg3lHMBSEyTxFiDXprV", "_score" : 1.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "like_pinyin" : [ { "text" : "qiy", "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "9TgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "企业规划" } }, { "text" : "祈愿设计 这是啥呢", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "9jgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "祈愿设计 这是啥呢" } }, { "text" : "起夜的风光 真的美", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "-jgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "aDg3lHMBSEyTxFiDXprV", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qiy", "offset" : 0, "length" : 3, "options" : [ ] } ], "suggestText" : [ { "text" : "qiy", "offset" : 0, "length" : 3, "options" : [ ] } ] } }
输入【qi业】的查询结果为
{ "took" : 2, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null, "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qi业", "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "9TgnlHMBSEyTxFiDO4lU", "_score" : 1.0, "_source" : { "kwsuggest" : "企业规划" } } ] } ], "like_pinyin" : [ { "text" : "qi业", "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "9TgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "企业规划" } }, { "text" : "祈愿设计 这是啥呢", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "9jgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "祈愿设计 这是啥呢" } }, { "text" : "起夜的风光 真的美", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "-jgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "aDg3lHMBSEyTxFiDXprV", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qi业", "offset" : 0, "length" : 3, "options" : [ ] } ], "suggestText" : [ { "text" : "qi业", "offset" : 0, "length" : 3, "options" : [ ] } ] } }
输入【qy】的结果为
{ "took" : 1, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null, "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qy", "offset" : 0, "length" : 2, "options" : [ ] } ], "like_pinyin" : [ { "text" : "qy", "offset" : 0, "length" : 2, "options" : [ { "text" : "起夜的风光 真的美", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "-jgnlHMBSEyTxFiDO4lU", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "aDg3lHMBSEyTxFiDXprV", "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qy", "offset" : 0, "length" : 2, "options" : [ { "text" : "起夜的风光 真的美", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "-jgnlHMBSEyTxFiDO4lU", "_score" : 1.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用", "_index" : "suggest_tset", "_type" : "_doc", "_id" : "aDg3lHMBSEyTxFiDXprV", "_score" : 1.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "suggestText" : [ { "text" : "qy", "offset" : 0, "length" : 2, "options" : [ ] } ] } }
5.java版本代码
这里使用elasticsearch-rest-high-level-client
application.yml添加配置
# Elasticsearch connection settings
elasticsearch:
  # One "host:port" entry per node; bound to ElasticsearchRestClientConfig.ipAddress
  ipAddress:
    - "127.0.0.1:9200"
添加配置类
/**
 * Spring configuration that builds the Elasticsearch REST clients from the
 * {@code elasticsearch.ipAddress} property.
 */
@Component
@Configuration
@ConfigurationProperties(prefix = "elasticsearch")
@Data
public class ElasticsearchRestClientConfig {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Number of parts expected after splitting an address on ':' (host and port). */
    private static final int ADDRESS_LENGTH = 2;

    private static final String HTTP_SCHEME = "http";

    /**
     * Node addresses; each entry separates host and port with a colon.
     */
    public String[] ipAddress;

    /**
     * Builds a {@link RestClientBuilder} for every well-formed entry of {@link #ipAddress}.
     * Entries that are empty or not of the form {@code host:port} are skipped.
     *
     * @return builder configured with the parsed hosts
     * @throws IllegalStateException if {@code elasticsearch.ipAddress} was not configured
     */
    @Bean
    public RestClientBuilder restClientBuilder() {
        if (ipAddress == null) {
            // Fail with a readable message instead of a NullPointerException from Arrays.stream.
            throw new IllegalStateException("elasticsearch.ipAddress is not configured");
        }
        HttpHost[] hosts = Arrays.stream(ipAddress)
                .map(this::makeHttpHost)
                .filter(Objects::nonNull)
                .toArray(HttpHost[]::new);
        logger.debug("hosts:{}", Arrays.toString(hosts));
        return RestClient.builder(hosts);
    }

    /**
     * Wraps the low-level builder in a {@link RestHighLevelClient}.
     *
     * @param restClientBuilder builder produced by {@link #restClientBuilder()}
     * @return the high-level client bean named {@code highLevelClient}
     */
    @Bean(name = "highLevelClient")
    public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) {
        return new RestHighLevelClient(restClientBuilder);
    }

    /**
     * Parses a single {@code host:port} entry.
     *
     * @param s the configured address
     * @return the parsed host, or {@code null} when the entry is empty or does not
     *         split into exactly two parts (the caller filters out nulls)
     */
    private HttpHost makeHttpHost(String s) {
        // Explicit check instead of `assert`: assertions are disabled by default at runtime,
        // so a null entry would otherwise reach s.split(...) and throw.
        if (!StringUtils.isNotEmpty(s)) {
            return null;
        }
        String[] address = s.split(":");
        if (address.length != ADDRESS_LENGTH) {
            return null;
        }
        String ip = address[0];
        int port = Integer.parseInt(address[1]);
        return new HttpHost(ip, port, HTTP_SCHEME);
    }
}
实现的代码:
@Service public class KwSuggestService implements IKwSuggest { @Autowired RestHighLevelClient highLevelClient; @Override public List<String> GetKwSuggestList(String kw){ SearchRequest searchRequest = new SearchRequest("suggest_tset"); SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); SuggestBuilder suggestBuilder=new SuggestBuilder(); suggestBuilder.addSuggestion("suggestText", SuggestBuilders.completionSuggestion("kwsuggest.suggestText").prefix(kw).skipDuplicates(true).size(5)); suggestBuilder.addSuggestion("full_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.full_pinyin").prefix(kw).skipDuplicates(true).size(5)); suggestBuilder.addSuggestion("prefix_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.prefix_pinyin").prefix(kw).skipDuplicates(true).size(5)); suggestBuilder.addSuggestion("like_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.like_pinyin").prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates(true).size(5)); sourceBuilder.suggest(suggestBuilder); sourceBuilder.timeout(new TimeValue(10, TimeUnit.SECONDS)); searchRequest.source(sourceBuilder); List<String> result = new ArrayList<>(); List<String> suggestionList= Arrays.asList("suggestText","full_pinyin","prefix_pinyin","like_pinyin"); try { SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT); Suggest suggestions = response.getSuggest(); Integer index = 1; for(String suggestionType : suggestionList){ CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType); for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) { for (CompletionSuggestion.Entry.Option option : entry) { String suggestText = option.getHit().getSourceAsMap().get("kwsuggest").toString(); result.add(suggestText); } } // 按照中文匹配、全拼匹配、拼音首字母匹配、模糊匹配的顺序,结果大于5的时候返回结果,根据自己业务需要判断这个返回的数量 if(result.size()>=5){ break; } // 中文匹配,全拼匹配以及拼音首字母匹配存在结果的,不需要模糊匹配 if(index==3 && result.size()>0){ break; } // 超过3个字模糊匹配不准确 if(kw.length()>3 && 
result.size()==0){ break; } } return result; } catch (IOException e) { e.printStackTrace(); return new ArrayList<>(); } } }
6.C#代码实现
C#使用的是NEST
/// <summary>
/// Keyword suggestion lookup against the <c>suggest_tset</c> index using NEST.
/// </summary>
public partial class ElasticFactory
{
    /// <summary>
    /// Returns up to 5 distinct keyword suggestions for <c>request.q</c>.
    /// Suggesters are consulted in priority order: Chinese prefix, full pinyin,
    /// pinyin first letters, then fuzzy pinyin. The fuzzy suggester is used only when
    /// the first three produced nothing and the input is at most 3 characters long.
    /// </summary>
    /// <param name="request">Request carrying the typed text in <c>q</c>.</param>
    /// <returns>
    /// A response with <c>code = 1</c> and the suggestions on success, or
    /// <c>code = 0</c> and a message when the query fails or throws. When <c>q</c>
    /// is null or empty the freshly constructed response is returned unchanged.
    /// </returns>
    public ExternalServiceResponse<KeywordsSuggestResponseDataEntity> GetKeywordsSuggest(ElasticKeywordsSuggestRequest request)
    {
        var result = new ExternalServiceResponse<KeywordsSuggestResponseDataEntity>();
        try
        {
            if (string.IsNullOrEmpty(request.q))
                return result;

            // The original allocated a zero-length array and then wrote to index 0,
            // which always throws IndexOutOfRangeException.
            var nodes = new[] { new Uri("http://127.0.0.1:9200") };
            var pool = new StaticConnectionPool(nodes);
            var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
            // NOTE(review): a client is created on every call; consider sharing one instance.
            var client = new ElasticClient(settings);

            // Priority order; the fuzzy suggester must stay last.
            string[] keys = new[] { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };

            SearchDescriptor<object> search = new SearchDescriptor<object>();
            search
                // Only "kwsuggest" is read from each hit, so that is the field to include.
                .Source(r => r
                    .Includes(f => f
                        .Fields("kwsuggest")
                    )
                )
                .Suggest(s => s
                    .Completion(keys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates(true))
                    .Completion(keys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates(true)
                        .Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))));

            var esResult = client.Search<dynamic>(s => search);

            // Search returns a response object even on failure, so check IsValid
            // rather than null.
            if (esResult != null && esResult.IsValid)
            {
                result.code = 1;
                result.data = new KeywordsSuggestResponseDataEntity();

                // 1. Take the Chinese prefix matches first.
                // 2. If fewer than 5, add full-pinyin matches.
                // 3. If still fewer than 5, add first-letter pinyin matches.
                // 4. Only when none of the above matched, fall back to fuzzy matching.
                if (esResult.Suggest != null)
                {
                    result.data.items = new List<KeywordsSuggestResponseItemEntity>();
                    int index = 1;
                    foreach (var key in keys)
                    {
                        AddSuggestItems(esResult.Suggest, key, result.data.items);

                        // Steps 1-3: stop as soon as 5 suggestions are available.
                        if (index <= 3 && result.data.items.Count >= 5)
                        {
                            result.data.items = result.data.items.Take(5).ToList();
                            break;
                        }
                        // After step 3, any exact result wins over imprecise fuzzy matches.
                        if (index == 3 && result.data.items.Count > 0)
                        {
                            break;
                        }
                        // Inputs longer than 3 characters match too poorly with fuzzy search.
                        if (index == 3 && request.q.Length > 3)
                        {
                            break;
                        }
                        index++;
                    }
                    result.data.num = result.data.items.Count;
                }
                else
                {
                    result.data.num = 0;
                }
            }
            else
            {
                result.code = 0;
                result.msg = "查询失败";
            }
        }
        catch (Exception ex)
        {
            result.code = 0;
            result.msg = ex.Message;
        }
        return result;
    }

    /// <summary>
    /// Appends the options of the named suggester to <paramref name="items"/>,
    /// skipping keywords that are already present.
    /// </summary>
    /// <param name="suggest">Suggest section of the search response.</param>
    /// <param name="key">Name of the suggester to read.</param>
    /// <param name="items">List that receives the new keywords.</param>
    private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
    {
        var suggestFullPinyin = suggest[key];
        if (suggestFullPinyin == null)
        {
            return;
        }

        // Iterate the entries instead of indexing [0] so an empty entry list cannot throw.
        foreach (var entry in suggestFullPinyin)
        {
            foreach (var hit in entry.Options)
            {
                if (hit.Source == null)
                {
                    continue;
                }
                string kwSource = hit.Source["kwsuggest"];
                if (string.IsNullOrEmpty(kwSource))
                {
                    continue;
                }
                // Do not add a keyword that is already in the list.
                if (items.Any(m => m.kw == kwSource))
                {
                    continue;
                }
                items.Add(new KeywordsSuggestResponseItemEntity()
                {
                    kw = kwSource
                });
            }
        }
    }
}