Elasticsearch实现搜索推荐词
本篇介绍的是基于Elasticsearch实现搜索推荐词,其中需要用到Elasticsearch的pinyin插件以及ik分词插件,代码的实现这里提供了java跟C#的版本方便大家参考。
1.实现的结果
①当搜索【qiy】的时候,能匹配企业、祈愿等
②当搜索【qi业】的时候,只能匹配到企业;如果没有企业,将使用模糊查询,匹配祈愿。
③当搜索【q业】的时候结果同②。
④当搜索【企y】或【企ye】的时候结果同②。
⑤当搜索【qy】的时候,能匹配企业、祈愿等。
2.实现的逻辑
中文匹配前缀==》全拼匹配前缀==》拼音首字母匹配前缀==》拼音模糊匹配前缀
优先级从左到右,当前面三个有结果的时候不建议用模糊匹配,这样结果更加精确。比如需要获取8个推荐词,先获取中文的,如果足够8个将不再获取之后的匹配结果。但是当模糊匹配之前已经存在匹配结果了,即使数量没有达到8个,也不再继续获取模糊匹配结果。
3.插件准备
ik分词插件安装相对简单,网上教程也多,这里不做介绍。这里讲解下pinyin插件:官方版本的拼音插件分词后不会保留单个汉字,处理结果只有拼音,这样会出现同音字匹配,结果不准确。
这里感谢小伙伴分享的拼音插件修改方法:https://www.cnblogs.com/danvid/p/10691547.html。
按照里面的操作处理后的插件将实现:
企业画报:{"qi","企","ye","业","hua","画","bao","报"}
拼音插件的各项具体属性参考:https://blog.csdn.net/a1148233614/article/details/80280024,里面有详细介绍。
4.Elasticsearch创建index
这里使用的ES版本为7.0.1,7.x已不再支持在mapping中指定type(映射类型),因此mappings下直接写properties,创建代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | PUT /suggest_tset { "settings" : { "number_of_shards" : 1, "number_of_replicas" : 0, "analysis" : { "analyzer" : { "prefix_pinyin_analyzer" : { "tokenizer" : "standard" , "filter" : [ "lowercase" , "prefix_pinyin" ] }, "full_pinyin_analyzer" : { "tokenizer" : "standard" , "filter" : [ "lowercase" , "full_pinyin" ] }, "like_pinyin_analyzer" : { "tokenizer" : "standard" , "filter" : [ "lowercase" , "like_pinyin" ] } }, "filter" : { "_pattern" : { "type" : "pattern_capture" , "preserve_original" : true , "patterns" : [ "([0-9])" , "([a-z])" ] }, "prefix_pinyin" : { "type" : "pinyin" , "keep_first_letter" : "true" , "keep_full_pinyin" : "false" , "none_chinese_pinyin_tokenize" : "false" , "keep_separate_chinese" : "true" , "keep_original" : "false" }, "full_pinyin" : { "type" : "pinyin" , "keep_first_letter" : "false" , "keep_full_pinyin" : "true" , "keep_original" : "false" , "keep_separate_chinese" : "true" , "keep_none_chinese_in_first_letter" : "false" }, "like_pinyin" : { "type" : "pinyin" , "keep_first_letter" : "true" , "keep_full_pinyin" : "true" , "keep_joined_full_pinyin" : "false" , "keep_original" : "false" , "keep_separate_chinese" : "false" , "keep_none_chinese_in_first_letter" : "false" } } } }, "mappings" : { "dynamic" : "false" , "properties" : { "kwsuggest" : { "fields" : { "suggestText" : { "type" : "completion" , "analyzer" : "standard" , "preserve_separators" : "false" , "preserve_position_increments" : "true" , "max_input_length" : 50 }, "prefix_pinyin" : { "type" : "completion" , "analyzer" : "prefix_pinyin_analyzer" , "search_analyzer" : "standard" , "preserve_separators" : "false" }, "full_pinyin" : { "type" : "completion" , "analyzer" : 
"full_pinyin_analyzer" , "search_analyzer" : "standard" , "preserve_separators" : "false" }, "like_pinyin" : { "type" : "completion" , "analyzer" : "like_pinyin_analyzer" , "preserve_separators" : "false" } }, "type" : "text" } } } } |
这里插入几条测试数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | POST _bulk/?refresh= true { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "企业规划" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "祈愿设计 完美无瑕" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "悬崖的图片 美景" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "县衙地址 那里呢" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "悬崖风景图" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "起夜的风光 真的美" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "起夜第二个词 测试使用" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "需要一半留下一半打一字谜" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "许亚为" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "许雅非测试" } { "index" : { "_index" : "suggest_tset" , "_type" : "_doc" } } { "kwsuggest" : "徐杨是谁" } |
下面为测试的查询语句
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | GET /suggest_tset/_search { "suggest" : { "suggestText" : { "prefix" : "qi业" , "completion" : { "field" : "kwsuggest.suggestText" , "skip_duplicates" : true } }, "full_pinyin" : { "prefix" : "qi业" , "completion" : { "field" : "kwsuggest.full_pinyin" , "skip_duplicates" : true } }, "prefix_pinyin" : { "prefix" : "qi业" , "completion" : { "field" : "kwsuggest.prefix_pinyin" , "skip_duplicates" : true } }, "like_pinyin" : { "prefix" : "qi业" , "completion" : { "field" : "kwsuggest.like_pinyin" , "skip_duplicates" : true , "fuzzy" : { "fuzziness" : 1 } } } } } |
当输入查询条件为【qiy】的时候,结果为:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | { "took" : 17, "timed_out" : false , "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null , "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qiy" , "offset" : 0, "length" : 3, "options" : [ { "text" : "起夜的风光 真的美" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "-jgnlHMBSEyTxFiDO4lU" , "_score" : 1.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "aDg3lHMBSEyTxFiDXprV" , "_score" : 1.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "like_pinyin" : [ { "text" : "qiy" , "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "9TgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "企业规划" } }, { "text" : "祈愿设计 这是啥呢" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "9jgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "祈愿设计 这是啥呢" } }, { "text" : "起夜的风光 真的美" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "-jgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "aDg3lHMBSEyTxFiDXprV" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qiy" , "offset" : 0, "length" : 3, "options" : [ ] } ], "suggestText" : [ { "text" : "qiy" , "offset" : 0, "length" : 3, "options" : [ ] } ] } } |
输入【qi业】的查询结果为
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | { "took" : 2, "timed_out" : false , "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null , "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qi业" , "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "9TgnlHMBSEyTxFiDO4lU" , "_score" : 1.0, "_source" : { "kwsuggest" : "企业规划" } } ] } ], "like_pinyin" : [ { "text" : "qi业" , "offset" : 0, "length" : 3, "options" : [ { "text" : "企业规划" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "9TgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "企业规划" } }, { "text" : "祈愿设计 这是啥呢" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "9jgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "祈愿设计 这是啥呢" } }, { "text" : "起夜的风光 真的美" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "-jgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "aDg3lHMBSEyTxFiDXprV" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qi业" , "offset" : 0, "length" : 3, "options" : [ ] } ], "suggestText" : [ { "text" : "qi业" , "offset" : 0, "length" : 3, "options" : [ ] } ] } } |
输入【qy】的结果为
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | { "took" : 1, "timed_out" : false , "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null , "hits" : [ ] }, "suggest" : { "full_pinyin" : [ { "text" : "qy" , "offset" : 0, "length" : 2, "options" : [ ] } ], "like_pinyin" : [ { "text" : "qy" , "offset" : 0, "length" : 2, "options" : [ { "text" : "起夜的风光 真的美" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "-jgnlHMBSEyTxFiDO4lU" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "aDg3lHMBSEyTxFiDXprV" , "_score" : 2.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "prefix_pinyin" : [ { "text" : "qy" , "offset" : 0, "length" : 2, "options" : [ { "text" : "起夜的风光 真的美" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "-jgnlHMBSEyTxFiDO4lU" , "_score" : 1.0, "_source" : { "kwsuggest" : "起夜的风光 真的美" } }, { "text" : "起夜第二个词 测试使用" , "_index" : "suggest_tset" , "_type" : "_doc" , "_id" : "aDg3lHMBSEyTxFiDXprV" , "_score" : 1.0, "_source" : { "kwsuggest" : "起夜第二个词 测试使用" } } ] } ], "suggestText" : [ { "text" : "qy" , "offset" : 0, "length" : 2, "options" : [ ] } ] } } |
5.java版本代码
这里使用elasticsearch-rest-high-level-client
application.yml添加配置
1 2 3 | # ES配置 elasticsearch: ipAddress: [127.0.0.1:9200] |
添加配置类
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | @Component @Configuration @ConfigurationProperties(prefix = "elasticsearch" ) @Data public class ElasticsearchRestClientConfig { private Logger logger = LoggerFactory.getLogger(getClass()); private static final int ADDRESS_LENGTH = 2; private static final String HTTP_SCHEME = "http" ; /** * 使用冒号隔开ip和端口 */ public String[] ipAddress; @Bean public RestClientBuilder restClientBuilder() { HttpHost[] hosts = Arrays.stream(ipAddress) .map( this ::makeHttpHost) .filter(Objects::nonNull) .toArray(HttpHost[]:: new ); logger.debug( "hosts:{}" , Arrays.toString(hosts)); return RestClient.builder(hosts); } @Bean(name = "highLevelClient" ) public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) { return new RestHighLevelClient(restClientBuilder); } private HttpHost makeHttpHost(String s) { assert StringUtils.isNotEmpty(s); String[] address = s.split( ":" ); if (address.length == ADDRESS_LENGTH) { String ip = address[0]; int port = Integer.parseInt(address[1]); return new HttpHost(ip, port, HTTP_SCHEME); } else { return null ; } } } |
实现的代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | @Service public class KwSuggestService implements IKwSuggest { @Autowired RestHighLevelClient highLevelClient; @Override public List<String> GetKwSuggestList(String kw){ SearchRequest searchRequest = new SearchRequest( "suggest_tset" ); SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); SuggestBuilder suggestBuilder= new SuggestBuilder(); suggestBuilder.addSuggestion( "suggestText" , SuggestBuilders.completionSuggestion( "kwsuggest.suggestText" ).prefix(kw).skipDuplicates( true ).size(5)); suggestBuilder.addSuggestion( "full_pinyin" , SuggestBuilders.completionSuggestion( "kwsuggest.full_pinyin" ).prefix(kw).skipDuplicates( true ).size(5)); suggestBuilder.addSuggestion( "prefix_pinyin" , SuggestBuilders.completionSuggestion( "kwsuggest.prefix_pinyin" ).prefix(kw).skipDuplicates( true ).size(5)); suggestBuilder.addSuggestion( "like_pinyin" , SuggestBuilders.completionSuggestion( "kwsuggest.like_pinyin" ).prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates( true ).size(5)); sourceBuilder.suggest(suggestBuilder); sourceBuilder.timeout( new TimeValue(10, TimeUnit.SECONDS)); searchRequest.source(sourceBuilder); List<String> result = new ArrayList<>(); List<String> suggestionList= Arrays.asList( "suggestText" , "full_pinyin" , "prefix_pinyin" , "like_pinyin" ); try { SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT); Suggest suggestions = response.getSuggest(); Integer index = 1; for (String suggestionType : suggestionList){ CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType); for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) { for (CompletionSuggestion.Entry.Option option : entry) { String suggestText = option.getHit().getSourceAsMap(). 
get ( "kwsuggest" ).toString(); result.add(suggestText); } } // 按照中文匹配、全拼匹配、拼音首字母匹配、模糊匹配的顺序,结果大于5的时候返回结果,根据自己业务需要判断这个返回的数量 if (result.size()>=5){ break ; } // 中文匹配,全拼匹配以及拼音首字母匹配存在结果的,不需要模糊匹配 if (index==3 && result.size()>0){ break ; } // 超过3个字模糊匹配不准确 if (kw.length()>3 && result.size()==0){ break ; } } return result; } catch (IOException e) { e.printStackTrace(); return new ArrayList<>(); } } } |
6.C#代码实现
C#使用的是NEST
public partial class ElasticFactory
{
    /// <summary>Maximum number of suggestions returned to the caller.</summary>
    private const int MaxSuggestItems = 5;

    /// <summary>Keywords longer than this skip the fuzzy stage because the matches are too imprecise.</summary>
    private const int FuzzyMaxKeywordLength = 3;

    /// <summary>Suggester names in priority order; the last one is the fuzzy suggester.</summary>
    private static readonly string[] SuggestKeys = { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };

    /// <summary>
    /// Shared client. NEST recommends reusing one ConnectionSettings/ElasticClient
    /// rather than building them per request.
    /// </summary>
    private static readonly Lazy<ElasticClient> SuggestClient = new Lazy<ElasticClient>(CreateSuggestClient);

    /// <summary>
    /// Returns up to five keyword suggestions for <c>request.q</c>.
    /// Stages run in priority order: Chinese prefix, full pinyin, pinyin first
    /// letters, then fuzzy pinyin. Fuzzy results are used only when the first
    /// three stages found nothing and the keyword has at most three characters.
    /// </summary>
    /// <param name="request">The suggestion request; <c>q</c> is the text typed by the user.</param>
    /// <returns>
    /// A response with <c>code</c> 1 and the suggestions on success, or
    /// <c>code</c> 0 and <c>msg</c> set when the query fails. An empty keyword
    /// returns the untouched default response.
    /// </returns>
    public ExternalServiceResponse<KeywordsSuggestResponseDataEntity> GetKeywordsSuggest(ElasticKeywordsSuggestRequest request)
    {
        var result = new ExternalServiceResponse<KeywordsSuggestResponseDataEntity>();
        try
        {
            if (string.IsNullOrEmpty(request.q))
            {
                return result;
            }

            var search = new SearchDescriptor<object>()
                // Only the field read back by AddSuggestItems needs to be returned.
                .Source(r => r.Includes(f => f.Fields("kwsuggest")))
                .Suggest(s => s
                    .Completion(SuggestKeys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates())
                    .Completion(SuggestKeys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(SuggestKeys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(SuggestKeys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates()
                        .Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))));

            var esResult = SuggestClient.Value.Search<dynamic>(s => search);

            // NEST does not throw on a failed call by default; IsValid reports it.
            if (esResult == null || !esResult.IsValid)
            {
                result.code = 0;
                result.msg = "查询失败";
                return result;
            }

            result.code = 1;
            result.data = new KeywordsSuggestResponseDataEntity();

            if (esResult.Suggest == null)
            {
                result.data.num = 0;
                return result;
            }

            var items = new List<KeywordsSuggestResponseItemEntity>();
            var fuzzyStage = SuggestKeys.Length - 1;
            for (var stage = 0; stage < SuggestKeys.Length; stage++)
            {
                AddSuggestItems(esResult.Suggest, SuggestKeys[stage], items);

                // Enough suggestions gathered: trim to the limit and stop.
                if (items.Count >= MaxSuggestItems)
                {
                    items = items.Take(MaxSuggestItems).ToList();
                    break;
                }

                // Before falling through to the fuzzy stage: skip it when an exact
                // stage already matched, or when the keyword is too long for fuzzy
                // matching to be meaningful.
                var fuzzyIsNext = stage == fuzzyStage - 1;
                if (fuzzyIsNext && (items.Count > 0 || request.q.Length > FuzzyMaxKeywordLength))
                {
                    break;
                }
            }

            result.data.items = items;
            result.data.num = items.Count;
        }
        catch (Exception ex)
        {
            result.code = 0;
            result.msg = ex.Message;
        }

        return result;
    }

    /// <summary>Builds the client for the suggestion index on the local node.</summary>
    private static ElasticClient CreateSuggestClient()
    {
        var nodes = new[] { new Uri("http://127.0.0.1:9200") };
        var pool = new StaticConnectionPool(nodes);
        var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
        return new ElasticClient(settings);
    }

    /// <summary>
    /// Appends the <c>kwsuggest</c> value of each option of the named suggester
    /// to <paramref name="items"/>, skipping keywords already present.
    /// </summary>
    /// <param name="suggest">The suggest section of the search response.</param>
    /// <param name="key">Name of the suggester whose options are read.</param>
    /// <param name="items">The list being filled, in priority order.</param>
    private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
    {
        var suggestions = suggest[key];
        if (suggestions == null || suggestions.Length == 0)
        {
            return;
        }

        foreach (var hit in suggestions[0].Options)
        {
            if (hit.Source == null)
            {
                continue;
            }

            string kwSource = hit.Source["kwsuggest"];

            // Skip missing values and keywords a higher-priority stage already added.
            if (string.IsNullOrEmpty(kwSource) || items.Any(m => m.kw == kwSource))
            {
                continue;
            }

            items.Add(new KeywordsSuggestResponseItemEntity() { kw = kwSource });
        }
    }
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 地球OL攻略 —— 某应届生求职总结
· 提示词工程——AI应用必不可少的技术
· 字符编码:从基础到乱码解决
· Open-Sora 2.0 重磅开源!