python版elasticsearch入门笔记
Elasticsearch 是一个分布式、高扩展、高实时的搜索与数据分析引擎。Elasticsearch 的实现原理主要分为以下几个步骤:首先,用户将数据提交到 Elasticsearch 中;然后通过分词器(analyzer)对相应的语句进行分词,并将分词结果及其权重一并存入索引;当用户搜索数据时,再根据权重对结果打分、排序,并将结果返回给用户。以下案例使用的版本为 7.16.2。注意:Python 环境中安装的 elasticsearch 客户端版本最好与 ES 服务端版本一致,避免出现兼容性问题。
目录:
1、测试样例说明
2、获取相似问embedding
3、索引表创建
4、数据更新导入案例
5、索引表信息查看
6、数据搜索查询
a、通过主键查询,get方法通过主键返回对应数据
b、完全匹配,term方法类似sql的“=”,terms类似sql的in
c、相似匹配,BM25
d、相似匹配加embedding余弦相似度排序
e、正则查询,该方法可用于搜索推荐
1、测试样例说明
测试样例是 Claude 2 生成的18个标准问,每个标准问有5个相似问,一共90条测试数据;标准问与相似问均有对应的id,共四列数据
import pandas as pd

# Load the sample data into a DataFrame; the CSV is decoded as GB18030.
dt = pd.read_csv(
    './kn_test.csv',
    encoding='gb18030',
)
2、获取相似问embedding,这里以macbert为例,需提前部署bert-serving服务;获取similar_question的向量表征后,数据格式如下所示
from bert_serving.client import BertClient

# Placeholders: bind IP / PORT / PORT_OUT to the real bert-serving address
# before running. As written, these self-assignments only succeed when the
# names are already defined in the session.
IP = IP
PORT = PORT
PORT_OUT = PORT_OUT
BC = BertClient(ip=IP, port=PORT, port_out=PORT_OUT)

# Encode all similar questions in a single call and store one vector per row.
questions = list(dt['similar_question'])
vectors = BC.encode(questions)
dt['vec'] = list(vectors)

# Duplicate the question column so that exact matching and analyzed
# full-text matching can each use their own field.
dt['similar_content'] = dt['similar_question']

# Display the resulting DataFrame (notebook cell output).
dt
3、索引表创建,需注意字段是否要设为索引或关键字,以及需要倒排索引的列名,向量维度等
from elasticsearch import Elasticsearch
import elasticsearch.helpers as es_helpers

# Placeholder: bind IP to the Elasticsearch host before running; the
# self-assignment only succeeds when IP is already defined in the session.
IP = IP
PORT = 9200
es = Elasticsearch([{'host': IP, 'port': PORT}])

# Recreate the index from scratch: remove any existing "faq_test" first.
if es.indices.exists(index="faq_test"):
    es.indices.delete(index='faq_test')

# Per-field mapping definitions.
field_mappings = {
    # Analyzed text used for full-text matching; requires the IK analyzer
    # plugin to be installed on the cluster.
    "similar_content": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_max_word",
    },
    # Keyword field used for exact matching.
    "similar_question": {
        "type": "keyword",
        "index": True,
    },
    # Embedding vector; "dims" is the vector dimension.
    "vec": {
        "type": "dense_vector",
        "dims": 768,
        "index": False,
    },
    "faq_id": {
        "type": "text",
        "index": True,
    },
    "fsimilar_id": {
        "type": "text",
        "index": False,
    },
    "faq_standard": {
        "type": "text",
        "index": True,
    },
}

# Full index definition: shard/replica settings plus the field mappings.
index_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
    },
    "mappings": {
        "properties": field_mappings,
    },
}
es.indices.create(index="faq_test", body=index_body)
#
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'faq_test'}
4、数据更新导入案例,这里以dataframe为例,其他数据格式同理。该方法采用bulk批量写入,操作类型为index:主键(_id)已存在则覆盖更新,不存在则新建,需注意主键字段
# Convert the DataFrame into a list of per-row dicts keyed by column name.
data = dt.to_dict(orient='records')

# One bulk "index" action per row; the row's fsimilar_id becomes the
# document _id and the whole row is stored as the document source.
action = [
    {
        '_op_type': 'index',
        '_index': 'faq_test',
        '_id': record['fsimilar_id'],
        '_source': record,
    }
    for record in data
]
es_helpers.bulk(es, action)
5、索引表信息查看
from prettyprinter import cpprint

# Method 1: query the _cat API over HTTP. The command below is an
# IPython/Jupyter shell escape, so it is kept as a comment here; run it in a
# notebook cell (or drop the leading "!" and run it in a shell).
# !curl '{IP}:9200/_cat/indices?v'

# Method 2: list every index together with its aliases.
all_indices = es.indices.get_alias("*")
cpprint(all_indices)

# Method 3: document count plus the full definition of a single index.
index_num = es.count(index='faq_test')['count']
# Fixed: the original printed the undefined name `num`, raising NameError.
print(index_num)
index_info = es.indices.get(index="faq_test")
cpprint(index_info)
6、数据搜索查询
a、通过主键查询,get方法通过主键返回对应数据
# Fetch a single document by its primary key (_id), returning only the
# listed source fields.
es.get(
    index="faq_test",
    id='1002',
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
# Output:
# {'_index': 'faq_test', '_type': '_doc', '_id': '1002', '_version': 1,
#  '_seq_no': 1, '_primary_term': 1, 'found': True,
#  '_source': {'fsimilar_id': 1002, 'faq_standard': ' 如何学习编程',
#              'similar_content': '学习编程的步骤是什么', 'faq_id': 8001}}
b、完全匹配,term方法类似sql的“=”,terms类似sql的in
# Exact match on the keyword field.
query = {
    "term": {
        "similar_question": "如何入门编程",
    },
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res
# Output:
# {'took': 7, 'timed_out': False,
#  '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
#  'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 4.1053944,
#           'hits': [{'_index': 'faq_test', '_type': '_doc', '_id': '1003',
#                     '_score': 4.1053944,
#                     '_source': {'fsimilar_id': 1003,
#                                 'faq_standard': ' 如何学习编程',
#                                 'similar_content': '如何入门编程',
#                                 'faq_id': 8001}}]}}
c、相似匹配,这里直接返回BM25得分最高的数据
# Full-text match on the analyzed field; the first three hits are shown.
query = {
    "match": {
        "similar_content": "商品怎么还没有发货",
    },
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res['hits']['hits'][:3]
# Output:
# [{'_index': 'faq_test', '_type': '_doc', '_id': '1065', '_score': 5.111511,
#   '_source': {'fsimilar_id': 1065, 'faq_standard': '商品几天后发货',
#               'similar_content': '商品何时开始处理发货', 'faq_id': 8013}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1053', '_score': 5.0239367,
#   '_source': {'fsimilar_id': 1053, 'faq_standard': '购买的商品质量有问题怎么办',
#               'similar_content': '买到的商品存在缺陷怎么申请退换', 'faq_id': 8011}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1063', '_score': 4.578239,
#   '_source': {'fsimilar_id': 1063, 'faq_standard': '商品几天后发货',
#               'similar_content': '购买商品多长时间安排发货', 'faq_id': 8013}}]
# Full-text match restricted to a set of faq_id values: both clauses sit in
# "must", so a hit has to satisfy each of them.
text_clause = {"match": {"similar_content": "怎么学习"}}
faq_clause = {"terms": {"faq_id": ['8012', '8001']}}
query = {
    "bool": {
        "must": [text_clause, faq_clause],
    },
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
res['hits']['hits'][:3]
# Output:
# [{'_index': 'faq_test', '_type': '_doc', '_id': '1001', '_score': 4.7467327,
#   '_source': {'fsimilar_id': 1001, 'faq_standard': ' 如何学习编程',
#               'similar_content': '怎样开始学习编程', 'faq_id': 8001}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1002', '_score': 4.310314,
#   '_source': {'fsimilar_id': 1002, 'faq_standard': ' 如何学习编程',
#               'similar_content': '学习编程的步骤是什么', 'faq_id': 8001}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1004', '_score': 4.128132,
#   '_source': {'fsimilar_id': 1004, 'faq_standard': ' 如何学习编程',
#               'similar_content': '编程学习的最佳途径是什么', 'faq_id': 8001}}]
d、相似匹配加向量余弦相似度排序,注意这里查询所用的embedding需和入库时调用的embedding服务为同一个模型,返回结果根据余弦相似度降序排列
text = "商品怎么还没有发货"
# Embed the query text through the BertClient instance BC.
vec = BC.encode([text])[0]

# Candidate set: full-text match on the query text, limited to given faq_ids.
candidate_query = {
    "bool": {
        "must": [
            {"match": {"similar_content": text}},
            {"terms": {"faq_id": ['8018', '8013']}},
        ]
    }
}

# Re-score every candidate by cosine similarity between the query vector and
# the stored "vec" field.
# NOTE(review): script_score rejects negative scores; the Elasticsearch docs
# add 1.0 to cosineSimilarity for that reason. Consider doing the same if
# negative similarities are possible with this model.
query = {
    "script_score": {
        "query": candidate_query,
        "script": {
            "source": "cosineSimilarity(params.vec, \u0027vec\u0027)",
            "params": {
                "vec": vec,
            },
        },
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
# Alternative views of the result (disabled):
# [i['_source']['faq_standard'] for i in res['hits']['hits']]
# [i['_score'] for i in res['hits']['hits']]
res['hits']['hits'][:3]
# Output:
# [{'_index': 'faq_test', '_type': '_doc', '_id': '1065', '_score': 0.8778344,
#   '_source': {'fsimilar_id': 1065, 'faq_standard': '商品几天后发货',
#               'similar_content': '商品何时开始处理发货', 'faq_id': 8013}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1063', '_score': 0.8663627,
#   '_source': {'fsimilar_id': 1063, 'faq_standard': '商品几天后发货',
#               'similar_content': '购买商品多长时间安排发货', 'faq_id': 8013}},
#  {'_index': 'faq_test', '_type': '_doc', '_id': '1062', '_score': 0.8555895,
#   '_source': {'fsimilar_id': 1062, 'faq_standard': '商品几天后发货',
#               'similar_content': '付款成功后多少天发货', 'faq_id': 8013}}]
e、正则查询,该方法可用于搜索推荐
text = '如何'
# Prefix-style pattern: the typed text followed by anything.
text_left = f"{text}.*"

# NOTE(review): similar_content is mapped as an analyzed text field in this
# file's index definition, and regexp queries match indexed terms -- confirm
# that per-token matching is the intended behaviour for search suggestions.
regexp_clause = {
    "regexp": {
        "similar_content": {
            "value": text_left,
            "flags": "ALL",
            "max_determinized_states": 10000,
            "rewrite": "constant_score",
        }
    }
}
include_clause = {"terms": {"faq_id": ["8001", "8002"]}}
exclude_clause = {"terms": {"faq_id": ["8018", "8019"]}}

# A hit must match the pattern and an included faq_id, and must not carry an
# excluded faq_id.
query = {
    "bool": {
        "must": [regexp_clause, include_clause],
        "must_not": [exclude_clause],
    }
}
res = es.search(
    index="faq_test",
    query=query,
    size=10,
    _source=["similar_content", "faq_id", "fsimilar_id", "faq_standard"],
)
# Print the standard question attached to each hit.
print([hit['_source']['faq_standard'] for hit in res['hits']['hits']])
['如何学习编程', '如何提高英语口语能力', '如何提高英语口语能力']