ES RAG向量搜索示例,使用BAAI BGE创建embedding
准备:
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.6.2
7.6.2: Pulling from elasticsearch/elasticsearch
c808caf183b6: Pull complete
d6caf8e15a64: Pull complete
b0ba5f324e82: Pull complete
d7e8c1e99b9a: Pull complete
85c4d6c81438: Pull complete
3119218fac98: Pull complete
914accf214bb: Pull complete
Digest: sha256:59342c577e2b7082b819654d119f42514ddf47f0699c8b54dc1f0150250ce7aa
Status: Downloaded newer image for docker.elastic.co/elasticsearch/elasticsearch:7.6.2
docker.elastic.co/elasticsearch/elasticsearch:7.6.2

What's Next?
  View a summary of image vulnerabilities and recommendations → docker scout quickview docker.elastic.co/elasticsearch/elasticsearch:7.6.2
PS D:\source\pythonProject> pip install elasticsearch
Requirement already satisfied: elasticsearch in d:\python\python312\lib\site-packages (7.6.0)
Requirement already satisfied: urllib3>=1.21.1 in d:\python\python312\lib\site-packages (from elasticsearch) (1.26.18)
进入容器修改配置:
docker exec -it esid bash
cd config/
vi elasticsearch.yml

增加以下配置:
http.cors.enabled: true
http.cors.allow-origin: "*"
discovery.zen.minimum_master_nodes: 1

重启服务:
docker restart esid
查看页面:在浏览器中访问
http://ip:9200
编写代码:
"""Minimal dense_vector demo: index three 5-dimensional vectors and rank them by cosine similarity."""
from elasticsearch import Elasticsearch

# Connect to Elasticsearch using the client's default connection settings.
es = Elasticsearch()

# Index settings and mapping.
index_name = "vector_search_example"
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "embedding": {
                "type": "dense_vector",  # vector field used by the similarity script
                "dims": 5,  # vector dimension; adjust to the vectors actually stored
            },
        }
    },
}

# Create the index only when it is missing so the script can be re-run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)

# Sample documents; the third vector differs noticeably from the first two.
documents = [
    {"title": "Hello World Document", "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]},
    {"title": "Another Document Example", "embedding": [0.2, 0.35, 0.45, 0.55, 0.6]},
    {"title": "Yet Another Hello", "embedding": [0.7, 0.6, 0.5, 0.4, 0.3]},
]

# Document ids are 1-based and fixed, so re-running overwrites the same documents.
for doc_id, doc in enumerate(documents, start=1):
    response = es.index(index=index_name, id=doc_id, body=doc)
    print(f"Indexed document: {response['result']}")

# Newly indexed documents only become searchable after a refresh; without this
# the search below can miss them on a first run.
es.indices.refresh(index=index_name)

# Search for the vectors most similar to the query vector.
query_vector = [0.2, 0.3, 0.4, 0.5, 0.6]
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            # "+ 1.0" keeps the score non-negative (cosine similarity can be negative,
            # and Elasticsearch rejects negative scores).
            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
            "params": {"query_vector": query_vector},
        },
    }
}

response = es.search(index=index_name, body={"query": script_query}, size=2)

# Print the search results.
for hit in response["hits"]["hits"]:
    print(f"Document ID: {hit['_id']}, Score: {hit['_score']}, Title: {hit['_source']['title']}")
返回结果:
Indexed document: updated
Indexed document: updated
Indexed document: updated
Document ID: 2, Score: 1.9982954, Title: Another Document Example
Document ID: 1, Score: 1.9949367, Title: Hello World Document
我们再复杂一点,使用BGE模型进行编码,便于搜索:
"""Index API descriptions as BGE embeddings in Elasticsearch and print the closest ones per query."""
from elasticsearch import Elasticsearch
from FlagEmbedding import FlagModel
from collections import defaultdict
from time import time

# Connect to Elasticsearch using the client's default connection settings.
es = Elasticsearch()

# Index settings and mapping.
index_name = "vector_search_sec_tool"
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "description": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": 768,  # must match the embedding size produced by the model below
            },
        }
    },
}

# Create the index only when it is missing so the script can be re-run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)


def search_sectool_knowledge_base(descriptions, queries=None, top_k=3):
    """Index API descriptions and print the most similar ones for each query.

    Args:
        descriptions: Iterable of dicts shaped like
            ``{"path": str, "methods": {http_method: description}}``.
        queries: Query sentences to search for. Defaults to the two built-in
            demo queries.
        top_k: Number of hits requested and printed per query.
    """
    if queries is None:
        queries = ['搜索告警列表', '查询漏洞']

    # Build the corpus plus a description -> endpoint lookup used when printing hits.
    corpus = []
    index = defaultdict(dict)
    for item in descriptions:
        for method, description in item['methods'].items():
            index[description] = {"method": method, "path": item["path"]}
            corpus.append(description)

    embedder = FlagModel(
        'bge-base-zh-v1.5/',
        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
    )
    # Skip encoding entirely when there is nothing to index.
    corpus_embeddings = embedder.encode(corpus) if corpus else []

    # Store the vectors in Elasticsearch; ids are 1-based positions in the corpus.
    for position, description in enumerate(corpus):
        doc = {
            "description": description,
            "embedding": corpus_embeddings[position].tolist(),  # numpy array -> plain list
        }
        response = es.index(index=index_name, id=position + 1, body=doc)
        print(f"Indexed document: {response['result']}")

    # Newly indexed documents only become searchable after a refresh; without this
    # the queries below can miss them on a first run.
    es.indices.refresh(index=index_name)

    now = time()
    times = 1  # number of repetitions of the whole query set, for rough timing
    for _ in range(times):
        for query in queries:
            # NOTE(review): per the FlagEmbedding docs the retrieval instruction is only
            # prepended by encode_queries(); encode() is kept to preserve existing scores —
            # confirm which one is intended.
            query_embedding = embedder.encode(query).tolist()

            script_query = {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # "+ 1.0" keeps the score non-negative.
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": query_embedding},
                    },
                }
            }

            response = es.search(index=index_name, body={"query": script_query}, size=top_k)

            print("\n\n======================\n\n")
            print("Query:", query)
            print(f"\nTop {top_k} most similar sentences in corpus:")

            for hit in response["hits"]["hits"]:
                description = hit["_source"]["description"]
                score = hit["_score"]
                print(f"{description} (Score: {score:.4f}) ==> {index[description]}")

    print(f"{times} {time() - now} seconds elapsed")


if __name__ == '__main__':
    descriptions = [
        {'path': '/v1/{project_id}/subscriptions/version', 'methods': {'GET': '获取视图订购信息'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/sa/reports', 'methods': {'GET': '分析报管理获取报告列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/siem/alert-rules', 'methods': {'GET': 'corss-workspace智能建模聚合列表接口'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/siem/alert-rules/metrics', 'methods': {'GET': 'cross-workspace智能建模可用模型指标接口'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/alerts/search', 'methods': {'POST': '搜索告警列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/incidents/search', 'methods': {'POST': '搜索事件列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/indicators/search', 'methods': {'POST': '威胁情报列表查询'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/vulnerability/search', 'methods': {'POST': '查询漏洞列表'}},
    ]
    search_sectool_knowledge_base(descriptions)
运行结果:
Indexed document: updated
Indexed document: updated
Indexed document: updated
Indexed document: updated
Indexed document: updated
Indexed document: updated
Indexed document: updated
Indexed document: updated


======================


Query: 搜索告警列表

Top 3 most similar sentences in corpus:
搜索告警列表 (Score: 2.0000) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/alerts/search'}
搜索事件列表 (Score: 1.9030) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/incidents/search'}
威胁情报列表查询 (Score: 1.8769) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/indicators/search'}


======================


Query: 查询漏洞

Top 3 most similar sentences in corpus:
查询漏洞列表 (Score: 1.9688) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/vulnerability/search'}
威胁情报列表查询 (Score: 1.8580) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/indicators/search'}
搜索告警列表 (Score: 1.8370) ==> {'method': 'POST', 'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/alerts/search'}
1 0.060091257095336914 seconds elapsed
还可以继续优化下,将ES数据存储完整:
"""Index API endpoints (path, method, description, BGE embedding) in Elasticsearch and query them."""
from elasticsearch import Elasticsearch
from FlagEmbedding import FlagModel
from collections import defaultdict
from time import time

# Connect to Elasticsearch using the client's default connection settings.
es = Elasticsearch()

# Index settings and mapping.
index_name = "vector_search_example222"
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "path": {"type": "text"},
            "methods": {"type": "object"},
            "description": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": 768,  # assumes FlagModel produces 768-dimensional vectors
            },
        }
    },
}

# Create the index only when it is missing so the script can be re-run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_settings)


def search_sec_knowledge_base(descriptions, queries=None, top_k=5):
    """Index full endpoint records and print the most similar ones for each query.

    Args:
        descriptions: Iterable of dicts shaped like
            ``{"path": str, "methods": {http_method: description}}``.
        queries: Query sentences to search for. Defaults to the four built-in
            demo queries.
        top_k: Number of hits requested and printed per query.
    """
    if queries is None:
        queries = [
            '搜索告警列表',
            '查询漏洞',
            'Someone in a gorilla costume is playing a set of drums.',
            'A cheetah chases prey on across a field.',
        ]

    # One record per (path, method) pair. Keeping the path and method next to each
    # description means two endpoints with identical descriptions keep their own
    # path/method instead of being merged through a description-keyed lookup.
    entries = []
    for item in descriptions:
        for method, description in item['methods'].items():
            entries.append({"path": item["path"], "method": method, "description": description})
    corpus = [entry["description"] for entry in entries]

    embedder = FlagModel(
        'bge-base-zh-v1.5/',
        query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
    )
    # Skip encoding entirely when there is nothing to index.
    corpus_embeddings = embedder.encode(corpus) if corpus else []

    # Store the complete records in Elasticsearch; ids are 1-based corpus positions.
    for position, entry in enumerate(entries):
        doc = {
            "path": entry["path"],
            "methods": {entry["method"]: entry["description"]},
            "description": entry["description"],
            # Convert to a list of plain Python floats so it serializes as JSON.
            "embedding": [float(x) for x in corpus_embeddings[position]],
        }
        response = es.index(index=index_name, id=position + 1, body=doc)
        print(f"Indexed document: {response['result']}")

    # Newly indexed documents only become searchable after a refresh; without this
    # the queries below can miss them on a first run.
    es.indices.refresh(index=index_name)

    now = time()
    times = 1  # number of repetitions of the whole query set, for rough timing
    for _ in range(times):
        for query in queries:
            query_embedding = [float(x) for x in embedder.encode(query)]

            script_query = {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # "+ 1.0" keeps the score non-negative.
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": query_embedding},
                    },
                }
            }

            response = es.search(index=index_name, body={"query": script_query}, size=top_k)

            print("\n\n======================\n\n")
            print("Query:", query)
            print(f"\nTop {top_k} most similar sentences in corpus:")

            for hit in response["hits"]["hits"]:
                source = hit["_source"]
                description = source["description"]
                score = hit["_score"]
                print(f"{description} (Score: {score:.4f}) ==> Path: {source['path']}, Methods: {source['methods']}")

    print(f"{times} {time() - now} seconds elapsed")


if __name__ == '__main__':
    descriptions = [
        {'path': '/v1/{project_id}/subscriptions/version', 'methods': {'GET': '获取视图订购信息'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/sa/reports', 'methods': {'GET': '分析报管理获取报告列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/siem/alert-rules', 'methods': {'GET': 'corss-workspace智能建模聚合列表接口'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/siem/alert-rules/metrics', 'methods': {'GET': 'cross-workspace智能建模可用模型指标接口'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/alerts/search', 'methods': {'POST': '搜索告警列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/incidents/search', 'methods': {'POST': '搜索事件列表'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/indicators/search', 'methods': {'POST': '威胁情报列表查询'}},
        {'path': '/v1/{project_id}/workspaces/{workspace_id}/soc/vulnerability/search', 'methods': {'POST': '查询漏洞列表'}},
    ]
    search_sec_knowledge_base(descriptions)
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· DeepSeek 开源周回顾「GitHub 热点速览」
2022-06-20 挖矿家族Necro使用 动态域名DGA
2022-06-20 RSAC 2022热点议题——勒索软件、AI安全、威胁分析和狩猎,基于机器学习实现内存取证的技术,使用 Volatility 3 + pslist、psscan、pstree、malfind、netscan 等
2021-06-20 SQL注入 绕过WAF 绕过cc防护,没想到爬虫这么有用!!!比代理池还有延迟好用太多。
2021-06-20 SQL注入——WAF绕过
2021-06-20 SQL注入——堆叠注入,很简单,无非就是执行多条sql语句,注意不是所有DB支持
2021-06-20 SQL注入——DNSlog注入,限制有点大
2021-06-20 SQL注入——二次注入,可以用于篡改他人数据,一般代码分析发现,扫描工具是很难的