elasticsearch查询之大数据集分页性能分析
一、测试环境
python 3.7
elasticsearch 6.8
elasticsearch-dsl 7
安装elasticsearch-dsl
pip install elasticsearch-dsl
测试elasticsearch连通性
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
# Connect to the local node; credentials are passed per request via params().
client = Elasticsearch(hosts=['http://127.0.0.1:9200'])
s = Search(using=client, index="my_store_index").query("match_phrase_prefix", name="us")
# Only the `id` field is kept in each hit's _source.
s = s.source(['id'])
s = s.params(http_auth=["test", "test"])
response = s.execute()
for hit in response:
    # `name` is filtered out of _source above, so print the field that is
    # actually returned (`id`); accessing hit.name would raise AttributeError.
    print(hit.meta.score, hit.id)
11.642133 945d0426-033e-4a8a-86db-b776c6c9a082
11.642133 3c1aead4-aa6f-4256-a126-f29f84c9ac89
11.642133 77782add-ab58-4eb6-85af-bcbe79be9623
11.642133 75a02b9a-be31-4a78-a3d9-9af72f98cbf9
11.642133 d5aacf16-61fc-4f0c-b05d-3d57c8ab6236
11.642133 30912e1d-4662-4f24-bd5b-5a997e44c290
11.642133 95c28501-66a6-4786-917b-0f1e38707648
11.642133 605f4e11-08c8-4d60-b803-7925cf325cea
11.642133 5dd93a29-e75c-44e3-9f26-bd90e588bc1d
11.642133 84e97af5-4e99-466f-bd82-10cd2b79aa18
二、from + size一次性返回大量数据性能测试
通过以下code,直接使用from + size一次性返回100000条记录,耗时17279ms(注意:index.max_result_window默认为10000,需要先将其调大到不小于100000,否则该请求会被elasticsearch拒绝);
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
def from_size_query(client):
    """Fetch the first 100000 matching documents with a single from + size request.

    The query excludes documents whose `name` matches the phrase prefix "us"
    and returns only the `id` field. The total hit count and the `took` value
    of the response are printed.

    Args:
        client: Elasticsearch client used to execute the search.
    """
    exclude_us = Q('bool', must_not=[Q('match_phrase_prefix', name='us')])
    search = (
        Search(using=client, index="my_store_index")
        .params(http_auth=["test", "test"], request_timeout=50)
        .query(exclude_us)
        .source(['id'])
    )
    # Slicing sets from=0 and size=100000 on the request.
    response = search[0:100000].execute()
    print(f'hit total {response.hits.total}')
    print(f'request time {response.took}ms')
# Benchmark entry point: run one from + size request against the local node.
client = Elasticsearch(
    hosts=['http://127.0.0.1:9200'],
)
from_size_query(client=client)
hit total 485070
request time 17279ms
三、使用search after分页返回大量数据性能测试
通过以下code,使用search_after分多次共返回100000条记录;从执行结果可以看到,当每页获取的记录数达到5000以上时,总耗时基本不再变化;考虑到size增大对cpu和内存的影响,在本测试数据的情况下,size设置为3000或者4000比较合适;
def search_after_query(client, result):
    """Page through the index with search_after and accumulate the `took` times.

    The query excludes documents whose `name` matches the phrase prefix "us",
    returns only the `id` field and sorts by `id`; the last sort value of each
    page is used as the cursor for the next page.

    Args:
        client: Elasticsearch client used to execute the searches.
        result: Mutable dict updated in place. Keys read/written here:
            "size"        - page size of every request (read).
            "after_value" - optional cursor for the first request; None
                            means start from the beginning (read).
            "times"       - remaining request budget, decremented once per
                            request (read/write).
            "total"       - running sum of `response.took` (read/write).

    At least one request is always issued. Paging stops when a page comes
    back short (or empty) or when the request budget is used up.
    """
    s = Search(using=client, index="my_store_index")
    s = s.params(http_auth=["test", "test"], request_timeout=50)
    s = s.query(Q('bool', must_not=[Q('match_phrase_prefix', name='us')]))
    s = s.source(['id'])
    s = s[:result['size']]
    s = s.sort('id')
    # Compare against None explicitly: a falsy cursor such as 0 or "" is
    # still a valid sort value and must not be dropped.
    if result['after_value'] is not None:
        s = s.extra(search_after=[result['after_value']])

    while True:
        response = s.execute()
        result['total'] += response.took
        result['times'] -= 1
        fetched = len(response.hits)
        # An empty page has no last hit to take a cursor from; a short page
        # means the result set is exhausted.
        if fetched == 0 or fetched != result['size'] or result['times'] <= 0:
            break
        last_sort_value = response.hits.hits[-1].sort[-1]
        s = s.extra(search_after=[last_sort_value])
# Each (request count, page size) pair asks for 100000 records in total.
client = Elasticsearch(hosts=['http://127.0.0.1:9200'])
for times, page_size in (
    (100, 1000),
    (50, 2000),
    (25, 4000),
    (20, 5000),
    (10, 10000),
    (5, 20000),
    (2, 50000),
):
    result = {"total": 0, "times": times, "size": page_size, "after_value": None}
    search_after_query(client, result)
    print(f'size {result["size"]} request {times} times total {result["total"]}ms ')
size 1000 request 100 times total 14111ms
size 2000 request 50 times total 11987ms
size 4000 request 25 times total 11167ms
size 5000 request 20 times total 10589ms
size 10000 request 10 times total 9930ms
size 20000 request 5 times total 9978ms
size 50000 request 2 times total 9946ms
四、使用scroll分页返回大量数据性能测试
通过以下code,使用scroll分多次共取回100000条记录;从执行结果可以看到,使用不同的size获取数据时,总耗时变化不大,且整体比search_after慢;另外scroll需要在服务端维护搜索上下文,elasticsearch官方也不建议将scroll用于实时的深度分页请求;
def search_scroll_query(client, result):
    """Page through the index with the scroll API and accumulate the `took` times.

    The query excludes documents whose `name` matches the phrase prefix "us"
    and returns only the `id` field. The first page comes from the initial
    search; later pages are fetched with client.scroll().

    Args:
        client: Elasticsearch client used for the search, the scroll
            requests and the final clear_scroll call.
        result: Mutable dict updated in place. Keys read/written here:
            "size"  - page size of every request (read).
            "times" - remaining request budget, decremented once per
                      request (read/write).
            "total" - running sum of the `took` values (read/write).
    """
    s = Search(using=client, index="my_store_index")
    s = s.params(request_timeout=50, scroll='1m')
    s = s.query(Q('bool', must_not=[Q('match_phrase_prefix', name='us')]))
    s = s.source(['id'])
    s = s[:result['size']]
    response = s.execute()
    fetch = len(response.hits)
    result['total'] += response.took
    result['times'] -= 1
    scroll_id = response._scroll_id
    try:
        while fetch == result['size'] and result['times'] > 0:
            response = client.scroll(scroll_id=scroll_id, scroll='1m', request_timeout=50)
            # The scroll id may change between pages; always use the latest.
            scroll_id = response['_scroll_id']
            fetch = len(response['hits']['hits'])
            result['total'] += response['took']
            result['times'] -= 1
    finally:
        # Release the server-side scroll context instead of leaving it open
        # until the 1m keep-alive expires. A 404 only means it is already
        # gone. This call is not part of the measured `took` total.
        client.clear_scroll(body={'scroll_id': [scroll_id]}, ignore=(404,))
# Each (request count, page size) pair asks for 100000 records in total.
client = Elasticsearch(hosts=['http://127.0.0.1:9200'], http_auth=["test", "test"])
for times, page_size in (
    (100, 1000),
    (50, 2000),
    (25, 4000),
    (20, 5000),
    (10, 10000),
    (5, 20000),
    (2, 50000),
):
    result = {"total": 0, "times": times, "size": page_size}
    search_scroll_query(client, result)
    print(f'size {result["size"]} request {times} times total {result["total"]}ms ')
size 1000 request 100 times total 16573ms
size 2000 request 50 times total 17678ms
size 4000 request 25 times total 16719ms
size 5000 request 20 times total 16031ms
size 10000 request 10 times total 16008ms
size 20000 request 5 times total 16074ms
size 50000 request 2 times total 14390ms
五、测试总结
通过对以上三种分页方式的性能测试,可以看到对于获取10W条记录级别的数据集,search_after的性能最好;在不考虑其他性能优化的前提下,建议优先考虑此种分页方式;
分类:
elastic search
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 为DeepSeek添加本地知识库
· 精选4款基于.NET开源、功能强大的通讯调试工具
· DeepSeek智能编程
· [翻译] 为什么 Tracebit 用 C# 开发
· 腾讯ima接入deepseek-r1,借用别人脑子用用成真了~