ElasticSearch scroll pagination
from + size
from + size cannot exceed 10,000, so this approach only suits queries over small result sets; once the total number of hits goes above 10,000 it can no longer be used.
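For comparison, here is a minimal from/size sketch; the host, credentials, index name and the ".xyz" match query are assumptions carried over from the scroll example further down:

from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.199.32'], http_auth=('elastic', 'passwd'), port=9200)
# Plain from/size paging: the third page of 100 hits (offset = (page - 1) * size).
# Only works while from + size <= 10000 (the index.max_result_window setting).
res = es.search(index='fdns_a_2020-05', body={
    "from": 200,
    "size": 100,
    "query": {"match": {"name": ".xyz"}}
})
for hit in res['hits']['hits']:
    print(hit['_source']['name'])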
scroll_id pagination
Queries run through a cursor (the scroll_id), so there is no upper limit on how many hits can be retrieved; it is effectively a server-side pagination mechanism. The example below pages through every matching document and writes one field per line to a file.
from elasticsearch import Elasticsearch


class MyElastic:
    def __init__(self):
        self.es = Elasticsearch(['192.168.199.32'], http_auth=('elastic', 'passwd'), port=9200)

    def query_by_ScrollId(self, index, body):
        with open('es_query_answer.txt', 'w') as fw:
            # The initial search opens the scroll context (kept alive for 5 minutes) and returns the first page.
            res = self.es.search(index=index, doc_type='_doc', scroll='5m', timeout='1m', size=1000, body=body)
            total = res['hits']['total']['value']
            print(f'Total hits matching the query: {total}, paging through them with scroll:')
            cur_length = len(res['hits']['hits'])
            for x in res['hits']['hits']:
                fw.write(x['_source']['name'])
                fw.write('\n')
            print('fetched so far:', cur_length)
            # Use the scroll_id cursor to pull the remaining pages, 1000 hits per call.
            scroll_id = res['_scroll_id']
            while True:
                res = self.es.scroll(scroll_id=scroll_id, scroll='5m')
                hits = res['hits']['hits']
                if not hits:  # no more pages
                    break
                scroll_id = res['_scroll_id']  # the scroll_id may change between calls
                for x in hits:  # write one name per line
                    fw.write(x['_source']['name'])
                    fw.write('\n')
                cur_length += len(hits)
                print('fetched so far:', cur_length)
            # Release the scroll context once iteration is finished.
            self.es.clear_scroll(scroll_id=scroll_id)
es = MyElastic()
body = {  # match query: select documents whose name contains the query text
    "_source": ["tld.subdomain", "tld.domain", "name"],  # only return these fields
    "query": {
        "match": {
            "name": ".xyz"
        }
    }
}
es.query_by_ScrollId('fdns_a_2020-05', body)
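An optional refinement, based on Elasticsearch's documented scroll behavior: sorting by _doc disables relevance sorting and is the most efficient order for scrolling over everything. body_sorted below is just an illustrative variant of the body above:

body_sorted = {
    "_source": ["tld.subdomain", "tld.domain", "name"],
    "query": {"match": {"name": ".xyz"}},
    "sort": ["_doc"]  # iterate in index order instead of by relevance score
}
es.query_by_ScrollId('fdns_a_2020-05', body_sorted)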