ES-es-ElasticSearch:打破默认取回10条,scroll连续取回
import random
import json
import datetime
import time
from pymongo import MongoClient
from pymongo import MongoClient, ASCENDING, UpdateOne, InsertOne, DeleteOne, ReplaceOne
from pymongo.errors import BulkWriteError
from io import BytesIO
import pymysql.cursors
from bson.objectid import ObjectId
from bson import json_util as jsonb
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers
import logging

# The logging module's default level is WARNING
# (CRITICAL > ERROR > WARNING > INFO > DEBUG > NOTSET); it is lowered to DEBUG
# here so that the info() calls below are actually written to the log file.
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(name)s [line:%(lineno)d] %(levelname)s %(message)s",
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename="/root/backend/db/cront.log",
                    filemode="a")

# NOTE(review): hosts and credentials are hard-coded in source. They should be
# moved to environment variables / a secrets store and rotated.
# NOTE(review): `dddd` is not used anywhere in the visible code.
dddd = MongoClient('120.133.26.118:20002', username='xwk', password='495vvFul015dV0cV')
mongo_post = MongoClient('dds-2ze197183eba5c941.mongodb.rds.aliyuncs.com:3717', username='root', password='lyp82nLF')
es = Elasticsearch(['es-cn-xxxx.elasticsearch.aliyuncs.com'], http_auth=('elastic', 'xxxx'), port=9200, timeout=50000)


# TODO: run this as a scheduled (cron) job.
def delete_es_posts():
    """Delete Elasticsearch posts that no longer exist in MongoDB.

    Iterates over every document of the ``goodlook`` index (doc type ``post``)
    with ``helpers.scan`` and looks each hit's ``_id`` up in the MongoDB
    ``admin.post`` collection. When MongoDB returns no document, the
    Elasticsearch entry with that id is deleted.

    The start and end wall-clock times are logged and printed, followed by
    the elapsed run time.

    Exceptions raised by the MongoDB or Elasticsearch clients are not caught
    and propagate to the caller.
    """
    # Monotonic clock for the duration: the formatted timestamps below are
    # strings and cannot be subtracted from each other.
    started_at = time.monotonic()
    current_start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    logging.info("delete posts ok start-%s", current_start)
    print("start:", current_start)

    # A related index setting is {"index": {"max_result_window": 1000000}}.
    query_body = {
        "query": {
            "match_all": {}
        }
    }

    # A plain es.search(index="goodlook", doc_type="post", body=query_body)
    # returns a limited number of hits (10 by default), so a single search
    # call cannot be used to walk the whole index.
    print("---------------------------------------------------------------")

    # Use the scroll-based helper instead: the number of returned hits is not
    # limited. clear_scroll defaults to True.
    _searched = helpers.scan(
        client=es,
        query=query_body,
        scroll='10m',
        index='goodlook',
        doc_type='post',
        timeout='10m',
        size=2000,
        clear_scroll=True
    )

    for search in _searched:
        # Example hit:
        # {'_index': 'goodlook', '_type': 'post', '_id': '5c83779c8443a458eba30749', '_score': None,
        #  '_source': {'color': [], 'items': [], 'scene': 'show', 'year': '2019', 'season': '春夏',
        #              'show_name_en': 'Spring 2019 Menswear', 'show_name_cn': '2019春夏男装系列'}, 'sort': [11206]}
        post_id = search.get("_id")
        one_post = mongo_post['admin']['post'].find_one({"_id": ObjectId(str(post_id))})
        print("es_post_id, one_post:", post_id, one_post)
        if one_post is None:
            # The post is gone from MongoDB, so remove the stale index entry.
            del_ret = es.delete(index="goodlook", doc_type="post", id=post_id)
            print("delete es post:", post_id)
            print("delete es status:", del_ret.get("result"))
        print("---------------------------------------------------------------")

    current_end = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    logging.info("delete posts ok over-%s", current_end)
    print("end:", current_end)
    # Rendered as H:MM:SS (whole seconds).
    elapsed = datetime.timedelta(seconds=round(time.monotonic() - started_at))
    print("sustained time:", elapsed)


delete_es_posts()
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers

# Minimal example: walk every document of an index with the scroll-based
# scan helper instead of a single search call.

# Elasticsearch client (host and credentials are placeholders).
es = Elasticsearch(
    ['es-cn-xxxx.elasticsearch.aliyuncs.com'],
    http_auth=('elastic', 'xxxx'),
    port=9200,
    timeout=50000,
)

# Match every document in the index.
query_body = {"query": {"match_all": {}}}

# Options handed to helpers.scan alongside the client and the query.
_scan_options = {
    "scroll": '10m',
    "index": 'goodlook',
    "doc_type": 'post',
    "timeout": '10m',
    "size": 2000,
    "clear_scroll": True,
}
_searched = helpers.scan(client=es, query=query_body, **_scan_options)

# Consume the iterator; each item is one hit. Nothing is done with the hits.
for _hit in _searched:
    pass