# Bulk-copy documents from MongoDB into Elasticsearch.
"""Bulk-copy news documents from a MongoDB collection into an Elasticsearch index.

Reads documents page-by-page (500 at a time) from MongoDB and writes them to
Elasticsearch with the bulk helper, using the article URL as the document id.
"""
import math
import time

import pymongo
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Connect to ES; HOST may list several "ip:port" nodes for a cluster.
HOST = ['ip:端口']
es = Elasticsearch(HOST, timeout=3600)

# Connect to MongoDB. NOTE(review): connection string is empty — fill in before running.
client = pymongo.MongoClient("")
db = client["blue_book_news_dev"]["blue_book_news"]

BATCH_SIZE = 500
_index = "ai51_main_prod"


def _build_actions(docs, index):
    """Convert Mongo documents into ES bulk actions.

    Documents lacking a truthy "news_url" are skipped, since that value is
    used as the ES document id.

    :param docs: iterable of dict-like Mongo documents
    :param index: target Elasticsearch index name
    :return: list of bulk action dicts
    """
    actions = []
    for doc in docs:
        url = doc.get("news_url")
        if not url:
            continue
        actions.append({
            "_index": index,
            "_type": "sources",  # legacy mapping type (pre-ES7 style)
            "_id": url,          # article URL as a stable, dedup-friendly id
            "_source": {
                "page_category": None,
                "url": url,
                "article_title": doc.get("title"),
                "article_content": doc.get("content"),
                "publish_time_raw": doc.get("publish_time"),
                "publish_time_nomalized": None,
                "summary": None,
            },
        })
    return actions


def main():
    """Page through the Mongo collection and bulk-index each page into ES."""
    # count() is deprecated in modern pymongo; count_documents is the supported API.
    nums = db.count_documents({})
    print(nums)
    pages = math.ceil(nums / BATCH_SIZE)

    start_time = time.time()
    projection = {
        '_id': False,
        'news_url': True,
        "content": True,
        "title": True,
        "publish_time": True,
    }
    for i in range(pages):
        n = BATCH_SIZE * i
        print("第{}多少个500,第{}条".format(i, n))
        page = list(db.find({}, projection=projection).skip(n).limit(BATCH_SIZE))
        # Accumulate the whole page, then bulk once — the original reset the
        # action list per document and called bulk per single action,
        # defeating batching entirely.
        actions = _build_actions(page, _index)
        if actions:
            helpers.bulk(es, actions)
    end_time = time.time()
    # Original printed start - end, yielding a negative elapsed time.
    print(end_time - start_time)


if __name__ == "__main__":
    main()