分批次处理数据(批处理)的几种实现方式
关于列表分组的基本代码
import math
import logging

logger = logging.getLogger(__name__)

# Case 1: split a list into chunks of a fixed size per batch
# (in a class this would be self.batch_num = 1000).
batch_num = 1000
codes = [1, 2, 4, 2, 1, 0, 3, 5, 7]
batch_groups = [codes[idx:idx + batch_num] for idx in range(0, len(codes), batch_num)]

# Case 2: split a list into a fixed number of groups (e.g. pool_num groups):
# derive the per-batch size from the total length, rounding up so no item is lost.
pool_num = 10
batch_num = math.ceil(len(codes) / pool_num)
batch_groups = [codes[idx:idx + batch_num] for idx in range(0, len(codes), batch_num)]

# Print the result: group index, group size, group contents.
for idx, item in enumerate(batch_groups):
    logger.info("{}-{}-{}".format(idx, len(item), item))
第一种, 使用index
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20

# Number of items per batch; in a class this would be self.batch_times = 2.
try:
    from loguru import logger
except ImportError:  # fall back to stdlib logging so the snippet runs anywhere
    import logging
    logger = logging.getLogger(__name__)

batch_times = 2


def batch_upsert(batches):
    """Placeholder for the real bulk upsert (e.g. a DB insert)."""
    pass


def batch_processor(all_data_list):
    """Process *all_data_list* in batches of ``batch_times`` items.

    :param all_data_list: sequence of items to process
    :return: number of batches flushed via :func:`batch_upsert`
             (``None`` if an unexpected error was caught and logged)
    """
    try:
        flushed = 0
        batches = []
        # enumerate starts at 1: with a 0-based index, 0 % batch_times == 0
        # is always true, so the very first item would be flushed alone.
        for idx, value in enumerate(all_data_list, 1):
            print("idx: {}, value: {}".format(idx, value))
            batches.append(value)
            # Flush a full batch inside the loop.
            if idx % batch_times == 0:
                batch_upsert(batches)
                flushed += 1
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []
        # Flush whatever remains after the loop (skip a pointless empty write).
        if batches:
            batch_upsert(batches)
            flushed += 1
            logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
        return flushed
    except Exception as ex:
        logger.exception('批处理异常: {}'.format(ex))


if __name__ == '__main__':
    all_data_list = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(all_data_list)

# Idea: walk a contiguous list with a 1-based index; whenever the index divides
# evenly by the batch size, flush the accumulated mini-list to the DB and clear it.
第二种,使用 len(batches) == batch_times 判断批次是否已满
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20

# Number of items per batch; in a class this would be self.batch_times = 2.
try:
    from loguru import logger
except ImportError:  # fall back to stdlib logging so the snippet runs anywhere
    import logging
    logger = logging.getLogger(__name__)

batch_times = 2


def batch_upsert(batches):
    """Placeholder for the real bulk upsert (e.g. a DB insert)."""
    pass


def batch_processor(all_data_list):
    """Process *all_data_list* in batches of ``batch_times`` items.

    Flushes whenever the accumulator reaches ``batch_times`` elements,
    then once more after the loop for any non-empty remainder.

    :param all_data_list: sequence of items to process
    :return: number of batches flushed via :func:`batch_upsert`
             (``None`` if an unexpected error was caught and logged)
    """
    try:
        flushed = 0
        batches = []
        for item in all_data_list:
            print("item: {}".format(item))
            batches.append(item)
            # Flush a full batch inside the loop.
            if len(batches) == batch_times:
                batch_upsert(batches)
                flushed += 1
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []
        # The original always called batch_upsert([]) here (its own log shows
        # "update items: 8-0") — guard so an empty write is never issued.
        if batches:
            batch_upsert(batches)
            flushed += 1
            logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
        return flushed
    except Exception as ex:
        logger.exception('批处理异常: {}'.format(ex))


if __name__ == '__main__':
    all_data_list = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(all_data_list)

# Idea: append items one by one; when the mini-list reaches the batch size,
# flush it to the DB and clear it; flush the (non-empty) remainder at the end.
第三种, 时间和插入批次双维度
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2019/10/23
import time


def handle_batch_data(batches):
    """Placeholder for the real bulk handler (e.g. a DB insert)."""
    pass


# Defaults: items per batch, and max seconds to wait before a forced flush.
batch_num = 100
default_interval = 10


def consume_in_batches(messages, batch_size=batch_num, flush_interval=default_interval):
    """Flush *messages* in batches, on size OR elapsed-time thresholds.

    A batch is flushed when it reaches *batch_size* items, or when more than
    *flush_interval* seconds have passed since the last flush (useful for a
    slow real-time stream that would otherwise never fill a batch).

    :param messages: iterable of incoming messages
    :param batch_size: flush once this many items are buffered
    :param flush_interval: flush if this many seconds elapse since last flush
    :return: number of flushes performed
    """
    flushes = 0
    batches = []
    time_start = time.time()
    for msg in messages:
        # NOTE(review): the original wrote ``row = (msg)`` — plain parentheses
        # do not create a tuple; use ``(msg,)`` if a 1-tuple row is wanted.
        batches.append(msg)
        # Seconds since the last database flush.
        current_interval = time.time() - time_start
        print("current_interval: ", current_interval)
        if (len(batches) == batch_size) or (current_interval > flush_interval):
            handle_batch_data(batches)
            flushes += 1
            batches = []
            time_start = time.time()
    # Flush the remainder after the stream ends (skip when empty).
    if batches:
        # print("批量处理剩余数据", batches)
        handle_batch_data(batches)
        flushes += 1
    return flushes


if __name__ == '__main__':
    consume_in_batches([1, 2, 2, 3, 2, 1, 2, 4, 5, 8, 7, 6, 5, 2, 1])
    print("end")
完善版本
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time

from loguru import logger

from ycyj_zhongtai.service_ref.mdb_client import MdbClient


class MyTable(object):
    """Wrapper around the money-flow minute K-line Mongo collection."""

    def __init__(self):
        # Resolve the 'MoneyTable' collection from the 'gongshi2_mdb'/'money' DB.
        self.min_table = MdbClient().get_multi_mdb('gongshi2_mdb', 'money')['MoneyTable']

    def insert_items(self, data_arr):
        """Bulk-insert the documents in *data_arr*."""
        self.min_table.insert_many(data_arr)

    def remove_items(self, where):
        """Delete every document matching the *where* filter."""
        self.min_table.delete_many(where)


class Example():

    def __init__(self):
        self.table = MyTable()
        # Number of rows written to the DB per bulk insert.
        self.batch_to_db_num = 1000

    def insert_minutes_data(self, tag, data_list):
        """Insert *data_list* into the DB in chunks of ``batch_to_db_num``.

        :param tag: label used only for log messages
        :param data_list: sequence of documents to insert
        :return: None
        """
        total = len(data_list)
        written = 0
        pending = []
        for row in data_list:
            pending.append(row)
            if len(pending) < self.batch_to_db_num:
                continue
            # A full batch has accumulated — write it out and clear the buffer.
            self.table.insert_items(pending)
            written += len(pending)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(pending), written, total))
            pending = []
            # Brief pause between full batches to avoid hammering the DB.
            time.sleep(0.01)
        # Write whatever is left after the loop.
        if pending:
            self.table.insert_items(pending)
            written += len(pending)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(pending), written, total))
    def update_data(self, tag, data):
        """Batch-write *data* to the DB in chunks of ``batch_to_db_num``.

        NOTE(review): this calls ``self.table.update_items``, but the visible
        ``MyTable`` class only defines ``insert_items``/``remove_items`` —
        confirm ``update_items`` exists on the table wrapper, otherwise this
        raises AttributeError at the first flush.

        :param tag: label used only for log messages
        :param data: sequence of documents to write
        :return: None
        """
        data_len = len(data)
        batches = []
        accumulator = 0
        for item in data:
            batches.append(item)
            # Flush once a full batch has accumulated inside the loop.
            if len(batches) == self.batch_to_db_num:
                self.table.update_items(batches)
                accumulator += len(batches)
                logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
                batches = []
                # Brief pause between full batches to avoid hammering the DB.
                time.sleep(0.01)
        # Flush whatever is left after the loop.
        if batches:
            self.table.update_items(batches)
            accumulator += len(batches)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
def start(self): pass