分批次处理数据,批处理

 

关于列表分组的基本代码

如果有一个列表, 按固定的每批次个数进行分组:
self.batch_num = 1000
codes = [1, 2, 4, 2, 1, 0, 3, 5, 7]
batch_groups = [codes[idx:idx + self.batch_num] for idx in range(0, len(codes), self.batch_num)]

# 根据分组来分,一共分几组, 比如分5组
self.pool_num = 10
self.batch_num = math.ceil(len(codes)/self.pool_num)
batch_groups = [codes[idx:idx + self.batch_num] for idx in range(0, len(codes), self.batch_num)]

# 打印结果
for idx, item in enumerate(batch_groups):
  logger.info("{}-{}-{}".format(idx, len(item), item))

  

第一种, 使用index

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20


# 定义批处理的条数, 如果是类就初始化为self.batch_times = 2
from loguru import logger

batch_times = 2


def batch_upsert(batches):
    pass


def batch_processor(all_data_list):
    try:
        batches = []
        for idx, value in enumerate(all_data_list):
            print("idx: {}, value: {}".format(idx, value))
            batches.append(value)

            # for循环内正常处理
            if idx % batch_times == 0:
                batch_upsert(batches)
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []

        # for循环外批量处理剩下
        batch_upsert(batches)
        logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
    except Exception as ex:
        logger.exception('批处理异常: {}'.format(ex), ex)


if __name__ == '__main__':
    all_data_list = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(all_data_list)

    # 基本思路, 连续的列表或者数组数据, 索引除尽批次数的条件下,把那个小循环数据插入数据库, 并把那个小循环列表清空
    # 2020-11-27 11:19:26.541 | DEBUG    | __main__:batch_processor:28 - update items: 8-1
    # 2020-11-27 11:19:26.542 | DEBUG    | __main__:batch_processor:28 - update items: 8-2
    # 2020-11-27 11:19:26.542 | DEBUG    | __main__:batch_processor:28 - update items: 8-2
    # 2020-11-27 11:19:26.542 | DEBUG    | __main__:batch_processor:28 - update items: 8-2
    # 2020-11-27 11:19:26.542 | DEBUG    | __main__:batch_processor:36 - update items: 8-1
    # 有个问题是索引是从0开始的, 但是0很特殊, 0%任意数都是0, 因此第一条会单独插入, 因此更改为索引从1开始
    # for idx, value in enumerate(all_data_list, 1):

第二种, 使用 len(batches) == batch_times 判断批次是否已满

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20


# 定义批处理的条数, 如果是类就初始化为self.batch_times = 2
from loguru import logger

batch_times = 2


def batch_upsert(batches):
    pass


def batch_processor(all_data_list):
    try:
        batches = []
        for item in all_data_list:
            print("item: {}".format(item))
            batches.append(item)

            # for循环内正常处理
            if len(batches) == batch_times:
                batch_upsert(batches)
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []

        # for循环外批量处理剩下
        batch_upsert(batches)
        logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))

    except Exception as ex:
        logger.exception('批处理异常: {}'.format(ex), ex)


if __name__ == '__main__':
    all_data_list = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(all_data_list)

    # 基本思路, 连续的列表或者数组数据, 达到每批次的个数时,把那个小循环数据插入数据库, 并把那个小循环列表清空
    # 2020-11-27 11:27:17.194 | DEBUG    | __main__:batch_processor:27 - update items: 8-2
    # 2020-11-27 11:27:17.194 | DEBUG    | __main__:batch_processor:27 - update items: 8-2
    # 2020-11-27 11:27:17.195 | DEBUG    | __main__:batch_processor:27 - update items: 8-2
    # 2020-11-27 11:27:17.195 | DEBUG    | __main__:batch_processor:27 - update items: 8-2
    # 2020-11-27 11:27:17.195 | DEBUG    | __main__:batch_processor:35 - update items: 8-0

第三种,  时间和插入批次双维度

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2019/10/23

import time
def handle_batch_data(batches):
    pass

# 每批插入的个数和默认插入的时间
batch_num = 100
default_interval = 10
batches = []
time_start = time.time()

for msg in [1,2,2,3,2,1,2,4,5,8,7,6,5,2,1]:

    row = (msg)
    batches.append(row)

    # 计算距离上次插入数据库的时间
    current_interval = time.time() - time_start
    print("current_interval: ", current_interval)

    # 如果达到默认插入的数值就插入, 或者实时消费数据超过默认时间自动插入数据
    if (len(batches) == batch_num) or (current_interval > default_interval):

        handle_batch_data(batches)
        batches = []
        time_start = time.time()

# print("批量处理剩余数据", batches)
handle_batch_data(batches)
print("end")

  

完善版本: 封装到类中, 并处理循环外的剩余批次

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from loguru import logger
from ycyj_zhongtai.service_ref.mdb_client import MdbClient


class MyTable(object):
    """
    资金 分钟 Kline 表
    """

    def __init__(self):
        self.min_table = MdbClient().get_multi_mdb('gongshi2_mdb', 'money')['MoneyTable']

    def insert_items(self, data_arr):
        self.min_table.insert_many(data_arr)

    def remove_items(self, where):
        self.min_table.delete_many(where)


class Example():

    def __init__(self):
        self.table = MyTable()
        self.batch_to_db_num = 1000

    def insert_minutes_data(self, tag, data_list):
        """
        批量插入数据库
        :param tag:
        :param data_list:
        :return:
        """
        data_len = len(data_list)
        batches = []
        accumulator = 0
        for item in data_list:
            batches.append(item)
            # for循环内正常处理
            if len(batches) == self.batch_to_db_num:
                self.table.insert_items(batches)
                accumulator += len(batches)
                logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
                batches = []
                time.sleep(0.01)

        # for循环外批量处理剩下
        if batches:
            self.table.insert_items(batches)
            accumulator += len(batches)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))

  
  def update_data(self, tag, data):
        """
        批量插入数据库
        :param tag:
        :param data:
        :return:
        """
        data_len = len(data)
        batches = []
        accumulator = 0
        for item in data:
            batches.append(item)
            # for循环内正常处理
            if len(batches) == self.batch_to_db_num:
                self.table.update_items(batches)
                accumulator += len(batches)
                logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
                batches = []
                time.sleep(0.01)

        # for循环外批量处理剩下
        if batches:
            self.table.update_items(batches)
            accumulator += len(batches)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
def start(self): pass

  

 

posted @ 2019-09-20 16:51  Adamanter  阅读(639)  评论(0编辑  收藏  举报