用dask并行把大量文本数据读入numpy并分批保存

导入包

import numpy as np
import os
import dask 

看看文件数量和文件名格式

# os.listdir returns names in arbitrary order; sort them so that the
# assignment of files to saved packs is reproducible across runs.
file_list = sorted(os.listdir('train_data'))
# Quick sanity check: how many files there are and what the names look like.
print(len(file_list))
print(file_list[:100])

image

delayed读入并且分批保存

# Number of input files bundled into each saved pack.
BATCH_SIZE = 10000
# Directory that receives the data_pack_<n>.npy files.
OUTPUT_DIR = 'data_pack'

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _save_batch(tasks, index):
    """Compute one batch of delayed ``np.loadtxt`` calls and save the result.

    Args:
        tasks: list of ``dask.delayed`` objects, one per input file.
        index: sequence number of the pack, used in the output file name.
    """
    print('saving ... '+str(index))
    # Pass the delayed objects straight to dask.compute; wrapping them in a
    # NumPy array first is unnecessary. The call returns a tuple with one
    # loaded array per task.
    arrays = dask.compute(*tasks)
    # np.save converts the tuple to an array before writing.
    # NOTE(review): this assumes every file yields the same shape - confirm.
    np.save(os.path.join(OUTPUT_DIR, 'data_pack_'+str(index)), arrays)


n = 0
result = []
for name in file_list:
    # Only build the task here; the file is not read until dask.compute runs.
    y = dask.delayed(np.loadtxt)(os.path.join('train_data', name), delimiter=',')
    result.append(y)
    if len(result) == BATCH_SIZE:
        _save_batch(result, n)
        n += 1
        result = []

# Flush the final, partially filled batch (if any).
if result:
    _save_batch(result, n)
posted @ 2023-02-27 17:29  裏表異体  阅读(78)  评论(0)  编辑  收藏  举报