# Import packages
import numpy as np
import os
import dask
# Take a look at the file names/format and the number of files
# Collect the names of all entries in the input directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# listing (and anything derived from it) reproducible across runs.
file_list = sorted(os.listdir('train_data'))
print(len(file_list))    # total number of entries found
print(file_list[:100])   # preview of the first 100 names
# Read the files lazily with dask.delayed and save them in batches
# Load every file listed in file_list as a comma-separated text file via
# dask.delayed, and write the loaded arrays to disk in packs of BATCH_SIZE
# files: data_pack/data_pack_0.npy, data_pack/data_pack_1.npy, ...
BATCH_SIZE = 10000  # number of input files stored in one pack

# np.save does not create missing directories; create the output directory
# up front so a full batch is not computed only to fail at save time.
os.makedirs('data_pack', exist_ok=True)


def _save_pack(index, delayed_loads):
    """Compute the pending delayed loads and save them as pack number `index`.

    The Delayed objects are handed to dask.compute directly; wrapping them in
    a NumPy array first is unnecessary. np.save converts the resulting tuple
    of arrays to a single array when writing.
    NOTE(review): this presumably requires all files to share one shape --
    confirm against the data.
    """
    print('saving ... ' + str(index))
    arrays = dask.compute(*delayed_loads)
    np.save('data_pack/data_pack_' + str(index), arrays)


n = 0        # index of the next pack to write
result = []  # delayed loads accumulated for the current pack
for file_name in file_list:
    file_path = os.path.join('train_data', file_name)
    result.append(dask.delayed(np.loadtxt)(file_path, delimiter=','))
    if len(result) == BATCH_SIZE:
        _save_pack(n, result)
        n += 1
        result = []

# Flush the final, partially filled pack (if any files are left over).
if result:
    _save_pack(n, result)