aiofiles拆分大文件
import asyncio
import csv
import os
import time

import aiofiles


async def main(out_path, infile, num=1000000):
    """Split a large text/CSV file into chunks of at most *num* lines (async).

    The first line of *infile* is treated as the header and is prepended to
    every chunk after the first (the first chunk already contains it).

    :param out_path: 输出文件路径 (directory that receives the chunk files)
    :param infile: 输入文件 (path of the file to split)
    :param num: 拆分每个文件的大小 (maximum number of lines per chunk)
    :return: None
    """
    basename = os.path.basename(infile).split('.')[0]
    start = time.time()
    async with aiofiles.open(infile, "r", encoding="utf-8") as fp:
        # NOTE: reads the whole file into memory at once; acceptable for a few
        # million lines, but not a streaming approach.
        contents = await fp.readlines()
    head = ""
    count = 0
    # BUG FIX: file_idx was previously assigned only inside the loop's chunk
    # branch, so inputs with <= num lines crashed with NameError at the tail
    # flush below. Initialize it to 0 here.
    file_idx = 0
    csv_list = []
    for line in contents:
        if count == 0:
            head = line  # remember the header row
        if count % num == 0 and csv_list:
            file_idx = count // num
            file = f"{out_path}/{basename}_{file_idx}.csv"
            print(len(csv_list))
            async with aiofiles.open(file, "w", encoding="utf-8") as fw:
                if file_idx != 1:
                    # chunks after the first need the header prepended; the
                    # first chunk already has it as its first buffered line
                    await fw.write(head)
                await fw.writelines(csv_list)
            csv_list = []
        csv_list.append(line)
        count += 1
    if csv_list:
        # flush whatever is left into one final chunk
        print(len(csv_list))
        file = f"{out_path}/{basename}_{file_idx + 1}.csv"
        async with aiofiles.open(file, "w", encoding="utf-8") as w:
            if file_idx:
                # BUG FIX: only prepend the header when this is not the first
                # chunk, otherwise the header would appear twice.
                await w.write(head)
            await w.writelines(csv_list)
    print(f"end1 {time.time() - start}")
    print(count)


def read_csv_sync(out_path, infile, num=1000000):
    """Split a large CSV file into chunks of at most *num* data rows (sync).

    Synchronous counterpart of :func:`main`, built on ``csv.DictReader`` /
    ``csv.DictWriter`` so each chunk carries a proper header row.

    :param out_path: 输出文件路径 (directory that receives the chunk files)
    :param infile: 输入文件 (path of the CSV file to split)
    :param num: 拆分每个文件的大小 (maximum number of data rows per chunk)
    :return: None
    """
    basename = os.path.basename(infile).split('.')[0]
    start = time.time()
    with open(infile, newline='', encoding='utf-8') as f:
        # strip NUL bytes that occasionally appear in exported CSV files,
        # which would otherwise make csv.reader raise
        reader = csv.DictReader(e.replace('\0', '') for e in f)
        h = reader.fieldnames
        count = 0
        file_idx = 0  # BUG FIX: initialize so the tail flush works for small inputs
        res_lst = []
        for r in reader:
            if count % num == 0 and res_lst:
                file_idx = count // num
                file = f"{out_path}/{basename}_{file_idx}.csv"
                # BUG FIX: the original opened the chunk file without a mode
                # (i.e. read-only "r"), which raises on write. Open it for
                # writing, and emit the header row explicitly — DictWriter
                # does not write one by itself.
                with open(file, "w", newline='', encoding='utf-8') as fw:
                    writer = csv.DictWriter(fw, h)
                    writer.writeheader()
                    writer.writerows(res_lst)
                res_lst = []
            # NOTE(review): these two statements were missing from the scraped
            # copy of this script but must have existed (the timing output
            # shows the loop actually chunked rows) — reconstructed here.
            res_lst.append(r)
            count += 1
        if res_lst:
            # flush the remaining rows into one final chunk
            print(len(res_lst))
            file = f"{out_path}/{basename}_{file_idx + 1}.csv"
            with open(file, "w", newline='', encoding='utf-8') as w:
                writer = csv.DictWriter(w, h)
                writer.writeheader()
                writer.writerows(res_lst)
    print(f"end2 {time.time() - start}")


if __name__ == '__main__':
    o_path = r'D:\常用保存文件\split'
    o_path2 = r'D:\常用保存文件\split2'
    in_file = r'D:\临时文件\test.csv'
    # asyncio.run replaces the deprecated get_event_loop/run_until_complete/
    # close sequence (Python 3.7+; the file already requires 3.6+ f-strings).
    asyncio.run(main(o_path, in_file))
    read_csv_sync(o_path2, in_file)

# Test file: 3,852,733 lines — async vs. sync split timing comparison below.
1000000
1000000
1000000
852733
end1 5.248189210891724
3852733
end2 9.213284492492676
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· .NET10 - 预览版1新功能体验(一)