Splitting a large file with aiofiles

import asyncio
import aiofiles
import time
import csv
import os
 
 
 
async def main(out_path, infile, num=1000000):
    """
    :param out_path: output directory
    :param infile: input file
    :param num: number of lines per split file
    :return:
    """
    async with aiofiles.open(infile, "r", encoding="utf-8") as fp:
        basename = os.path.basename(infile).split('.')[0]
        start = time.time()
        contents = await fp.readlines()
        count = 0
        csv_list = []
        file_idx = 0  # keeps the final chunk numbered correctly even for small inputs

        for line in contents:
            if count == 0:
                head = line  # keep the CSV header so later chunks can start with it
            if count % num == 0 and csv_list:
                file_idx = count // num
                file = f"{out_path}/{basename}_{file_idx}.csv"
                print(len(csv_list))
                if file_idx == 1:
                    # the first chunk already contains the header line
                    async with aiofiles.open(file, "w", encoding="utf-8") as fw:
                        await fw.writelines(csv_list)
                else:
                    async with aiofiles.open(file, "w", encoding="utf-8") as fw:
                        await fw.write(head)
                        await fw.writelines(csv_list)
                csv_list = []

            csv_list.append(line)
            count += 1

        if csv_list:  # write whatever is left over
            print(len(csv_list))
            file = f"{out_path}/{basename}_{file_idx + 1}.csv"
            async with aiofiles.open(file, "w", encoding="utf-8") as w:
                if file_idx:  # chunks after the first need the header prepended
                    await w.write(head)
                await w.writelines(csv_list)
        print(f"end1 {time.time() - start}")
        print(count)
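
One caveat about main(): readlines() pulls the whole input file into memory before any chunk is written. When that is too much, the same chunking can be done while streaming the file, since aiofiles file objects support await fp.readline() and async for. A minimal sketch under that assumption; split_streaming is an illustrative name, not part of the original script:

async def split_streaming(out_path, infile, num=1000000):
    """Sketch: split a large CSV without reading it all into memory."""
    basename = os.path.basename(infile).split('.')[0]
    async with aiofiles.open(infile, "r", encoding="utf-8") as fp:
        head = await fp.readline()            # first line is the header
        chunk, file_idx = [], 0
        async for line in fp:                 # stream the remaining lines
            chunk.append(line)
            if len(chunk) == num:
                file_idx += 1
                out = f"{out_path}/{basename}_{file_idx}.csv"
                async with aiofiles.open(out, "w", encoding="utf-8") as fw:
                    await fw.write(head)      # every chunk gets its own header
                    await fw.writelines(chunk)
                chunk = []
        if chunk:                             # flush the remainder
            out = f"{out_path}/{basename}_{file_idx + 1}.csv"
            async with aiofiles.open(out, "w", encoding="utf-8") as fw:
                await fw.write(head)
                await fw.writelines(chunk)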
 
 
def read_csv_sync(out_path, infile, num=1000000):
    """
    :param out_path: output directory
    :param infile: input file
    :param num: number of rows per split file
    :return:
    """
    basename = os.path.basename(infile).split('.')[0]
    start = time.time()
    with open(infile, newline='', encoding='utf-8') as f:
        # strip NUL bytes so csv.DictReader does not choke on them
        reader = csv.DictReader(e.replace('\0', '') for e in f)
        h = reader.fieldnames
        count = 0
        res_lst = []
        file_idx = 0
        for r in reader:
            if count % num == 0 and res_lst:
                file_idx = count // num
                file = f"{out_path}/{basename}_{file_idx}.csv"
                # output chunks must be opened for writing
                with open(file, 'w', newline='', encoding='utf-8') as fw:
                    writer = csv.DictWriter(fw, h)
                    writer.writeheader()
                    writer.writerows(res_lst)
                res_lst = []
            res_lst.append(r)   # buffer the current row
            count += 1
        if res_lst:             # write the remainder
            print(len(res_lst))
            file = f"{out_path}/{basename}_{file_idx + 1}.csv"
            with open(file, 'w', newline='', encoding='utf-8') as w:
                writer = csv.DictWriter(w, h)
                writer.writeheader()
                writer.writerows(res_lst)

        print(f"end2 {time.time() - start}")
 
 
 
 
if __name__ == '__main__':
    o_path = r'D:\常用保存文件\split'
    o_path2 = r'D:\常用保存文件\split2'
    in_file = r'D:\临时文件\test.csv'
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(o_path, in_file))
    loop.close()
    read_csv_sync(o_path2, in_file)
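
A side note on the entry point: the get_event_loop()/run_until_complete()/close() sequence works, but on Python 3.7+ asyncio.run() performs the same setup and teardown in a single call, e.g.:

if __name__ == '__main__':
    o_path = r'D:\常用保存文件\split'
    o_path2 = r'D:\常用保存文件\split2'
    in_file = r'D:\临时文件\test.csv'
    asyncio.run(main(o_path, in_file))    # replaces the manual event-loop handling
    read_csv_sync(o_path2, in_file)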

 

Timing on a test file of 3,852,733 lines, comparing the async split with the synchronous version:

1000000
1000000
1000000
852733
end1 5.248189210891724
3852733
end2 9.213284492492676

  
