python多线程统计大文件字数并对返回值进行计算
# _*_coding:utf-8_*_
"""Count character frequencies in a large text file using worker threads.

The file is split into byte ranges that end on line boundaries, each range
is counted by a ``MyThread`` worker, and the per-range counts are merged
into one total that is printed together with the elapsed time.
"""
import time
import threading
import configparser
import os
from collections import Counter
from datetime import datetime
from itertools import islice


class MyThread(threading.Thread):
    """Thread that keeps the return value of its target function.

    Attributes:
        func: Callable executed by ``run``.
        args: Positional arguments passed to ``func``.
        result: Return value of ``func``; ``None`` until ``run`` finishes
            successfully.
        error: Exception raised by ``func``, or ``None`` if it did not raise.
    """

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args
        self.result = None
        self.error = None

    def run(self):
        """Call ``func(*args)`` and record its result or its exception."""
        try:
            self.result = self.func(*self.args)
        except Exception as exc:
            # Remember the failure so the caller can detect it after join(),
            # then re-raise so the normal thread traceback is still reported.
            self.error = exc
            raise

    def get_result(self):
        """Return the stored result, or ``None`` if none was produced.

        Call this after ``join()``; before the thread has finished (or if
        ``func`` raised) the value is ``None``.
        """
        return self.result


def word_count(file, start, size, encoding='gbk'):
    """Count how often each character occurs in one byte range of a file.

    Args:
        file: Path of the file to read.
        start: Absolute byte offset at which the range begins.
        size: Number of bytes to read from ``start``.
        encoding: Text encoding used to decode the bytes (default ``gbk``).

    Returns:
        A dict mapping each character (whitespace and newlines included)
        to its number of occurrences in the range.

    Raises:
        UnicodeDecodeError: If the range is not valid text in ``encoding``,
            for example when it starts or ends inside a multi-byte character.
    """
    # The offsets are byte offsets, so the file must be read in binary mode.
    with open(file, 'rb') as fd:
        fd.seek(start, 0)
        raw = fd.read(size)
    # Decode once at the I/O boundary, then count in a single pass.
    return dict(Counter(raw.decode(encoding)))


# Notes on file positioning:
#   tell() returns the current position of the file pointer.
#   seek(p, 0) moves to byte p counted from the start of the file.
#   seek(p, 1) moves p bytes relative to the current position.
#   seek(p, 2) moves p bytes relative to the end of the file.
# Text-mode files only accept a zero offset for modes 1 and 2 and raise
# io.UnsupportedOperation otherwise, so byte-offset arithmetic requires
# opening the file in binary mode.


def chunk_file(file, size=1024 * 1024 * 20):
    """Yield ``(start, length)`` byte ranges that together cover the file.

    Each range is at least ``size`` bytes long (except possibly the last)
    and is extended to the end of the line it stops in. Cutting only after
    a newline byte keeps multi-byte characters intact for encodings in
    which the byte 0x0A never appears inside a character, such as GBK and
    UTF-8. A file without newlines therefore comes back as a single range.

    Args:
        file: Path of the file to split.
        size: Minimum range length in bytes (default 20 MiB).

    Yields:
        Tuples ``(start, length)``; consecutive ranges are contiguous and
        their lengths add up to the file size. An empty file yields nothing.

    Raises:
        SystemExit: If ``file`` does not exist.
        ValueError: If ``size`` is not positive.
    """
    if not os.path.exists(file):
        raise SystemExit("File not exists")
    if size <= 0:
        # A non-positive size would never advance through the file.
        raise ValueError("size must be a positive number of bytes")

    size_count = os.path.getsize(file)
    with open(file, 'rb') as f:
        end = 0
        while end < size_count:
            start = end
            f.seek(start + size)
            # Consume the rest of the current line so the cut lands just
            # after a newline instead of in the middle of a character.
            f.readline()
            # seek() may have moved past the end of the file; clamp to it.
            end = min(f.tell(), size_count)
            yield start, end - start


def main():
    """Read ``conf.ini``, count characters with threads, print the totals."""
    # Read the configuration file.
    config = configparser.ConfigParser()
    config.read('conf.ini')
    # Name of the file to analyse.
    file_name = config.get('info', 'fileName')
    # Maximum number of worker threads alive at the same time.
    thread_num = max(1, config.getint('info', 'threadNum'))

    start_time = datetime.now()

    results = Counter()
    chunks = chunk_file(file_name)
    while True:
        # Take at most thread_num ranges so that no more than thread_num
        # chunks are held in memory at once.
        batch = [
            MyThread(word_count, args=(file_name, start, size))
            for start, size in islice(chunks, thread_num)
        ]
        if not batch:
            break
        # Start the whole batch before joining any thread; joining right
        # after start() would run the workers one after another.
        for worker in batch:
            worker.start()
        for worker in batch:
            worker.join()
        for worker in batch:
            if worker.error is not None:
                raise RuntimeError("word_count worker failed") from worker.error
            results.update(worker.get_result())

    print(dict(results))

    end_time = datetime.now()
    print(end_time - start_time)


if __name__ == '__main__':
    main()
conf.ini
[info]
fileName = D:\files\projects\test\word_deal\result.txt
threadNum = 5
结论:
测试文件 result.txt 大小为 410 MB,电脑配置为 4 核 CPU、8 GB 内存,耗时 92 秒。
如果你也测试了,请在下方让我知道你的测试结果
· Obsidian + DeepSeek:免费 AI 助力你的知识管理,让你的笔记飞起来!
· 分享4款.NET开源、免费、实用的商城系统
· 解决跨域问题的这6种方案,真香!
· 5. Nginx 负载均衡配置案例(附有详细截图说明++)
· Windows 提权-UAC 绕过
2020-02-16 java list的交集,差集,并集
2014-02-16 配置quartz数据源的三种方式
2014-02-16 Spring整合Quartz实现持久化、动态设定时间