python多线程统计大文件字数并对返回值进行计算
# -*- coding: utf-8 -*-
"""Count character frequencies in a large text file using worker threads.

The file is split into byte ranges, each range is decoded and counted by a
``MyThread`` worker, and the per-range counts are merged into one total.
The input file name and the number of concurrent workers are read from
``conf.ini`` (section ``info``, options ``fileName`` and ``threadNum``).
"""
import configparser
import os
import threading
import time
from collections import Counter
from datetime import datetime


class MyThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps its return value.

    Args:
        func: Callable executed in the thread.
        args: Positional arguments passed to ``func``.
    """

    def __init__(self, func, args=()):
        super(MyThread, self).__init__()
        self.func = func
        self.args = args
        # Stays None until run() completes without raising.
        self.result = None

    def run(self):
        """Call the wrapped function and store what it returns."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result, or None if ``func`` has not finished
        (or raised). Call ``join()`` first to wait for completion."""
        return self.result


def word_count(file, start, size, encoding='gbk'):
    """Count how often each character occurs in one byte range of a file.

    Despite the name, the unit counted is a single decoded character
    (whitespace and line breaks included), not a whitespace-separated word.

    Args:
        file: Path of the file to read.
        start: Absolute byte offset at which to begin reading.
        size: Maximum number of bytes to read from ``start``.
        encoding: Text encoding of the file (defaults to GBK).

    Returns:
        A dict mapping each character to its number of occurrences.

    Raises:
        UnicodeDecodeError: If the byte range is not valid in ``encoding``,
            e.g. because it starts or ends inside a multi-byte character.
    """
    # The offsets are byte offsets, so the file must be opened in binary
    # mode here too; otherwise the data read would not match the range.
    with open(file, 'rb') as fd:
        fd.seek(start, 0)
        raw = fd.read(size)
    text = raw.decode(encoding)
    return dict(Counter(text))


def chunk_file(file, size=1024 * 1024 * 20):
    """Yield ``(start, length)`` byte ranges that together cover the file.

    Each range is roughly ``size`` bytes long but is extended to the end of
    the current line, so that a range never ends in the middle of a
    multi-byte character. This relies on the byte 0x0A only ever meaning a
    line feed, which holds for ASCII-compatible encodings such as GBK and
    UTF-8. A file without line breaks is returned as a single range.

    Args:
        file: Path of the file to split.
        size: Target range size in bytes; must be positive.

    Yields:
        Tuples ``(start, length)`` of contiguous, non-empty byte ranges.
        Nothing is yielded for an empty file.

    Raises:
        SystemExit: If ``file` does not exist.
        ValueError: If ``size`` is not positive.
    """
    if not os.path.exists(file):
        raise SystemExit("File not exists")
    if size <= 0:
        raise ValueError("size must be a positive number of bytes")

    size_count = os.path.getsize(file)
    # seek(offset, whence): whence 0 = absolute, 1 = relative to the current
    # position, 2 = relative to the end of the file. A non-zero relative
    # seek requires binary mode, otherwise io.UnsupportedOperation is raised.
    with open(file, 'rb') as f:
        end = 0
        while end < size_count:
            start = end
            f.seek(size, 1)
            if f.tell() < size_count:
                # Move forward to the next line boundary.
                f.readline()
            # Clamp so the final range never reaches past the end of file.
            end = min(f.tell(), size_count)
            yield start, end - start


def _merge_batch(batch, totals):
    """Wait for every worker in ``batch``, add its counts to ``totals``,
    then empty ``batch``."""
    for worker in batch:
        worker.join()
        result = worker.get_result()
        if result is None:
            # The worker raised; its traceback was already reported by the
            # threading machinery.
            raise RuntimeError("a word_count worker failed")
        totals.update(result)
    batch.clear()


def count_file(file, thread_num, size=1024 * 1024 * 20):
    """Count characters of ``file`` with at most ``thread_num`` workers at once.

    Workers are started in batches of ``thread_num`` and each batch is merged
    before the next one starts, which also bounds how many chunks are held
    in memory simultaneously.

    Args:
        file: Path of the file to count.
        thread_num: Maximum number of concurrently running workers; values
            below 1 are treated as 1.
        size: Target chunk size in bytes, passed to ``chunk_file``.

    Returns:
        A dict mapping each character to its total number of occurrences.
    """
    limit = max(1, thread_num)
    totals = Counter()
    batch = []
    for start, length in chunk_file(file, size):
        worker = MyThread(word_count, args=(file, start, length))
        worker.start()
        batch.append(worker)
        if len(batch) >= limit:
            _merge_batch(batch, totals)
    _merge_batch(batch, totals)
    return dict(totals)


def main():
    """Read ``conf.ini``, count the configured file and print the result
    followed by the elapsed time."""
    config = configparser.ConfigParser()
    config.read('conf.ini')
    # File to analyse.
    file_name = config.get('info', 'fileName')
    # Number of worker threads allowed to run at the same time.
    thread_num = int(config.get('info', 'threadNum'))

    start_time = datetime.now()
    results = count_file(file_name, thread_num)
    print(results)
    end_time = datetime.now()
    print(end_time - start_time)


if __name__ == '__main__':
    main()
conf.ini
[info]
fileName=D:\files\projects\test\word_deal\result.txt
threadNum=5
结论:
测试文件 result.txt 大小为 410 MB,电脑配置为 4 核 CPU、8 GB 内存,耗时 92 s。
如果你也测试了,请在下方让我知道你的测试结果