Python 多线程
才10k 左右的PV,每日的交互日志就已经暴增到500M 的级别,XML 真是万恶啊!
为了提高日志处理速度,引入多线程。(如果保持这个趋势,我想Hadoop 的技术储备需要尽快考虑了)
脚本原执行时间43秒,引入5个worker 的多线程laungcher后为12秒,至少提高50%!!
最简单的Python 多线程模型:
#!/usr/bin/env python
# encoding: utf-8
"""
discovery.py

Created by wangchen on 2010-06-25.
Copyright (c) 2010 NetQin. All rights reserved.

Splits a set of report files across a pool of worker threads and applies a
processing function to each file.  Invoked as a script with a single file
or a directory argument.
"""
import sys
import os
import threading
from xml.dom import minidom
from re import match
from time import time, strftime, localtime


class WorkThread(threading.Thread):
    """Worker thread that applies a callable to each item of its slice."""

    def __init__(self, func, targets):
        """Store the callable and the list of targets this thread owns."""
        threading.Thread.__init__(self)
        self._func = func
        self._targets = targets

    def run(self):
        """Report the slice size, then process each target in order."""
        print('\t'.join([self.getName(), str(len(self._targets))]))
        for target in self._targets:
            self._func(target)


def laungcher(thread_num, target, func, pattern=None):
    """Run *func* over *target* using up to *thread_num* worker threads.

    If *target* is a directory, collect every file beneath it (optionally
    filtered by regex *pattern*, matched against the full path), split the
    list into roughly equal slices, process each slice in its own thread,
    and join them all before returning.  Otherwise call func(target)
    directly in the current thread.
    """
    if os.path.isdir(target):
        # Walk the tree in-process instead of shelling out to find(1):
        # no shell quoting/injection issues, and no trailing '\n' left on
        # each path the way popen(...).readlines() produced.
        targets = []
        for dirpath, _dirnames, filenames in os.walk(target):
            for filename in filenames:
                path = os.path.join(dirpath, filename)
                if pattern is None or match(pattern, path):
                    targets.append(path)
        total = len(targets)
        # Ceiling division; the max() guard keeps the window >= 1 so the
        # loop below cannot spin forever when thread_num > total (the old
        # round()-based size could be 0).
        wndsize = max(1, -(-total // thread_num))
        workers = []
        pos = 0
        while pos < total:
            end = pos + wndsize
            worker = WorkThread(func, targets[pos:end])
            worker.start()
            workers.append(worker)
            pos = end
        # Wait for every slice to finish before returning.
        for worker in workers:
            worker.join()
    else:
        func(target)


def proc_report(target):
    """Process a single report file (stub -- parsing not implemented yet)."""
    pass


def main():
    """Entry point: time a run over the path given on the command line."""
    t = time()
    if len(sys.argv) < 2:
        print('Usage: ' + sys.argv[0] + ' <report_file or directory>')
        sys.exit()
    # Match paths like ...2_....xml; dot before "xml" escaped properly.
    laungcher(5, sys.argv[1], proc_report, r'.*2_.*\.xml')
    print('time used: ' + str(time() - t))


if __name__ == '__main__':
    main()