Python 多线程
才10k 左右的PV,每日的交互日志就已经暴增到500M 的级别,XML 真是万恶啊!
为了提高日志处理速度,引入多线程。(如果保持这个趋势,我想Hadoop 的技术储备需要尽快考虑了)
脚本原执行时间43秒,引入5个worker 的多线程laungcher后为12秒,至少提高50%!!
最简单的Python 多线程模型:
#!/usr/bin/env python
# encoding: utf-8
"""
discovery.py

Created by wangchen on 2010-06-25.
Copyright (c) 2010 NetQin. All rights reserved.

Splits a set of report files across a pool of worker threads and applies a
processing function to each file.  Invoked as a script with a single file
or a directory argument.
"""
import sys
import os
import threading
from xml.dom import minidom
from re import match
from time import time, strftime, localtime


class WorkThread(threading.Thread):
    """Worker thread that applies a callable to each item of its slice."""

    def __init__(self, func, targets):
        """Store the callable and the list of targets this thread owns."""
        threading.Thread.__init__(self)
        self._func = func
        self._targets = targets

    def run(self):
        """Report the slice size, then process each target in order."""
        print('\t'.join([self.getName(), str(len(self._targets))]))
        for target in self._targets:
            self._func(target)


def laungcher(thread_num, target, func, pattern=None):
    """Run *func* over *target* using up to *thread_num* worker threads.

    If *target* is a directory, collect every file beneath it (optionally
    filtered by regex *pattern*, matched against the full path), split the
    list into roughly equal slices, process each slice in its own thread,
    and join them all before returning.  Otherwise call func(target)
    directly in the current thread.
    """
    if os.path.isdir(target):
        # Walk the tree in-process instead of shelling out to find(1):
        # no shell quoting/injection issues, and no trailing '\n' left on
        # each path the way popen(...).readlines() produced.
        targets = []
        for dirpath, _dirnames, filenames in os.walk(target):
            for filename in filenames:
                path = os.path.join(dirpath, filename)
                if pattern is None or match(pattern, path):
                    targets.append(path)
        total = len(targets)
        # Ceiling division; the max() guard keeps the window >= 1 so the
        # loop below cannot spin forever when thread_num > total (the old
        # round()-based size could be 0).
        wndsize = max(1, -(-total // thread_num))
        workers = []
        pos = 0
        while pos < total:
            end = pos + wndsize
            worker = WorkThread(func, targets[pos:end])
            worker.start()
            workers.append(worker)
            pos = end
        # Wait for every slice to finish before returning.
        for worker in workers:
            worker.join()
    else:
        func(target)


def proc_report(target):
    """Process a single report file (stub -- parsing not implemented yet)."""
    pass


def main():
    """Entry point: time a run over the path given on the command line."""
    t = time()
    if len(sys.argv) < 2:
        print('Usage: ' + sys.argv[0] + ' <report_file or directory>')
        sys.exit()
    # Match paths like ...2_....xml; dot before "xml" escaped properly.
    laungcher(5, sys.argv[1], proc_report, r'.*2_.*\.xml')
    print('time used: ' + str(time() - t))


if __name__ == '__main__':
    main()