Django 多线程下载图片
example1:
# Timing demo: open the same URLs sequentially and then through a thread pool.
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool
import time

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

urls = [
    'http://www.python.org',
    'http://www.python.org/about/',
    'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
    'http://www.python.org/doc/',
]

# Single-threaded baseline. A list comprehension (not map) so the requests
# are issued eagerly inside the timed region on Python 3 as well.
start = time.time()
results = [urllib2.urlopen(url) for url in urls]
print('Normal: %s' % (time.time() - start))
# Close the responses so the sockets are not left open.
for response in results:
    response.close()

# Thread pool.
start2 = time.time()
# Start 4 workers; with no argument the pool size defaults to the CPU core count.
pool = ThreadPool(4)
try:
    # Run urllib2.urlopen(url) in the worker threads and collect the results.
    results2 = pool.map(urllib2.urlopen, urls)
finally:
    # Always release the worker threads, even if a request raised.
    pool.close()
    pool.join()
print('Thread Pool: %s' % (time.time() - start2))
for response in results2:
    response.close()
example2:
#!/usr/bin/env python
# coding=utf-8
# Label captcha images by renaming them, and bulk-download new captcha images
# with a thread pool.
import os
import random
import requests
try:
    from cStringIO import StringIO  # Python 2
except ImportError:  # Python 3: image bytes need a bytes buffer
    from io import BytesIO as StringIO
from PIL import Image
from multiprocessing.dummy import Pool as ThreadPool


class Labelcode(object):
    """Track labelled/unlabelled images in a directory and download new ones.

    An image counts as labelled when its base name contains an underscore at
    index 2 or later, which is the ``<label>_<name>.jpg`` form produced by
    ``get_label_img`` (labels must be longer than one character).

    Attributes:
        base_path: directory part of ``__file__``.
        unfinished: paths of the images that are not labelled yet.
        label_msg: status dict with keys ``totalCnt``, ``finishedCnt`` and
            ``fname`` (a random unlabelled image, or ``'no img exists'``).
    """

    def __init__(self, path='images'):
        """Scan ``<base_path>/<path>`` and build the initial status dict."""
        self.base_path = os.path.dirname(__file__)
        print('self.base_path %s' % self.base_path)
        img_dir = os.path.join(self.base_path, path)
        # A missing directory is treated as "no images yet"; download_img
        # creates its target directory on demand.
        names = os.listdir(img_dir) if os.path.isdir(img_dir) else []
        total_img = [os.path.join(img_dir, name) for name in names]

        finished = []
        self.unfinished = []
        for img_path in total_img:
            if self._is_labelled(img_path):
                finished.append(img_path)
            else:
                self.unfinished.append(img_path)

        # Current status.
        self.label_msg = {
            'totalCnt': len(total_img),
            'finishedCnt': len(finished),
            'fname': self._pick_unfinished(),
        }

    @staticmethod
    def _is_labelled(img_path):
        """Return True if the file's base name carries a ``<label>_`` prefix."""
        # Check the base name only: an underscore in a parent directory must
        # not mark every image as labelled.
        return os.path.basename(img_path).find('_') > 1

    def _pick_unfinished(self):
        """Return a random unlabelled image path, or the 'no img exists' marker."""
        if self.unfinished:
            return random.choice(self.unfinished)
        return 'no img exists'

    def get_label_img(self, fname=None, result=''):
        """Label a captcha image and return the current status.

        Args:
            fname: path of the image to label; ignored unless it is one of
                ``self.unfinished`` and still exists on disk.
            result: the label text; ignored unless longer than one character.

        Returns:
            ``self.label_msg`` with ``fname`` refreshed to another unlabelled
            image, or to ``'no img exists'`` when none are left.
        """
        print(self.base_path)
        print('---' * 30)
        if (fname in self.unfinished and result and len(result) > 1
                and os.path.exists(fname)):
            stem = os.path.basename(fname).split('.')[0]
            # Rename inside the image's own directory so the file stays in
            # the directory that was scanned.
            new_name = os.path.join(os.path.dirname(fname),
                                    '%s_%s.jpg' % (result, stem))
            os.rename(fname, new_name)
            self.unfinished.remove(fname)
            self.label_msg['finishedCnt'] += 1
        # Guarded pick: labelling the last image must not raise IndexError.
        self.label_msg['fname'] = self._pick_unfinished()
        return self.label_msg

    def download_img(self, img_url=None, web_name=None, cnt=1000):
        """Download ``cnt`` images from ``img_url`` using a thread pool.

        Each response is decoded with PIL, converted to RGB and saved as
        ``<base_path>/images/<web_name>/<idx>.jpg`` with idx starting at 500.

        Args:
            img_url: URL fetched once per image; nothing is done when empty.
            web_name: sub-directory name (stripped and lower-cased).
            cnt: number of images to request.

        Returns:
            dict with ``totalCnt`` (``cnt`` as passed in) and ``finishedCnt``
            (number of images saved successfully).
        """
        res = {'totalCnt': cnt, 'finishedCnt': 0}
        if not img_url:
            return res

        web_name = (web_name or '').strip().lower()
        cnt = int(cnt) if cnt else 0
        # os.path.join keeps the path relative when base_path is '' instead
        # of producing a path under the filesystem root.
        target_dir = os.path.join(self.base_path, 'images', web_name)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        def fetch(job):
            """Download and save one image; return True on success."""
            url, idx = job
            try:
                # Timeout so one stalled request cannot block pool.join().
                raw = StringIO(requests.get(url, timeout=10).content)
                img = Image.open(raw).convert('RGB')
                img.save(os.path.join(target_dir, '%d.jpg' % idx))
                return True
            except Exception as e:
                # Best effort: report the failure and carry on with the rest.
                print(e)
                return False

        # One job per image to download.
        jobs = [(img_url, i + 500) for i in range(cnt)]
        pool = ThreadPool(10)  # 10 worker threads
        try:
            outcomes = pool.map(fetch, jobs)
        finally:
            pool.close()
            pool.join()
        # Count successes from the return values rather than mutating a
        # shared counter from several threads.
        res['finishedCnt'] = sum(outcomes)
        return res


if __name__ == '__main__':
    test = Labelcode()
    url = 'https://passport.360.cn/captcha.php?m=create&app=i360&scene=login&userip=n7ASHVefL%2FAiu7j%2BPntTvQ%3D%3D&level=default&sign=c5d208'
    print(test.download_img(url, '360', 10))
每天一小步,人生一大步!Good luck~