Crawler concurrency

#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Single-threaded version
'''
import os, time
import requests
from bs4 import BeautifulSoup
import uuid

def out_wrapper(func):  # simple decorator that reports the elapsed time
    def inner_wrapper():
        start_time = time.time()
        func()
        stop_time = time.time()
        print('Used time {}'.format(stop_time - start_time))
    return inner_wrapper

def save_flag(img, filename):   # save one image to disk
    os.makedirs('down_photos', exist_ok=True)   # make sure the target directory exists
    path = os.path.join('down_photos', filename)
    with open(path, 'wb') as fp:
        fp.write(img)

def download_one(url):  # download a single image
    image = requests.get(url)
    save_flag(image.content, str(uuid.uuid4()))

def user_conf():    # return the URLs of 30 images
    url = 'https://unsplash.com/'
    ret = requests.get(url)
    soup = BeautifulSoup(ret.text, "html.parser")
    zzr = soup.find_all('img')
    ret = []
    num = 0
    for item in zzr:
        src = item.get("src")   # skip <img> tags that have no src attribute
        if src and src.endswith('80') and num < 30:
            num += 1
            ret.append(src)
    return ret

@out_wrapper
def download_many():
    zzr = user_conf()
    for item in zzr:
        download_one(item)
if __name__ == '__main__':
    download_many()
Single-threaded. The later snippets import out_wrapper, download_one and user_conf from a module named get_photos, so this file is presumably saved as get_photos.py.
Concurrency options:
1. Multiprocessing: multiprocessing.Process, or futures.ProcessPoolExecutor
2. Multithreading: threading.Thread, or futures.ThreadPoolExecutor (via map, or via submit with futures.as_completed)
3. Coroutines: gevent, or asyncio
from multiprocessing import Process
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    task_list = []
    for item in zzr:
        t = Process(target=download_one, args=(item,))
        t.start()
        task_list.append(t)
    print(task_list)
    for t in task_list:
        t.join()    # wait for all processes to finish (so the timing is meaningful)
if __name__ == '__main__':
    download_many()
Multiprocessing: multiprocessing.Process
from concurrent import futures
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    # one worker process per URL; for CPU-bound work a pool sized to os.cpu_count() is more usual
    with futures.ProcessPoolExecutor(len(zzr)) as executor:
        res = executor.map(download_one, zzr)
    return len(list(res))

if __name__ == '__main__':
    download_many()
Multiprocessing (2): futures.ProcessPoolExecutor
import threading
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    task_list = []
    for item in zzr:
        t = threading.Thread(target=download_one, args=(item,))
        t.start()
        task_list.append(t)
    for t in task_list:
        t.join()    # wait for all download threads to finish


if __name__ == '__main__':
    download_many()
Multithreading (1): threading.Thread
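The version above starts one thread per URL. When the URL list is large, a fixed pool of worker threads fed from a queue.Queue is a common alternative; a minimal sketch built on the same get_photos helpers (the worker count of 10 is an arbitrary choice):

import threading
import queue
from get_photos import out_wrapper, download_one, user_conf

def worker(q):
    while True:
        url = q.get()
        if url is None:         # sentinel: no more work for this worker
            break
        try:
            download_one(url)
        except Exception as exc:
            print('failed to download {}: {}'.format(url, exc))
        finally:
            q.task_done()

@out_wrapper
def download_many():
    q = queue.Queue()
    workers = [threading.Thread(target=worker, args=(q,)) for _ in range(10)]
    for t in workers:
        t.start()
    for url in user_conf():
        q.put(url)
    q.join()                    # wait until every queued URL has been processed
    for _ in workers:
        q.put(None)             # tell each worker to exit
    for t in workers:
        t.join()

if __name__ == '__main__':
    download_many()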
from gevent import monkey
monkey.patch_all()
import gevent
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    jobs = [gevent.spawn(download_one, item) for item in zzr]
    gevent.joinall(jobs)


if __name__ == '__main__':
    download_many()
Coroutines: gevent
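gevent.spawn above creates one greenlet per URL. To cap the number of concurrent greenlets, gevent.pool.Pool can be used instead; a minimal sketch (the pool size of 10 is an arbitrary choice):

from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    pool = Pool(10)                     # at most 10 greenlets run at once
    for item in zzr:
        pool.spawn(download_one, item)
    pool.join()                         # wait until every spawned greenlet has finished

if __name__ == '__main__':
    download_many()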
import uuid
import asyncio

import aiohttp
from get_photos import out_wrapper, user_conf, save_flag

async def download_one(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            save_flag(await resp.read(), str(uuid.uuid4()))

@out_wrapper
def download_many():
    urls = user_conf()
    loop = asyncio.get_event_loop()
    to_do = [download_one(url) for url in urls]
    wait_coro = asyncio.wait(to_do)     # note: newer Python versions no longer accept bare coroutines here
    res, _ = loop.run_until_complete(wait_coro)
    loop.close()
    return len(res)


if __name__ == '__main__':
    download_many()
asyncio
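On Python 3.7 and later, asyncio.run and asyncio.gather replace the manual event-loop handling above; a minimal sketch of the same download in that style, still relying on the get_photos helpers:

import uuid
import asyncio

import aiohttp
from get_photos import out_wrapper, user_conf, save_flag

async def download_one(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            save_flag(await resp.read(), str(uuid.uuid4()))

async def download_all(urls):
    # schedule every download and wait for all of them to complete
    await asyncio.gather(*(download_one(url) for url in urls))

@out_wrapper
def download_many():
    urls = user_conf()
    asyncio.run(download_all(urls))

if __name__ == '__main__':
    download_many()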


from concurrent import futures
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    with futures.ThreadPoolExecutor(len(zzr)) as executor:
        res = executor.map(download_one, zzr)
    return len(list(res))

if __name__ == '__main__':
    download_many()
Multithreading: ThreadPoolExecutor with map
from concurrent import futures
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    with futures.ThreadPoolExecutor(len(zzr)) as executor:
        to_do = [executor.submit(download_one, item) for item in zzr]
        ret = [future.result() for future in futures.as_completed(to_do)]
    return ret


if __name__ == '__main__':
    download_many()
Multithreading: submit and futures.as_completed
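futures.as_completed yields futures in completion order, so the result list above does not line up with the URL list. When each result (or failure) needs to be tied back to its URL, the usual pattern is a dict mapping futures to URLs; a minimal sketch:

from concurrent import futures
from get_photos import out_wrapper, download_one, user_conf

@out_wrapper
def download_many():
    zzr = user_conf()
    with futures.ThreadPoolExecutor(len(zzr)) as executor:
        # map each future back to the URL it was created for
        future_to_url = {executor.submit(download_one, item): item for item in zzr}
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                future.result()
            except Exception as exc:
                print('{} failed: {}'.format(url, exc))

if __name__ == '__main__':
    download_many()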

