
12 Python asyncio Concurrent Programming

Event loop

asyncio is Python's all-in-one solution for asynchronous IO programming.

Running 10 time-consuming requests

import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)  # simulate a 2-second IO wait
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://www.imooc.com") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))  # the 10 coroutines run concurrently
    print(time.time() - start_time)  # roughly 2 seconds instead of 20
    loop.close()

The output is as follows

 

Getting the return value of a coroutine

1. Create a task.

2. Call task.result() to get the coroutine's return value.

import asyncio
import time

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))  # wrap the coroutine in a Task
    loop.run_until_complete(task)
    print(task.result())  # the coroutine's return value

  

The output is as follows

 

Running a callback after successful execution

You can register a callback with task.add_done_callback(callback). This only accepts the name of a callback function (asyncio passes it the finished future), so you cannot hand it extra arguments directly; to pass arguments, use functools.partial.

 

# Get the coroutine's return value and run a callback when the task is done
import asyncio
import time
from functools import partial


async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    return "zhangbiao"

def callback(url, future):
    # the argument bound by partial comes first; asyncio always passes the
    # finished future (the task) as the last positional argument
    print(url)
    print("send email to bobby")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_html("http://www.imooc.com"))
    task.add_done_callback(partial(callback, "http://www.imooc.com"))
    loop.run_until_complete(task)
    print(task.result())

  

The output is as follows

The difference between wait and gather

Both can submit multiple tasks to the event loop.

gather is more high-level than wait:

  1. It can group tasks.

  2. It can cancel tasks, as sketched below.
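For point 2, here is a minimal sketch (my own, reusing the get_html coroutine from the examples above) of cancellation with gather: cancelling the future returned by gather cancels every task inside that group. Run it and press Ctrl+C while it is sleeping.

import asyncio

async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    group1 = asyncio.gather(*[get_html("http://projectsedu.com") for i in range(2)])
    group2 = asyncio.gather(*[get_html("http://www.imooc.com") for i in range(2)])
    try:
        loop.run_until_complete(asyncio.gather(group1, group2))
    except KeyboardInterrupt:
        # cancelling a gather group cancels every task it contains
        group1.cancel()
        group2.cancel()
        loop.stop()
        loop.run_forever()  # let the loop process the cancellations
    finally:
        loop.close()

The grouping comparison between wait and gather: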

import asyncio
import time
async def get_html(url):
    print("start get url")
    await asyncio.sleep(2)
    print("end get url")

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # wait version:
    # tasks = [get_html("http://www.imooc.com") for i in range(10)]
    # loop.run_until_complete(asyncio.wait(tasks))

    # gather version: the coroutines can be grouped
    group1 = [get_html("http://projectsedu.com") for i in range(2)]
    group2 = [get_html("http://www.imooc.com") for i in range(2)]
    group1 = asyncio.gather(*group1)
    group2 = asyncio.gather(*group2)
    loop.run_until_complete(asyncio.gather(group1, group2))
    print(time.time() - start_time)

  

The output is as follows

 

Task cancellation and how sub-coroutine calls work

While the program is running, press Ctrl+C to interrupt it; the handler then calls task.cancel() on each task to cancel it.

 

import asyncio
import time

async def get_html(sleep_times):
    print("waiting")
    await asyncio.sleep(sleep_times)
    print("done after {}s".format(sleep_times))


if __name__ == "__main__":
    task1 = get_html(2)
    task2 = get_html(3)
    task3 = get_html(3)

    tasks = [task1, task2, task3]

    loop = asyncio.get_event_loop()

    try:
        loop.run_until_complete(asyncio.wait(tasks))
    except KeyboardInterrupt as e:
        # note: in Python 3.7+ this can be written as asyncio.all_tasks()
        all_tasks = asyncio.Task.all_tasks()
        for task in all_tasks:
            print("cancel task")
            print(task.cancel())  # returns True if the task was still pending
        loop.stop()
        loop.run_forever()
    finally:
        loop.close()

 

In the terminal:

python ceshi.py
# after the program starts running, press Ctrl+C to cancel the tasks

 

The output is as follows

 

How a sub-coroutine call works (principle diagram)

An official example is as follows.
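Roughly, the coroutine-chaining sample from the asyncio documentation (reproduced here from memory) looks like this; print_sum plays the role of the delegating coroutine and compute the sub-coroutine referred to in the steps below.

import asyncio

async def compute(x, y):
    print("Compute %s + %s ..." % (x, y))
    await asyncio.sleep(1.0)  # the sub-coroutine suspends here
    return x + y

async def print_sum(x, y):
    result = await compute(x, y)  # delegate to the sub-coroutine
    print("%s + %s = %s" % (x, y, result))

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(print_sum(1, 2))
    loop.close()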

 

From the principle diagram we can see:

  1. While the event loop is running, the Task is in the pending (waiting) state and hands control over to the delegating generator print_sum.

  2. The delegating generator print_sum sets up a two-way channel between the Task and the sub-generator; it calls the sub-generator compute and passes values through to it.

  3. The sub-generator compute reports its current state, suspended, back to the Task through that two-way channel, and the Task tells the loop that the work is not finished yet.

  4. The loop keeps polling the Task, and the Task uses the two-way channel to check whether the sub-generator has finished.

  5. When the sub-generator finishes, it raises an exception (StopIteration) carrying the computed value up to the delegating generator, and the sub-generator is closed.

  6. The delegating generator re-raises that exception to the Task, and the Task is finished and closed.

  7. The loop stops.

 

call_soon, call_at, call_later, call_soon_threadsafe

call_soon: schedule a callback to run as soon as possible, on the next iteration of the event loop.

call_at: schedule a callback to run at a given absolute time on the loop's internal clock (loop.time()).

call_later: schedule a callback to run after a given delay in seconds.

call_soon_threadsafe: the thread-safe variant of call_soon, used to schedule a callback into the loop from another thread.
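call_soon_threadsafe is only hinted at in the example below, so here is a minimal separate sketch (my own; the helper name shutdown_later is made up) of scheduling loop.stop() from another thread, which is exactly what call_soon_threadsafe is for.

import asyncio
import threading
import time

def shutdown_later(loop, delay):
    # runs in a worker thread, not in the event-loop thread
    time.sleep(delay)
    loop.call_soon_threadsafe(loop.stop)  # the thread-safe way to touch the loop

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    threading.Thread(target=shutdown_later, args=(loop, 2)).start()
    loop.call_soon(print, "loop started")
    loop.run_forever()  # stops roughly 2 seconds later
    loop.close()

The call_soon / call_at / call_later example: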

import asyncio

import time


def callback(msg, loop):
    print("success time {}".format(msg))

def stoploop(sleep_time, loop):
    time.sleep(sleep_time)
    loop.stop()


# call_soon, call_at, call_later
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.call_soon(callback, "called as soon as the loop starts", loop)
    now = loop.time()  # the loop's internal (monotonic) clock
    loop.call_at(now + 2, callback, 2, loop)
    loop.call_at(now + 1, callback, 1, loop)
    loop.call_at(now + 3, callback, 3, loop)
    loop.call_later(6, callback, "run 6s after the loop starts", loop)
    # loop.call_soon_threadsafe(stoploop, 6, loop)
    loop.run_forever()

 

The output is as follows

 

ThreadPoolExecutor and asyncio for blocking IO requests

Integrating a thread pool into asyncio to handle time-consuming IO.

Writing synchronous, blocking calls inside a coroutine is normally something to avoid, but sometimes there is no choice: some interfaces are simply synchronous and slow.

In that case we can integrate a thread pool into asyncio and hand the blocking call to it via loop.run_in_executor:

    executor = ThreadPoolExecutor(3)
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        task = loop.run_in_executor(executor, get_url, url)
        tasks.append(task)
    loop.run_until_complete(asyncio.wait(tasks))

  

The complete code is as follows

# Using a thread pool: integrating blocking IO into coroutines
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse


def get_url(url):
    # fetch the html over a raw socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # create the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocking here, but it does not burn CPU

    # with a non-blocking socket we would have to keep polling in a while loop
    # to check whether the connection is ready, doing computation or issuing
    # other requests in the meantime

    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))

    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break

    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()


if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor(3)
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        task = loop.run_in_executor(executor, get_url, url)
        tasks.append(task)
    loop.run_until_complete(asyncio.wait(tasks))
    print("last time:{}".format(time.time()-start_time))

  

The output is as follows

 

You can also skip the thread-pool integration and simply put async in front of the function, so the synchronous code is written in an asynchronous style (note that the blocking socket calls inside still block the event loop).

# Without a thread pool: the blocking IO written directly inside an async function
import asyncio
from concurrent.futures import ThreadPoolExecutor
import socket
from urllib.parse import urlparse
import time

async def get_html(url):
    # fetch the html over a raw socket
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # create the socket connection
    client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # client.setblocking(False)
    client.connect((host, 80))  # blocking here, but it does not burn CPU

    # with a non-blocking socket we would have to keep polling in a while loop
    # to check whether the connection is ready, doing computation or issuing
    # other requests in the meantime

    client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))

    data = b""
    while True:
        d = client.recv(1024)
        if d:
            data += d
        else:
            break

    data = data.decode("utf8")
    html_data = data.split("\r\n\r\n")[1]
    print(html_data)
    client.close()

if __name__ == "__main__":
    start_time = time.time()
    loop = asyncio.get_event_loop()
    tasks = [get_html("http://shop.projectsedu.com/goods/2/") for i in range(10)]
    loop.run_until_complete(asyncio.wait(tasks))
    print(time.time() - start_time)

 

The output is as follows

 

Simulating an HTTP request with asyncio

 

# asyncio itself does not provide an HTTP-protocol API (that is what aiohttp is for); here we simulate the request with asyncio streams
import asyncio
import socket
from urllib.parse import urlparse


async def get_url(url):
    # fetch the html over an asyncio stream connection
    url = urlparse(url)
    host = url.netloc
    path = url.path
    if path == "":
        path = "/"

    # open the connection; asyncio.open_connection returns a (StreamReader, StreamWriter) pair
    reader, writer = await asyncio.open_connection(host, 80)
    writer.write("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host).encode("utf8"))
    all_lines = []
    async for raw_line in reader:
        data = raw_line.decode("utf8")
        all_lines.append(data)
    html = "\n".join(all_lines)
    return html

async def main():
    tasks = []
    for url in range(20):
        url = "http://shop.projectsedu.com/goods/{}/".format(url)
        tasks.append(asyncio.ensure_future(get_url(url)))
    for task in asyncio.as_completed(tasks):  # yields the tasks in the order they finish
        result = await task   # get the finished task's result
        print(result)

if __name__ == "__main__":
    import time
    start_time = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print('last time:{}'.format(time.time()-start_time))

  

The output is as follows

 

Future and Task

 

A Future is a container for a result; once the result is ready, the callback functions registered on it are invoked internally.

Task is a subclass of Future and is what activates (drives) the coroutine.
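A small sketch (my own example) that shows both points: loop.create_task wraps the coroutine in a Task, and because Task is a Future it supports result() and add_done_callback().

import asyncio

async def get_value():
    await asyncio.sleep(1)
    return "zhangbiao"

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    task = loop.create_task(get_value())     # the Task drives (activates) the coroutine
    print(isinstance(task, asyncio.Future))  # True: Task is a subclass of Future
    task.add_done_callback(lambda fut: print("callback got:", fut.result()))
    loop.run_until_complete(task)
    print(task.result())                     # the Future now holds the result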

 

asyncio synchronization and communication

In multithreaded code we have to consider thread safety and add locks; in coroutines this is not needed.

import asyncio

total = 0


async def add():
    global total
    for _ in range(1000000):
        total += 1


async def desc():
    global total
    for _ in range(1000000):
        total -= 1


if __name__ == '__main__':
    tasks = [add(), desc()]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    print(total)

  

The output is as follows

 

In some cases, though, we still need a lock-like mechanism between coroutines.

parse_stuff and use_stuff both call get_stuff. If both of them fire the request at the same time, the site's anti-crawler mechanism may be triggered.

So we want get_stuff to perform the request only once and serve the result from a cache afterwards, which is why the code below uses a lock.

import asyncio
import aiohttp
from asyncio import Lock

cache = {}
lock = Lock()


async def get_stuff(url):
    async with lock:  # equivalent to: with await lock:   (there is also async for ... and similar syntax)
        # async with works here because Lock implements the __aenter__ / __aexit__ magic methods
        # as with threads, you could also call await lock.acquire() and lock.release() at the end
        if url in cache:
            return cache[url]
        print("first request")
        # note: in real code this call should be awaited inside an aiohttp.ClientSession;
        # here it only stands in for "do the expensive request once"
        stuff = aiohttp.request('GET', url)

        cache[url] = stuff
        return stuff


async def parse_stuff(url):
    stuff = await get_stuff(url)
    print('parse_stuff',stuff)
    # do some parse


async def use_stuff(url):
    stuff = await get_stuff(url)
    print('use_stuff',stuff)

    # use stuff to do something interesting


if __name__ == '__main__':
    tasks = [parse_stuff('baidu'), use_stuff('baidu')]
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

  

The output is as follows

 

asyncio communication: Queue

Coroutines all run in a single thread, so they can communicate perfectly well through an ordinary global variable used as a queue. But if you want a queue with a bounded maximum size, you need asyncio.Queue:

Both put and get must be prefixed with await:

from asyncio import Queue

queue = Queue(maxsize=3)

await queue.get()
await queue.put(item)   # item is whatever object you want to enqueue
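A minimal producer/consumer sketch (my own example) with maxsize=3: put suspends once the queue is full and get suspends while it is empty.

import asyncio
from asyncio import Queue

async def producer(queue):
    for i in range(5):
        await queue.put(i)        # suspends once 3 items are waiting
        print("produced", i)

async def consumer(queue):
    for _ in range(5):
        item = await queue.get()  # suspends while the queue is empty
        print("consumed", item)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    queue = Queue(maxsize=3)
    loop.run_until_complete(asyncio.gather(producer(queue), consumer(queue)))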

 

A high-concurrency crawler with aiohttp

# asyncio crawler: crawl, deduplicate, store in the database

import asyncio
import re
import aiohttp
import aiomysql
from pyquery import PyQuery

stopping = False

start_url = 'http://www.jobbole.com'
waitting_urls = []
seen_urls = set()  # at real crawler scale there are too many urls for a set; use a Bloom filter for deduplication


async def fetch(url, session):
    # reuse the ClientSession passed in by the caller instead of creating a new one per request
    try:
        async with session.get(url) as resp:
            print('url status: {}'.format(resp.status))
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)


def extract_urls(html):  # extract all urls from the html
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waitting_urls.append(url)
    return urls


async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)


async def article_handler(url, session, pool):  # fetch the article detail page, parse it and write it to the DB
    html = await fetch(url, session)
    extract_urls(html)
    pq = PyQuery(html)
    title = pq('title').text()  # for simplicity, only grab the title
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute('SELECT 42;')
            insert_sql = "insert into article_test(title) values('{}')".format(
                title)
            await cur.execute(insert_sql)  # insert into the database
            # print(cur.description)
            # (r,) = await cur.fetchone()
            # assert r == 42


async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waitting_urls) == 0:  # with asyncio.Queue we would not need to hand-roll this polling logic
                await asyncio.sleep(0.5)
                continue
            url = waitting_urls.pop()
            print('start get url:{}'.format(url))
            if re.match(r'http://.*?jobbole.com/\d+/', url):
                if url not in seen_urls:  # only handle urls that have not been processed yet
                    asyncio.ensure_future(article_handler(url, session, pool))
            else:
                if url not in seen_urls:
                    asyncio.ensure_future(init_urls(url, session))


async def main(loop):
    # wait for the mysql connection pool to be ready
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306, user='root',
                                      password='', db='aiomysql_test', loop=loop, charset='utf8', autocommit=True)
    # charset and autocommit must be set, otherwise Chinese data cannot be written into the database
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
    asyncio.ensure_future(consumer(pool))


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()

 
