Single-Thread + Multi-Task Asynchronous Coroutines
1. Basic Concepts
1. Coroutine
- When a function (a "special function") is defined with the async keyword, calling it returns a coroutine object, and the statements in the function body are not executed immediately (see the sketch after this list).
2. Task object
- A task object is a further wrapper around a coroutine object: task object == advanced coroutine object == special function.
- A task object must be registered with an event loop object.
- A callback function can be bound to a task object; in a crawler, the callback is where data parsing happens.
3. Event loop
- Think of it as a container; the container holds task objects.
- Once the event loop object is started, it executes the task objects stored inside it asynchronously.
4. aiohttp
- A module that supports asynchronous network requests.
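A minimal sketch of the first concept above (not part of the original notes, and assuming Python 3.7+ for asyncio.run): calling an async function only builds a coroutine object, and the body runs only once an event loop drives it.

```python
import asyncio


async def demo():
    print("body runs only inside an event loop")
    return 42


c = demo()                  # no output yet: this call only creates a coroutine object
print(type(c))              # <class 'coroutine'>
c.close()                   # discard it cleanly to avoid a "never awaited" warning

print(asyncio.run(demo()))  # asyncio.run starts a loop, runs the body, and returns 42
```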
Coroutines
```python
import asyncio


def callback(task):  # the callback bound to the task object
    # task.result() receives the return value of the special function
    print('i am callback and', task.result())


async def test():
    print("i am test")
    return "123"


c = test()
# wrap the coroutine into a task object
task = asyncio.ensure_future(c)
task.add_done_callback(callback)
# create an event loop object
loop = asyncio.get_event_loop()
loop.run_until_complete(task)

# Output:
# i am test
# i am callback and 123
```
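The get_event_loop/ensure_future pattern above still runs but is considered legacy in recent Python versions. For reference, a sketch of the same flow in the Python 3.7+ style, using asyncio.run and asyncio.create_task:

```python
import asyncio


def callback(task):
    print('i am callback and', task.result())


async def test():
    print("i am test")
    return "123"


async def main():
    # create_task schedules the coroutine on the already-running loop
    task = asyncio.create_task(test())
    task.add_done_callback(callback)
    await task


asyncio.run(main())  # prints "i am test" then "i am callback and 123"
```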
Multiple Tasks
```python
import time
import asyncio

start = time.time()


# code from modules that do not support async must not appear
# inside the special function's implementation
async def get_request(url):
    await asyncio.sleep(2)
    print("download complete:", url)


urls = [
    'www.1.com',
    'www.2.com',
]

tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

loop = asyncio.get_event_loop()
# note: suspension must be handled manually, by wrapping the task list in asyncio.wait
loop.run_until_complete(asyncio.wait(tasks))

print(time.time() - start)

# Output:
# download complete: www.1.com
# download complete: www.2.com
# 2.0021142959594727
```
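To see why the comment about non-async modules matters, here is a sketch of the same program with the blocking time.sleep in place of asyncio.sleep: the event loop cannot switch tasks during a blocking call, so the two downloads run one after another and the total time roughly doubles.

```python
import time
import asyncio

start = time.time()


async def get_request(url):
    time.sleep(2)  # blocking call: the loop cannot suspend here and switch tasks
    print("download complete:", url)


tasks = [asyncio.ensure_future(get_request(u)) for u in ['www.1.com', 'www.2.com']]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print(time.time() - start)  # ~4 s instead of ~2 s: the tasks were serialized
```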
Application in Crawlers
```python
# flask_server.py
from flask import Flask
import time

app = Flask(__name__)


@app.route('/bobo')
def index_bobo():
    time.sleep(2)
    return 'Hello bobo'


@app.route('/jay')
def index_jay():
    time.sleep(2)
    return 'Hello jay'


if __name__ == '__main__':
    app.run(threaded=True)
```

```python
# application_in_crawler.py
import time
import asyncio
import aiohttp

start = time.time()

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
]


async def get_request(url):
    async with aiohttp.ClientSession() as s:
        async with await s.get(url=url) as response:
            page_text = await response.text()
            # print(page_text)
            return page_text


tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

print(time.time() - start)
```
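For contrast, a sketch of the same crawl with the synchronous requests library (assumed to be installed): each call blocks for the full 2 s server delay before the next one starts, so the total is about 4 s instead of ~2 s.

```python
import time
import requests

start = time.time()

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
]

for url in urls:
    # blocks for ~2 s per URL; nothing else can run in the meantime
    print(requests.get(url).text)

print(time.time() - start)  # ~4 s: synchronous requests run one after another
```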
Notes:
- Code from modules that do not support asynchronous execution (such as requests or time.sleep) must not appear inside the special function.
- aiohttp is the module that supports asynchronous requests (a sketch of its common request options follows).
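A sketch of passing common request options to aiohttp, matching the s.get comment in the exercise below; the header and parameter values here are placeholders, and a proxy can be passed per request as proxy="http://ip:port".

```python
import asyncio
import aiohttp


async def get_request(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
    params = {'key': 'value'}                # placeholder query parameters
    async with aiohttp.ClientSession() as s:
        # s.get also accepts proxy="http://ip:port" for a per-request proxy
        async with s.get(url, headers=headers, params=params) as response:
            return await response.text()


print(asyncio.run(get_request('http://127.0.0.1:5000/bobo')))
```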
Exercise:
# templates folder / test.html

```html
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <!-- The three meta tags above *must* come first; any other head content *must* come after them! -->
    <title>Bootstrap 101 Template</title>
    <!-- Bootstrap -->
    <link href="bootstrap-3.3.7-dist/css/bootstrap.min.css" rel="stylesheet">
</head>
<body>
<h1>你好,世界!</h1>
<ul>
    <li>i am hero!!!</li>
    <li>i am superMan!!!</li>
    <li>i am Spider!!!</li>
</ul>
</body>
</html>
```
# flask_server.py

```python
from flask import Flask, render_template
import time

app = Flask(__name__)


@app.route('/bobo')
def index_bobo():
    time.sleep(2)
    return render_template('test.html')


@app.route('/jay')
def index_jay():
    time.sleep(2)
    return render_template('test.html')


@app.route('/tom')
def index_tom():
    time.sleep(2)
    return render_template('test.html')


if __name__ == '__main__':
    app.run(threaded=True)
```

# crawler_practice.py
```python
import aiohttp
import asyncio
import time
from lxml import etree

start = time.time()

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]


# special function: sends the request and captures the response data
# detail: put async before every with, and await before every blocking operation
async def get_request(url):
    async with aiohttp.ClientSession() as s:
        # s.get(url, headers=headers, proxy="http://ip:port", params=params)
        async with await s.get(url) as response:
            # read() returns bytes; text() returns a string
            page_text = await response.text()
            return page_text


# callback function (data parsing and persistence)
def parse(task):
    page_text = task.result()
    tree = etree.HTML(page_text)
    parse_data = tree.xpath('//li/text()')
    print(parse_data)


tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

print(time.time() - start)
```
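The parse callback above only handles the parsing half; a sketch of adding persistence, where parse_result.txt is an illustrative file name rather than one from the original notes:

```python
from lxml import etree


def parse(task):
    page_text = task.result()
    tree = etree.HTML(page_text)
    parse_data = tree.xpath('//li/text()')
    # persistence: append the parsed items to a local file (illustrative file name)
    with open('parse_result.txt', 'a', encoding='utf-8') as f:
        f.write(str(parse_data) + '\n')
    print(parse_data)
```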
To be continued