Crawling with coroutines and async I/O: asyncio and aiohttp
Basic usage:
# High-performance crawling with coroutines
import asyncio

async def fun(url):
    print(f"Requesting {url}")
    print(f"Request to {url} finished")
    return f"{url} is done"

f = fun("http://www.baidu.com")

# How to use the loop directly:
# loop = asyncio.get_event_loop()
# loop.run_until_complete(f)

# How to use a task:
# loop = asyncio.get_event_loop()
# task = loop.create_task(f)
# loop.run_until_complete(task)

# How to use a future:
# loop = asyncio.get_event_loop()
# task = asyncio.ensure_future(f)
# loop.run_until_complete(task)

# Callback function, fired when the task completes:
def callback(task):
    print(task.result())

# Bind the callback, then run:
loop = asyncio.get_event_loop()
task = loop.create_task(f)
task.add_done_callback(callback)
loop.run_until_complete(task)
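Since Python 3.7 the loop bookkeeping above is usually hidden behind asyncio.run(), which creates and closes the loop for you. A minimal sketch of the same demo in that style (my rewrite, not from the original notes):

import asyncio

async def fun(url):
    print(f"Requesting {url}")
    return f"{url} is done"

async def main():
    # create_task() schedules the coroutine on the already-running loop
    task = asyncio.create_task(fun("http://www.baidu.com"))
    print(await task)  # await the task to collect its return value

asyncio.run(main())  # creates, runs, and closes the event loop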
Example: crawling multiple tasks
# ============= The real thing ==============
# Crawl multiple URLs concurrently
import asyncio
import time

import aiohttp

headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.baidu.com/',
    'Connection': 'keep-alive',
}

start = time.time()

# eleven copies of the same song URL, just to have several tasks
arr = ["http://music.163.com/song/media/outer/url?id=1820550501.mp3"] * 11

async def downsong(url):
    print(f"{url} started")
    # Remember: any synchronous, blocking call inside a coroutine
    # stalls the whole event loop and kills the concurrency.
    # time.sleep(3)             # wrong: blocks the loop
    # await asyncio.sleep(3)    # right: yields while waiting
    # res = requests.get(url, headers=headers)  # wrong: requests is synchronous
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            pass
            # page_text = await response.text()
            # print(page_text)
    print(f"{url} finished")

stasks = []
loop = asyncio.get_event_loop()
for url in arr:
    task = loop.create_task(downsong(url))
    stasks.append(task)
loop.run_until_complete(asyncio.wait(stasks))

end = time.time()
print(end - start)
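For a real crawl with many more URLs you usually also cap the number of requests in flight. A sketch of one way to do that with asyncio.Semaphore and a single shared ClientSession (MAX_CONCURRENCY is a name I made up; headers reuses the dict defined above):

import asyncio
import aiohttp

MAX_CONCURRENCY = 5  # hypothetical cap; tune it for the target site

async def fetch(session, sem, url):
    async with sem:  # at most MAX_CONCURRENCY requests run at once
        async with session.get(url, headers=headers) as response:
            return await response.read()

async def crawl(urls):
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    # one session is reused so connections can be pooled across tasks
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch(session, sem, u)) for u in urls]
        return await asyncio.gather(*tasks)

# usage: results = asyncio.run(crawl(arr))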
aiohttp basics
# Note: async crawling needs one extra module, aiohttp:
#   pip install aiohttp
# The basic aiohttp pattern:
import asyncio
import aiohttp

async def get_page(url):
    async with aiohttp.ClientSession() as session:
        # get()/post() accept headers, params/data,
        # and proxy='http://ip:port'
        async with session.get(url) as response:
            # text() returns the response body as a string
            # read() returns the response body as bytes
            # json() returns a parsed JSON object
            # Note: always await these calls before touching the data
            page_text = await response.text()
            print(page_text)
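As a quick usage sketch of those keyword arguments (the query parameter and proxy address are placeholders of mine):

import asyncio
import aiohttp

async def demo(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            params={'q': 'test'},              # appended as ?q=test
            # proxy='http://127.0.0.1:8888',   # placeholder proxy
        ) as response:
            print(response.status)             # HTTP status code
            print(await response.text())       # body as a string

asyncio.run(demo("http://www.baidu.com"))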
Basic usage of asyncio.run() and gather()
import asyncio

async def a():
    await asyncio.sleep(3)
    print('Resuming a')

async def b():
    await asyncio.sleep(3)
    print('In b')

async def main():
    # gather() runs both coroutines concurrently,
    # so the total wait is about 3 seconds, not 6
    await asyncio.gather(a(), b())

if __name__ == '__main__':
    asyncio.run(main())
    print("done")
Put simply, it all runs inside one thread: every I/O wait is suspended as a coroutine so the other tasks can keep going.
One thing is still unsolved: I don't know how to write the downloaded data to a local file.
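For the record, one common approach (my assumption, not something from the original notes) is the aiofiles package, which provides awaitable file I/O:

# pip install aiofiles
import asyncio
import aiohttp
import aiofiles

async def downsong_to_disk(url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.read()  # raw bytes of the mp3
    # aiofiles.open() is the awaitable counterpart of open()
    async with aiofiles.open(filename, 'wb') as f:
        await f.write(data)

# usage: asyncio.run(downsong_to_disk(arr[0], "song.mp3"))

A plain synchronous open()/f.write() after the download also works in practice; the disk write is fast compared with the network wait, so it barely blocks the loop.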
1-1. The program raised:
There is no current event loop in thread 'Thread-1'
asyncio.get_event_loop() creates a loop automatically only in the main thread; in a worker thread you have to create and register one yourself. Change:

loop = asyncio.get_event_loop()

to:

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
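For context, a minimal sketch of the fix in place (the thread setup is mine, for illustration):

import asyncio
import threading

async def job():
    await asyncio.sleep(1)
    print("finished in", threading.current_thread().name)

def worker():
    # get_event_loop() would raise here: only the main thread
    # gets an event loop created automatically
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(job())
    loop.close()

t = threading.Thread(target=worker, name="Thread-1")
t.start()
t.join()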
-----------------------------------------------------------------------------------------------------------------------------------------