爬虫之单线程多任务异步抓取(Single-threaded, multi-task asynchronous crawling)
协程(Coroutines)
# Coroutine basics: special functions, coroutine objects, task objects, event loop.
import asyncio
import time

# Defined a "special" function.
# Special: calling it returns a coroutine object, and the statements inside
# the function body are NOT executed at call time.
#
# Create a coroutine object:
# async def test(num):
#     print(num)
#
# c = test(10)
# print(c)

# Wrap a task object:
# async def test(num):
#     print(num)
#
# c = test(10)
# # Wrap the coroutine object into a task object
# task = asyncio.ensure_future(c)
# print(task)

# Event loop object
async def request(url):
    print('正在请求:',url)
    # NOTE(review): time.sleep blocks the whole event loop — kept on purpose
    # here; the multi-task example below switches to `await asyncio.sleep`.
    time.sleep(2)
    print('请求完毕!',url)

c1 = request('www.1.com')
task_A = asyncio.ensure_future(c1)
# Create an event loop object
loop = asyncio.get_event_loop()
# Register the task object on the loop and start the event loop.
# NOTE(review): creating tasks before/outside a running loop via
# get_event_loop/ensure_future is deprecated on Python 3.10+; asyncio.run is
# the modern entry point — TODO confirm the target Python version.
loop.run_until_complete(task_A)
任务对象绑定回调(Binding a done-callback to a task object)
"""Demo: bind a done-callback to an asyncio task object."""
import asyncio
import time

async def request(url):
    """Coroutine that fakes a 2-second request and returns its url."""
    print('正在请求:',url)
    time.sleep(2)
    print('请求完毕!',url)
    return url

def task_callback(task):
    # `task` is the task object this callback was attached to;
    # task.result() yields whatever the special function returned.
    print('i am task_callback()')
    print(task.result())

# Wrap the coroutine in a task and attach the callback before running it.
task = asyncio.ensure_future(request('www.xxx.com'))
task.add_done_callback(task_callback)

event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(task)
多任务异步协程(Multi-task asynchronous coroutines)
"""Demo: run several coroutines concurrently on a single event loop."""
import asyncio
import time

start = time.time()

async def request(url):
    """Simulated request; no non-async-aware module may be used inside."""
    print('正在请求:',url)
    # time.sleep would block every task; asyncio.sleep cooperates with the
    # loop, so the three 2-second waits overlap. `await` suspends the
    # blocking operation.
    await asyncio.sleep(2)
    print('请求完毕!',url)
    return url

urls = [
    'www.1.com',
    'www.2.com',
    'www.3.com'
]

def task_callback(task):
    print(task.result())

def _make_task(u):
    # Helper: wrap one coroutine into a task and attach the callback.
    t = asyncio.ensure_future(request(u))
    t.add_done_callback(task_callback)
    return t

# Multi-task list: one task object per url, in url order.
tasks = [_make_task(u) for u in urls]

loop = asyncio.get_event_loop()
# asyncio.wait suspends until every task in the list has completed.
loop.run_until_complete(asyncio.wait(tasks))
print(time.time()-start)
多任务异步爬虫(Multi-task asynchronous crawler)
# Multi-task "async" crawler built on requests — demonstrates the pitfall.
import asyncio
import time
import requests

start = time.time()

# No non-async-compatible module may appear inside the special (coroutine)
# function; `requests` is exactly such a module.
async def request(url):
    """Fetch url and return the response body as text.

    NOTE(review): requests.get is synchronous, so it blocks the event loop
    and the three tasks run one after another rather than concurrently —
    this block exists to show why aiohttp is needed.
    """
    print('正在请求:',url)
    response = requests.get(url)
    return response.text

# assumes a local test server is listening on 127.0.0.1:5000 — TODO confirm
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay'
]

def parse(task):
    # Done-callback: task.result() is the text returned by request().
    page_text = task.result()
    print(page_text+',请求到的数据!!!')

tasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
print(time.time()-start)
aiohttp 使用(Using aiohttp)
# aiohttp usage notes — fixed so the chunk is valid Python: the bare prose
# lines that were interleaved with the code are now comments.
# import asyncio
# import time
# import aiohttp
# start = time.time()

# Code from modules that do not support async must not appear inside the
# special (coroutine) function — hence aiohttp instead of requests.
#
# Simple basic skeleton (NOT correct yet — refined below):
async def request(url):
    with aiohttp.ClientSession() as s:
        # s.get/post mirror requests.get/post: url, headers, data/params.
        # To use a proxy inside s.get: proxy="http://ip:port"
        with s.get(url) as response:
            # str response body: response.text(); bytes body: response.read()
            page_text = response.text()
            return page_text

# Refine the skeleton with two details:
#   detail 1: prefix every `with` with the `async` keyword
#   detail 2: put `await` before the get() call and before response.text()
#             to suspend the blocking operations by hand
# async def request(url):
#     async with aiohttp.ClientSession() as s:
#         async with await s.get(url) as response:
#             page_text = await response.text()
#             return page_text

urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
]
# urls = []
# for i in range(500):
#     urls.append('http://127.0.0.1:5000/bobo')

# def parse(task):
#     page_text = task.result()
#     print(page_text+',请求到的数据!!!')

# tasks = []
# for url in urls:
#     c = request(url)
#     task = asyncio.ensure_future(c)
#     task.add_done_callback(parse)
#     tasks.append(task)

# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.wait(tasks))
# print(time.time()-start)
案例(Case study)
# Case study: crawl complaint titles from wz.sun0769.com with aiohttp + lxml.
import aiohttp
import asyncio
from lxml import etree

# Accumulates every title scraped across all pages.
all_titles = []

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

async def request(url):
    """Fetch one listing page and return its body as text."""
    async with aiohttp.ClientSession() as s:
        async with await s.get(url,headers=headers) as response:
            page_text = await response.text()
            return page_text

# Build the 100 listing-page URLs; the site pages by row offset in steps
# of 30 (page=0, 30, 60, ...).
urls = []
url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
for page in range(100):
    u_page = page * 30
    new_url = format(url%u_page)
    urls.append(new_url)

tasks = []

def parse(task):
    """Done-callback: extract complaint titles from the fetched page."""
    page_text = task.result()
    # NOTE(review): round-trip re-encode to repair mojibake — assumes the
    # server actually serves gb2312/gbk-encoded content; verify against a
    # live response.
    page_text = page_text.encode('gb2312').decode('gbk')
    tree = etree.HTML(page_text)
    # Rows of the complaint listing table; XPath is tied to the site layout
    # as of when these notes were written.
    tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
    for tr in tr_list:
        title = tr.xpath('./td[2]/a[2]/text()')[0]
        print(title)
        all_titles.append(title)

for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))