异步爬虫实现的多种方式
对1000个url进行请求,并处理响应,测试每一种方法的耗时
一、grequests
单线程+协程
"""Benchmark 1: grequests, a single thread driving gevent coroutines.

Loads rows from the ``rosi`` MySQL table, downloads the URL found in column 1
of every row through ``grequests`` (pool size 10), stores each successfully
downloaded body back into MySQL as base64 text and prints the elapsed time.
"""
import base64
import time
import grequests
import requests
from io import BytesIO

from rosi.settings import MYSQL_CONFIG, MONGO_CONFIG
from rosi.db.mysqldb import MysqlDB
from rosi.db.mongodb import MongoDB


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def get_image_b64_by_grequests(url):
    """Download ``url`` synchronously and return its body base64-encoded.

    Despite the name this helper uses plain ``requests``; it is not called by
    ``main`` in this script.

    :param url: address to download
    :return: base64 encoded body as ``bytes``
    """
    response = requests.get(url)
    # b64encode accepts the bytes directly; no BytesIO round trip is needed.
    return base64.b64encode(response.content)


def update_mongo_data(data):
    """Pass ``data`` to ``MongoDB.update`` for ``'rosinew'`` / ``'image_data'``."""
    db = MongoDB(**MONGO_CONFIG)
    db.update('rosinew', 'image_data', data)


def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


def callback_func(r, *args, **kwargs):
    """Response hook: persist the downloaded image as base64 text.

    The signature is dictated by the ``requests`` hook protocol.

    :param r: the response object handed over by ``requests``
    :param args: unused, required by the hook protocol
    :param kwargs: unused, required by the hook protocol
    :return: ``None`` so that ``requests`` keeps the original response
    """
    # A response hook receives a real response even for 4xx/5xx answers, so
    # the status has to be checked explicitly; otherwise an error page would
    # be stored as if it were image data.
    if r is None or r.status_code != 200:
        print('非成功请求')
        return
    b64_data = base64.b64encode(r.content).decode('utf-8')
    # NOTE(review): r.url is the URL requests actually fetched; if it differs
    # from the stored image_url (redirect, normalisation) the UPDATE matches
    # no row - confirm against the data.
    update_data(b64_data, r.url)
    print(r.url)


def exception_handler(request, exception):
    """Report a request that failed before any response was produced.

    :param request: the failed grequests request object
    :param exception: the exception raised while sending it
    :return: ``None``, which ``grequests.map`` puts in the result list
    """
    # Include the cause; the bare message alone made failures undiagnosable.
    print('出错了', getattr(request, 'url', request), exception)


def main():
    """Run the benchmark and print the total elapsed time."""
    data = get_data()
    s_time = time.time()

    # Lazily build the requests; grequests.map keeps 10 of them in flight.
    requests_list = (grequests.get(item[1], callback=callback_func) for item in data)
    response_list = grequests.map(requests_list, size=10, exception_handler=exception_handler)

    print(response_list)
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果:
二、多线程+grequests
4个线程,每个线程并发10个协程
"""Benchmark 2: four threads, each running grequests with pool size 10.

The rows loaded from MySQL are split into chunks of 250; every chunk is
handed to a worker thread that downloads its URLs through ``grequests``.
"""
import base64
import time
import grequests
import requests
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed

from rosi.settings import MYSQL_CONFIG, MONGO_CONFIG
from rosi.db.mysqldb import MysqlDB
from rosi.db.mongodb import MongoDB


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def list_of_groups(init_list, children_list_len):
    """Split ``init_list`` into consecutive chunks of ``children_list_len`` items.

    The last chunk holds the remainder and may be shorter.

    :param init_list: sequence supporting ``len`` and slicing
    :param children_list_len: positive chunk size
    :return: list of lists, in the original order
    :raises ValueError: if ``children_list_len`` is not positive
    """
    if children_list_len <= 0:
        raise ValueError('children_list_len must be a positive integer')
    return [
        list(init_list[start:start + children_list_len])
        for start in range(0, len(init_list), children_list_len)
    ]


def get_image_b64_by_grequests(url):
    """Download ``url`` synchronously and return its body base64-encoded.

    Despite the name this helper uses plain ``requests``; it is not called by
    ``main`` in this script.

    :param url: address to download
    :return: base64 encoded body as ``bytes``
    """
    response = requests.get(url)
    # b64encode accepts the bytes directly; no BytesIO round trip is needed.
    return base64.b64encode(response.content)


def update_mongo_data(data):
    """Pass ``data`` to ``MongoDB.update`` for ``'rosinew'`` / ``'image_data'``."""
    db = MongoDB(**MONGO_CONFIG)
    db.update('rosinew', 'image_data', data)


def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


def callback_func(r, *args, **kwargs):
    """Response hook: persist the downloaded image as base64 text.

    The signature is dictated by the ``requests`` hook protocol.

    :param r: the response object handed over by ``requests``
    :param args: unused, required by the hook protocol
    :param kwargs: unused, required by the hook protocol
    :return: ``None`` so that ``requests`` keeps the original response
    """
    # A response hook receives a real response even for 4xx/5xx answers, so
    # the status has to be checked explicitly; otherwise an error page would
    # be stored as if it were image data.
    if r is None or r.status_code != 200:
        print('非成功请求')
        return
    b64_data = base64.b64encode(r.content).decode('utf-8')
    update_data(b64_data, r.url)
    print(r.url)


def exception_handler(request, exception):
    """Report a request that failed before any response was produced.

    :param request: the failed grequests request object
    :param exception: the exception raised while sending it
    :return: ``None``, which ``grequests.map`` puts in the result list
    """
    # Include the cause; the bare message alone made failures undiagnosable.
    print('出错了', getattr(request, 'url', request), exception)


def grequests_func(data):
    """Download every URL of one chunk of rows with a grequests pool of 10.

    :param data: iterable of rows; column 1 of each row is the URL
    """
    requests_list = (grequests.get(item[1], callback=callback_func) for item in data)
    response_list = grequests.map(requests_list, size=10, exception_handler=exception_handler)
    print(response_list)


def main():
    """Run the benchmark and print the total elapsed time."""
    data = get_data()
    init_data = list_of_groups(data, 250)
    s_time = time.time()
    # Four worker threads, each driving a grequests pool of 10.
    with ThreadPoolExecutor(max_workers=4) as executor:
        all_task = [executor.submit(grequests_func, params) for params in init_data]
        # as_completed already blocks until every task is done; result()
        # re-raises anything a worker thread raised.
        for future in as_completed(all_task):
            future.result()
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果:
三、asyncio
使用python3异步协程asyncio,单线程并发数10
"""Benchmark 3: asyncio + aiohttp in one thread, at most 10 requests in flight."""
import asyncio
import aiohttp
import base64
import time
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed

from rosi.settings import MYSQL_CONFIG
from rosi.db.mysqldb import MysqlDB

# Maximum number of downloads allowed to run at the same time.
CONCURRENCY = 10


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


async def fetch(client, url):
    """Download ``url`` and return ``(body_bytes, url)``.

    :param client: an open ``aiohttp.ClientSession``
    :param url: address to download
    :return: tuple of the raw body and the requested url
    :raises aiohttp.ClientResponseError: if the server answers with status >= 400
    """
    async with client.get(url) as resp:
        # Do not store error pages as image data.
        resp.raise_for_status()
        # read() collects the whole body; the previous ``content += chunk``
        # loop re-copied the accumulated bytes on every 1 KiB chunk.
        content = await resp.read()
        print(url)
        return (content, url)


async def fetch_all(urls):
    """Download all ``urls`` with at most ``CONCURRENCY`` requests in flight.

    :param urls: iterable of addresses
    :return: list of ``(body_bytes, url)`` for the downloads that succeeded
    """
    # The semaphore must be acquired around EACH request. Acquiring it once
    # around the whole gather (as before) limits nothing: all requests start
    # at the same time.
    semaphore = asyncio.Semaphore(CONCURRENCY)

    async def bounded_fetch(client, url):
        async with semaphore:
            try:
                return await fetch(client, url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # One broken URL must not abort the whole gather.
                print(f'出错了: {url} {exc}')
                return None

    async with aiohttp.ClientSession() as client:
        results = await asyncio.gather(*(bounded_fetch(client, url) for url in urls))
    return [result for result in results if result is not None]


def main():
    """Run the benchmark and print the total elapsed time."""
    urls = [item[1] for item in get_data()]
    s_time = time.time()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Download everything concurrently first ...
        results = loop.run_until_complete(fetch_all(urls))
    finally:
        loop.close()
    # ... then write the results to MySQL one by one.
    for content, url in results:
        b64_data = base64.b64encode(content).decode('utf-8')
        update_data(b64_data, url)
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果:
四、多线程+asyncio
4个线程,每个线程并发10个协程
"""Benchmark 4: four threads, each running its own asyncio loop (10 in flight)."""
import asyncio
import aiohttp
import base64
import time
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed

from rosi.settings import MYSQL_CONFIG
from rosi.db.mysqldb import MysqlDB

# Maximum number of downloads allowed to run at the same time per thread.
CONCURRENCY = 10


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def list_of_groups(init_list, children_list_len):
    """Split ``init_list`` into consecutive chunks of ``children_list_len`` items.

    The last chunk holds the remainder and may be shorter.

    :param init_list: sequence supporting ``len`` and slicing
    :param children_list_len: positive chunk size
    :return: list of lists, in the original order
    :raises ValueError: if ``children_list_len`` is not positive
    """
    if children_list_len <= 0:
        raise ValueError('children_list_len must be a positive integer')
    return [
        list(init_list[start:start + children_list_len])
        for start in range(0, len(init_list), children_list_len)
    ]


def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


async def fetch(client, url):
    """Download ``url`` and return ``(body_bytes, url)``.

    :param client: an open ``aiohttp.ClientSession``
    :param url: address to download
    :return: tuple of the raw body and the requested url
    :raises aiohttp.ClientResponseError: if the server answers with status >= 400
    """
    async with client.get(url) as resp:
        # Do not store error pages as image data.
        resp.raise_for_status()
        # read() collects the whole body; the previous ``content += chunk``
        # loop re-copied the accumulated bytes on every 1 KiB chunk.
        content = await resp.read()
        print(url)
        return (content, url)


async def fetch_all(urls):
    """Download column 1 of every row with at most ``CONCURRENCY`` in flight.

    :param urls: iterable of rows; column 1 of each row is the URL
    :return: list of ``(body_bytes, url)`` for the downloads that succeeded
    """
    # The semaphore must be acquired around EACH request. Acquiring it once
    # around the whole gather (as before) limits nothing: all requests start
    # at the same time.
    semaphore = asyncio.Semaphore(CONCURRENCY)

    async def bounded_fetch(client, url):
        async with semaphore:
            try:
                return await fetch(client, url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # One broken URL must not abort the whole gather.
                print(f'出错了: {url} {exc}')
                return None

    async with aiohttp.ClientSession() as client:
        results = await asyncio.gather(*(bounded_fetch(client, row[1]) for row in urls))
    return [result for result in results if result is not None]


def run(urls):
    """Thread entry point: download one chunk of rows on a private event loop.

    :param urls: iterable of rows; column 1 of each row is the URL
    """
    # Worker threads have no event loop of their own, so create one here.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        results = loop.run_until_complete(fetch_all(urls))
    finally:
        # Release the loop's resources once this thread's chunk is done.
        loop.close()
    for content, url in results:
        b64_data = base64.b64encode(content).decode('utf-8')
        update_data(b64_data, url)


def main():
    """Run the benchmark and print the total elapsed time."""
    data = get_data()
    init_data = list_of_groups(data, 250)
    s_time = time.time()
    # Four worker threads, each limited to 10 concurrent coroutines.
    with ThreadPoolExecutor(max_workers=4) as executor:
        all_task = [executor.submit(run, params) for params in init_data]
        # as_completed already blocks until every task is done; result()
        # re-raises anything a worker thread raised.
        for future in as_completed(all_task):
            future.result()
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果:
asyncio 协程+多线程相比只用 asyncio 没有明显提升,甚至还不如只用 asyncio
五、tornado异步
基于 tornado 官方文档中的示例修改而来,并发 10 个协程
"""Benchmark 5: Tornado coroutines, ``concurrency`` workers pulling from a queue."""
import time
import base64
from datetime import timedelta
from io import BytesIO
from tornado import httpclient, gen, ioloop, queues

from rosi.settings import MYSQL_CONFIG
from rosi.db.mysqldb import MysqlDB

# These names are not used anywhere in this script; the imports are kept
# untouched.
try:
    from HTMLParser import HTMLParser
    from urlparse import urljoin, urldefrag
except ImportError:
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag


# Number of worker coroutines consuming the URL queue.
concurrency = 10


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


@gen.coroutine
def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    # NOTE(review): nothing in here yields, so the database call presumably
    # runs synchronously on the IOLoop - confirm whether MysqlDB blocks.
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


@gen.coroutine
def get_request_async(url):
    """Download ``url`` and return its body base64-encoded.

    :param url: address to download
    :return: base64 ``bytes`` on success, an empty list if the fetch failed
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        return []
    return base64.b64encode(response.body)


@gen.coroutine
def main():
    """Run the benchmark and print the total elapsed time."""
    q = queues.Queue()
    # Fill the queue with the URLs; it is unbounded, so put_nowait never
    # raises and there is no future left un-yielded.
    for item in get_data():
        q.put_nowait(item[1])
    start = time.time()
    print(q.qsize())

    @gen.coroutine
    def fetch_url():
        """Take one URL from the queue, download it and store the result."""
        current_url = yield q.get()
        try:
            b64_data = yield get_request_async(current_url)
            if not b64_data:
                # The download failed (already reported) or returned an empty
                # body. Skip explicitly instead of passing the empty list to
                # str(), which only produced a misleading TypeError.
                return
            image_data = str(b64_data, 'utf-8')
            yield update_data(image_data, current_url)
            print(current_url)
        except Exception as e:
            print('**************************************', str(e))
        finally:
            # Always mark the item done, otherwise q.join() never returns.
            q.task_done()

    @gen.coroutine
    def worker():
        """Process queue items forever; stops when the IOLoop stops."""
        while True:
            yield fetch_url()

    # Start the workers without waiting for them, then wait for the queue.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=3000))
    print(q.qsize())
    print(f'结束了: {time.time() - start}')


if __name__ == '__main__':
    import logging
    logging.basicConfig()
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)
结果:
六、多线程
10个线程测试一下,多进程就不测了
"""Benchmark 6: plain thread pool (10 threads) with blocking ``requests``."""
import base64
import requests
import time
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed

from rosi.settings import MYSQL_CONFIG
from rosi.db.mysqldb import MysqlDB


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def update_data(data, url):
    """Store the base64 text ``data`` on the row whose ``image_url`` is ``url``.

    :param data: base64 text to write into ``image_b64``
    :param url: value matched against the ``image_url`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting, so a quote in
    # ``url`` breaks (or injects into) the SQL. Switch to a parameterized
    # query if MysqlDB.update supports one - its signature is not visible here.
    sql = "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    db.update(sql)


def get_image_b64_by_requests(url):
    """Download ``url`` and return its body base64-encoded.

    :param url: address to download
    :return: base64 encoded body as ``bytes``
    :raises requests.RequestException: on connection errors or a 4xx/5xx answer
    """
    response = requests.get(url)
    # Do not store error pages as image data.
    response.raise_for_status()
    # b64encode accepts the bytes directly; no BytesIO round trip is needed.
    return base64.b64encode(response.content)


def deal_func(item):
    """Thread task: download the image of one row and store it.

    Failures are reported and swallowed here, like in the synchronous
    variant of this benchmark, so that one broken URL cannot abort the run.

    :param item: one row; column 1 is the URL
    :return: always ``None``
    """
    url = item[1]
    try:
        b64_data = get_image_b64_by_requests(url).decode('utf-8')
        update_data(b64_data, url)
    except Exception as e:
        print('*****************', str(e))
        return None
    print(url)
    return None


def main():
    """Run the benchmark and print the total elapsed time."""
    data = get_data()
    s_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_task = [executor.submit(deal_func, item) for item in data]
        # as_completed already blocks until every task is done; result()
        # re-raises anything unexpected from a worker thread.
        for future in as_completed(all_task):
            future.result()
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果:
七、同步方式
测试一下同步的方式,用作对比
"""Benchmark 7: synchronous baseline, one blocking request after another."""
import base64
import time
import requests
from io import BytesIO
from rosi.settings import MYSQL_CONFIG
from rosi.db.mysqldb import MysqlDB


def get_data():
    """Fetch the rows whose ``image_b64`` column is still empty.

    :return: whatever ``MysqlDB.find`` returns for the query (called with
        ``limit=1000``); callers index each row positionally.
    """
    db = MysqlDB(**MYSQL_CONFIG)
    sql = "select * from rosi where image_b64=''"
    data = db.find(sql, limit=1000)
    print(len(data))
    return data


def get_image_b64_by_requests(url):
    """Download ``url`` and return its body base64-encoded.

    :param url: address to download
    :return: base64 encoded body as ``bytes``
    :raises requests.RequestException: on connection errors or a 4xx/5xx answer
    """
    response = requests.get(url)
    # Do not store error pages as image data; main() reports the failure.
    response.raise_for_status()
    # b64encode accepts the bytes directly; no BytesIO round trip is needed.
    return base64.b64encode(response.content)


def update_data(data, id):
    """Store the base64 ``data`` on the row whose ``id`` column equals ``id``.

    :param data: base64 encoded image as ``bytes``; decoded to text here
    :param id: value matched against the ``id`` column
    """
    db = MysqlDB(**MYSQL_CONFIG)
    # NOTE(review): the statement is built by string formatting; switch to a
    # parameterized query if MysqlDB.update supports one - its signature is
    # not visible here.
    sql = "update rosi set image_b64='{}' where id='{}'".format(str(data, 'utf-8'), id)
    db.update(sql)


def main():
    """Run the benchmark and print the total elapsed time."""
    data = get_data()
    s_time = time.time()
    for item in data:
        # Column 0 is used as the row id and column 1 as the URL.
        try:
            image = get_image_b64_by_requests(item[1])
            update_data(image, item[0])
            print(item[2],item[4])
        except Exception as e:
            # Best effort: report the failure and carry on with the next row.
            print('*****************', str(e))
    print(f'结束了: {time.time() - s_time}')


if __name__ == '__main__':
    main()
结果: