异步爬虫实现的多种方式

对1000个url进行请求,并处理响应,测试每一种方法的耗时

一、grequests

单线程+协程

 1 import base64
 2 import time
 3 import grequests
 4 import requests
 5 from io import BytesIO
 6 
 7 from rosi.settings import MYSQL_CONFIG, MONGO_CONFIG
 8 from rosi.db.mysqldb import MysqlDB
 9 from rosi.db.mongodb import MongoDB
10 
11 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
18 
19 
def get_image_b64_by_grequests(url):
    """Download ``url`` with a blocking ``requests.get`` and base64-encode the body.

    :param url: address to download
    :return: base64-encoded response body as ``bytes``
    """
    body = BytesIO(requests.get(url).content)
    return base64.b64encode(body.read())
24 
25 
def update_mongo_data(data):
    """Forward ``data`` to ``MongoDB.update`` with the arguments 'rosinew' and 'image_data'.

    A new ``MongoDB`` object is built from ``MONGO_CONFIG`` on every call.

    :param data: payload handed to ``MongoDB.update`` unchanged
    """
    MongoDB(**MONGO_CONFIG).update('rosinew', 'image_data', data)
29 
30 
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
35 
36 
def callback_func(r, *args, **kwargs):
    """Response callback: base64-encode the body and store it in MySQL.

    The signature has to stay ``(r, *args, **kwargs)`` because the callback
    is invoked with the response plus extra arguments.

    :param r: the response object, or ``None`` when there is no response
    :param args: extra positional arguments (ignored)
    :param kwargs: extra keyword arguments (ignored)
    :return: None
    """
    if r is None:
        print('非成功请求')
        return
    # b64encode accepts the body bytes directly; wrapping them in BytesIO only
    # to read them back made a needless copy of every image.
    b64_data = base64.b64encode(r.content).decode('utf-8')
    update_data(b64_data, r.url)
    print(r.url)
52 
def exception_handler(request, exception):
    """Report a request that failed with an exception.

    The original message carried no detail at all; the URL (when the request
    object exposes one) and the exception are now included so that failures
    can actually be diagnosed.

    :param request: the request object that failed
    :param exception: the exception that was raised for it
    :return: None
    """
    url = getattr(request, 'url', None)
    print(f'出错了: {url} {exception!r}')
61 
def main():
    """Download every pending image through grequests and print the elapsed time.

    Element 1 of each row is used as the URL. Requests are sent with
    ``grequests.map(..., size=10)``; ``callback_func`` handles each response
    and ``exception_handler`` is passed for failures.
    """
    rows = get_data()
    started_at = time.time()

    def pending_requests():
        # Lazily build one request per row.
        for row in rows:
            yield grequests.get(row[1], callback=callback_func)

    responses = grequests.map(
        pending_requests(),
        size=10,
        exception_handler=exception_handler,
    )

    print(responses)
    elapsed = time.time() - started_at
    print(f'结束了: {elapsed}')
71 
72 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

结果:

二、多线程+grequests

4个线程,每个线程并发10个协程

 1 import base64
 2 import time
 3 import grequests
 4 import requests
 5 from io import BytesIO
 6 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed
 7 
 8 from rosi.settings import MYSQL_CONFIG, MONGO_CONFIG
 9 from rosi.db.mysqldb import MysqlDB
10 from rosi.db.mongodb import MongoDB
11 
12 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
19 
20 
def list_of_groups(init_list, children_list_len):
    """Split ``init_list`` into consecutive chunks of ``children_list_len`` items.

    Every chunk is a ``list``, including the shorter trailing one. The
    previous implementation returned the trailing chunk as a slice of the
    input, so a tuple input produced a tuple for the last chunk only.

    :param init_list: sequence supporting ``len()`` and slicing
    :param children_list_len: chunk size, must be at least 1
    :return: list of lists; empty when ``init_list`` is empty
    :raises ValueError: if ``children_list_len`` is smaller than 1
    """
    if children_list_len < 1:
        raise ValueError('children_list_len must be >= 1')
    return [
        list(init_list[start:start + children_list_len])
        for start in range(0, len(init_list), children_list_len)
    ]
27 
28 
def get_image_b64_by_grequests(url):
    """Download ``url`` with a blocking ``requests.get`` and base64-encode the body.

    :param url: address to download
    :return: base64-encoded response body as ``bytes``
    """
    body = BytesIO(requests.get(url).content)
    return base64.b64encode(body.read())
33 
34 
def update_mongo_data(data):
    """Forward ``data`` to ``MongoDB.update`` with the arguments 'rosinew' and 'image_data'.

    A new ``MongoDB`` object is built from ``MONGO_CONFIG`` on every call.

    :param data: payload handed to ``MongoDB.update`` unchanged
    """
    MongoDB(**MONGO_CONFIG).update('rosinew', 'image_data', data)
38 
39 
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
44 
45 
def callback_func(r, *args, **kwargs):
    """Response callback: base64-encode the body and store it in MySQL.

    The signature has to stay ``(r, *args, **kwargs)`` because the callback
    is invoked with the response plus extra arguments.

    :param r: the response object, or ``None`` when there is no response
    :param args: extra positional arguments (ignored)
    :param kwargs: extra keyword arguments (ignored)
    :return: None
    """
    if r is None:
        print('非成功请求')
        return
    # b64encode accepts the body bytes directly; wrapping them in BytesIO only
    # to read them back made a needless copy of every image.
    b64_data = base64.b64encode(r.content).decode('utf-8')
    update_data(b64_data, r.url)
    print(r.url)
61 
62 
def exception_handler(request, exception):
    """Report a request that failed with an exception.

    The original message carried no detail at all; the URL (when the request
    object exposes one) and the exception are now included so that failures
    can actually be diagnosed.

    :param request: the request object that failed
    :param exception: the exception that was raised for it
    :return: None
    """
    url = getattr(request, 'url', None)
    print(f'出错了: {url} {exception!r}')
71 
72 
def grequests_func(data):
    """Download one group of rows through grequests and print the response list.

    Element 1 of each row is used as the URL. Requests are sent with
    ``grequests.map(..., size=10)``.

    :param data: iterable of rows
    :return: None
    """
    def pending_requests():
        for row in data:
            yield grequests.get(row[1], callback=callback_func)

    responses = grequests.map(
        pending_requests(),
        size=10,
        exception_handler=exception_handler,
    )
    print(responses)
78 
79 
def main():
    """Split the pending rows into groups of 250 and download them on 4 threads.

    Each group is handed to ``grequests_func``. The total elapsed time is
    printed at the end.
    """
    data = get_data()
    init_data = list_of_groups(data, 250)
    s_time = time.time()
    # 4 worker threads; each one runs grequests_func on its own group.
    # Leaving the ``with`` block already waits for every submitted task, so
    # the former explicit wait(...) call was redundant.
    with ThreadPoolExecutor(max_workers=4) as executor:
        all_task = [executor.submit(grequests_func, params) for params in init_data]
    # result() re-raises any exception that occurred inside a worker.
    for future in all_task:
        future.result()
    print(f'结束了: {time.time() - s_time}')
91 
92 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

结果:

 

三、asyncio

 使用python3异步协程asyncio,单线程并发数10

 1 import asyncio
 2 import aiohttp
 3 import base64
 4 import time
 5 from io import BytesIO
 6 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed
 7 
 8 from rosi.settings import MYSQL_CONFIG
 9 from rosi.db.mysqldb import MysqlDB
10 
11 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
18 
19 
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
24 
25 
async def fetch(client, url):
    """Download ``url`` through ``client`` and return its body.

    The body is read with a single ``resp.read()`` call. The previous loop
    appended 1 KiB chunks with ``content += chunk``, which re-copies the
    growing buffer on every iteration.

    :param client: session object exposing ``get(url)`` as an async context manager
    :param url: address to download
    :return: tuple ``(content, url)`` where ``content`` is the body as ``bytes``
    """
    async with client.get(url) as resp:
        content = await resp.read()
        print(url)
        return (content, url)
37 
38 
async def fetch_all(urls):
    """Download all ``urls`` with at most 10 requests in flight at a time.

    The previous version entered the semaphore once around the whole
    ``gather`` call, so a single slot was taken and every request still
    started at once. The semaphore is now acquired per request.

    :param urls: iterable of addresses to download
    :return: list of ``(content, url)`` tuples in the order of ``urls``
    """
    # Created inside the coroutine so it belongs to the running event loop.
    limiter = asyncio.Semaphore(10)

    async def bounded_fetch(client, url):
        async with limiter:
            return await fetch(client, url)

    async with aiohttp.ClientSession() as client:
        return await asyncio.gather(*(bounded_fetch(client, url) for url in urls))
43 
44 
def main():
    """Download all pending images with asyncio, store them, and print the elapsed time.

    Element 1 of each row is used as the URL. The downloads run on a
    dedicated event loop that is closed afterwards; the old code relied on
    ``asyncio.get_event_loop()`` creating a loop implicitly (deprecated in
    recent Python versions) and never closed it.
    """
    urls = [item[1] for item in get_data()]
    s_time = time.time()
    loop = asyncio.new_event_loop()
    try:
        # Fetch everything asynchronously.
        results = loop.run_until_complete(fetch_all(urls))
    finally:
        loop.close()
    for content, url in results:
        # content is already bytes, so no BytesIO round trip is needed.
        b64_data = base64.b64encode(content).decode('utf-8')
        update_data(b64_data, url)
    print(f'结束了: {time.time() - s_time}')
58 
59 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

 结果:

四、多线程+asyncio

4个线程,每个线程并发10个协程

 1 import asyncio
 2 import aiohttp
 3 import base64
 4 import time
 5 from io import BytesIO
 6 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed
 7 
 8 from rosi.settings import MYSQL_CONFIG
 9 from rosi.db.mysqldb import MysqlDB
10 
11 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
18 
19 
def list_of_groups(init_list, children_list_len):
    """Split ``init_list`` into consecutive chunks of ``children_list_len`` items.

    Every chunk is a ``list``, including the shorter trailing one. The
    previous implementation returned the trailing chunk as a slice of the
    input, so a tuple input produced a tuple for the last chunk only.

    :param init_list: sequence supporting ``len()`` and slicing
    :param children_list_len: chunk size, must be at least 1
    :return: list of lists; empty when ``init_list`` is empty
    :raises ValueError: if ``children_list_len`` is smaller than 1
    """
    if children_list_len < 1:
        raise ValueError('children_list_len must be >= 1')
    return [
        list(init_list[start:start + children_list_len])
        for start in range(0, len(init_list), children_list_len)
    ]
26 
27 
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
32 
33 
async def fetch(client, url):
    """Download ``url`` through ``client`` and return its body.

    The body is read with a single ``resp.read()`` call. The previous loop
    appended 1 KiB chunks with ``content += chunk``, which re-copies the
    growing buffer on every iteration.

    :param client: session object exposing ``get(url)`` as an async context manager
    :param url: address to download
    :return: tuple ``(content, url)`` where ``content`` is the body as ``bytes``
    """
    async with client.get(url) as resp:
        content = await resp.read()
        print(url)
        return (content, url)
45 
46 
async def fetch_all(urls):
    """Download the URL of every row with at most 10 requests in flight at a time.

    Element 1 of each entry in ``urls`` is used as the address. The previous
    version entered the semaphore once around the whole ``gather`` call, so
    it never limited concurrency; it is now acquired per request.

    :param urls: iterable of rows whose element 1 is the address
    :return: list of ``(content, url)`` tuples in the order of ``urls``
    """
    # Created inside the coroutine so it belongs to the running event loop.
    limiter = asyncio.Semaphore(10)

    async def bounded_fetch(client, address):
        async with limiter:
            return await fetch(client, address)

    async with aiohttp.ClientSession() as client:
        return await asyncio.gather(*(bounded_fetch(client, url[1]) for url in urls))
51 
52 
def run(urls):
    """Download one group of rows on a private event loop and store the results.

    Intended to run inside a worker thread: a fresh loop is created, installed
    as the thread's current loop, and closed again once the downloads finish.
    The previous version never closed the loop, leaking it per call.

    :param urls: group of rows passed on to ``fetch_all``
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Fetch the whole group asynchronously.
        results = loop.run_until_complete(fetch_all(urls))
    finally:
        # Do not leave a closed loop installed on a reusable pool thread.
        asyncio.set_event_loop(None)
        loop.close()
    for content, url in results:
        # content is already bytes, so no BytesIO round trip is needed.
        b64_data = base64.b64encode(content).decode('utf-8')
        update_data(b64_data, url)
65 
66 
def main():
    """Split the pending rows into groups of 250 and process them on 4 threads.

    Each group is handed to ``run``. The total elapsed time is printed at
    the end.
    """
    data = get_data()
    init_data = list_of_groups(data, 250)
    s_time = time.time()
    # 4 worker threads; each one drives its own event loop inside run().
    # Leaving the ``with`` block already waits for every submitted task, so
    # the former explicit wait(...) call was redundant.
    with ThreadPoolExecutor(max_workers=4) as executor:
        all_task = [executor.submit(run, params) for params in init_data]
    # result() re-raises any exception that occurred inside a worker.
    for future in all_task:
        future.result()
    print(f'结束了: {time.time() - s_time}')
78 
79 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

结果:

 asyncio协程+多线程没啥提升,还不如只用asyncio

五、tornado异步

tornado官方文档中的示例改了一下,并发10协程

 1 import time
 2 import base64
 3 from datetime import timedelta
 4 from io import BytesIO
 5 from tornado import httpclient, gen, ioloop, queues
 6 
 7 from rosi.settings import MYSQL_CONFIG
 8 from rosi.db.mysqldb import MysqlDB
 9 
10 try:
11     from HTMLParser import HTMLParser
12     from urlparse import urljoin, urldefrag
13 except ImportError:
14     from html.parser import HTMLParser
15     from urllib.parse import urljoin, urldefrag
16 
17 
18 concurrency = 10
19 
20 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
27 
28 
@gen.coroutine
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    Wrapped in ``gen.coroutine`` so callers can ``yield`` it; the body itself
    contains no ``yield``.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
34 
35 
@gen.coroutine
def get_request_async(url):
    """Fetch ``url`` with tornado's async HTTP client and base64-encode the body.

    :param url: address to download
    :return: base64-encoded body as ``bytes``; an empty list when fetching or
        encoding raised (the exception and URL are printed in that case)
    """
    try:
        reply = yield httpclient.AsyncHTTPClient().fetch(url)
        payload = BytesIO(reply.body)
        encoded = base64.b64encode(payload.read())
    except Exception as err:
        print('Exception: %s %s' % (err, url))
        raise gen.Return([])
    else:
        raise gen.Return(encoded)
45 
46 
@gen.coroutine
def main():
    """Download all pending images with ``concurrency`` tornado workers.

    Element 1 of each row is queued as a URL. Workers take URLs from the
    queue, download and store them; the coroutine finishes once the queue has
    been fully processed, or raises if the join times out after 3000 seconds.
    """
    q = queues.Queue()
    # Enqueue the URLs.
    for item in get_data():
        q.put(item[1])
    start = time.time()
    print(q.qsize())

    @gen.coroutine
    def fetch_url():
        """Process a single URL taken from the queue."""
        current_url = yield q.get()
        try:
            b64_data = yield get_request_async(current_url)
            if not b64_data:
                # get_request_async returns an empty list on failure and has
                # already printed the error. Previously execution fell through
                # to str([], 'utf-8') and reported a misleading TypeError.
                return
            image_data = str(b64_data, 'utf-8')
            yield update_data(image_data, current_url)
            print(current_url)
        except Exception as e:
            print('**************************************', str(e))
        finally:
            # Always mark the item done so q.join() can complete.
            q.task_done()

    @gen.coroutine
    def worker():
        """Keep processing queue items forever."""
        while True:
            yield fetch_url()

    # Start the workers without waiting for them; q.join() is the barrier.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=3000))
    print(q.qsize())
    print(f'结束了: {time.time() - start}')
78 
79 
# Script entry point: configure logging, then drive main() on the IOLoop.
if __name__ == "__main__":
    import logging

    logging.basicConfig()
    ioloop.IOLoop.current().run_sync(main)

结果:

六、多线程

10个线程测试一下,多进程就不测了

 1 import base64
 2 import requests
 3 import time
 4 from io import BytesIO
 5 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, as_completed
 6 
 7 from rosi.settings import MYSQL_CONFIG
 8 from rosi.db.mysqldb import MysqlDB
 9 
10 
def get_data():
    """Load the rows of ``rosi`` whose ``image_b64`` column is still empty.

    The query is issued through ``MysqlDB.find`` with ``limit=1000``; the
    number of rows that came back is printed before they are returned.

    :return: whatever ``MysqlDB.find`` returns for the query
    """
    pending_rows = MysqlDB(**MYSQL_CONFIG).find(
        "select * from rosi where image_b64=''",
        limit=1000,
    )
    print(len(pending_rows))
    return pending_rows
17 
18 
def update_data(data, url):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``image_url`` equals ``url``.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: text stored in ``image_b64``
    :param url: value matched against ``image_url``
    """
    # NOTE(review): the statement is assembled by string formatting, so a
    # quote character inside ``url`` would break it. Switch to a parameterized
    # query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    connection.update(
        "update rosi set image_b64='{}' where image_url='{}'".format(data, url)
    )
23 
24 
def get_image_b64_by_requests(url, timeout=30):
    """Download ``url`` with ``requests`` and return the body base64-encoded.

    A timeout is now passed to ``requests.get``; without one, a stalled
    server can block the calling worker thread indefinitely.

    :param url: address to download
    :param timeout: seconds passed to ``requests.get`` (default 30)
    :return: base64-encoded response body as ``bytes``
    :raises requests.RequestException: on connection errors or timeouts
    """
    response = requests.get(url, timeout=timeout)
    # b64encode accepts the body bytes directly; no BytesIO copy is needed.
    return base64.b64encode(response.content)
29 
30 
def deal_func(item):
    """Download the image of one row, store it base64-encoded, and print its URL.

    :param item: row whose element 1 is used as the image URL
    :return: None
    """
    encoded = str(get_image_b64_by_requests(item[1]), 'utf-8')
    update_data(encoded, item[1])
    print(item[1])
    return None
37 
38 
def main():
    """Process every pending row on a pool of 10 threads and print the elapsed time.

    Each row is handed to ``deal_func``.
    """
    data = get_data()
    s_time = time.time()
    # Leaving the ``with`` block already waits for every submitted task, so
    # the former explicit wait(...) call was redundant.
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_task = [executor.submit(deal_func, item) for item in data]
    # result() re-raises any exception that occurred inside a worker.
    for future in all_task:
        future.result()
    print(f'结束了: {time.time() - s_time}')
48 
49 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

 结果:

七、同步方式

测试一下同步的方式,用作对比

 1 import base64
 2 import time
 3 import requests
 4 from io import BytesIO
 5 from rosi.settings import MYSQL_CONFIG
 6 from rosi.db.mysqldb import MysqlDB
 7 
 8 
 9 def get_data():
10     db = MysqlDB(**MYSQL_CONFIG)
11     sql = "select * from rosi where image_b64=''"
12     data = db.find(sql, limit=1000)
13     print(len(data))
14     return data
15 
16 
def get_image_b64_by_requests(url, timeout=30):
    """Download ``url`` with ``requests`` and return the body base64-encoded.

    A timeout is now passed to ``requests.get``; without one, a stalled
    server can block the sequential loop indefinitely.

    :param url: address to download
    :param timeout: seconds passed to ``requests.get`` (default 30)
    :return: base64-encoded response body as ``bytes``
    :raises requests.RequestException: on connection errors or timeouts
    """
    response = requests.get(url, timeout=timeout)
    # b64encode accepts the body bytes directly; no BytesIO copy is needed.
    return base64.b64encode(response.content)
21 
22 
def update_data(data, id):
    """Write ``data`` into ``rosi.image_b64`` for the row whose ``id`` matches.

    A new ``MysqlDB`` object is built from ``MYSQL_CONFIG`` on every call.

    :param data: bytes-like value, decoded as UTF-8 before being stored
    :param id: value matched against the ``id`` column
    """
    # NOTE(review): the statement is assembled by string formatting; switch to
    # a parameterized query if MysqlDB.update supports one -- TODO confirm.
    connection = MysqlDB(**MYSQL_CONFIG)
    text = str(data, 'utf-8')
    connection.update(
        "update rosi set image_b64='{}' where id='{}'".format(text, id)
    )
28 
def main():
    """Process the pending rows one after another and print the elapsed time.

    For each row, element 1 is downloaded and stored under the id in
    element 0; elements 2 and 4 are printed on success. Any exception for a
    row is printed and the loop moves on to the next row.
    """
    def handle(row):
        # Download, store, then report one row.
        encoded = get_image_b64_by_requests(row[1])
        update_data(encoded, row[0])
        print(row[2], row[4])

    rows = get_data()
    started_at = time.time()
    for row in rows:
        try:
            handle(row)
        except Exception as err:
            print('*****************', str(err))
    elapsed = time.time() - started_at
    print(f'结束了: {elapsed}')
41 
42 
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()

结果:

 

posted @ 2021-12-29 15:11  luyizhou  阅读(58)  评论(0编辑  收藏  举报