在编写爬虫时,性能的消耗主要在IO请求中,当单进程单线程模式下请求URL时必然会引起等待,从而使得请求整体变慢
1.利用线程实现并发(io密集型用--http请求是io密集型)
线程开多了性能降低,线程上下文切换耗时多,可以实现并发,但是,请求发送出去后和返回之前,中间时期线程空闲
(1)编写方式一(多线程直接返回处理)
from concurrent.futures import ThreadPoolExecutor  # thread pool
import requests


def task(url):
    """Worker function: download one URL and report the result."""
    response = requests.get(url)
    print(url, response)


# Pool with at most 7 worker threads.
pool = ThreadPoolExecutor(7)

url_list = [
    'http://www.cnblogs.com/',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]

# Hand every URL to the pool; downloads run concurrently.
for url in url_list:
    pool.submit(task, url)

# Block until all submitted tasks have finished.
pool.shutdown(wait=True)
(2)编写方式二(多线程+回调函数处理)
第一个请求来第一个url过来执行task下载任务,当下载页面完成之后会有response,返回之后会执行add_done_callback的done方法,done方法里的参数就有task函数执行后返回过来的值
from concurrent.futures import ThreadPoolExecutor  # thread pool
import requests


def task(url):
    """Worker function: download one URL and hand the response back."""
    response = requests.get(url)
    return response


def done(future, *args, **kwargs):
    """Completion callback: receives the Future wrapping task()'s result."""
    response = future.result()  # the value task() returned
    print(response.status_code, response.content)


# Pool with at most 7 worker threads.
pool = ThreadPoolExecutor(7)

url_list = [
    'http://www.cnblogs.com/',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]

for url in url_list:
    # Submit the download, then attach done() to run when it completes.
    v = pool.submit(task, url)
    v.add_done_callback(done)

pool.shutdown(wait=True)
2.利用进程实现并发(计算密集型用进程)
可以实现并发,但是,请求发送出去后和返回之前,中间时期进程空闲
(1)编写方式一(多进程直接返回处理)
from concurrent.futures import ProcessPoolExecutor  # process pool
import requests


def task(url):
    """Worker function: download one URL and report the result."""
    response = requests.get(url)
    print(url, response)


# The __main__ guard is required for process pools: on spawn-based
# platforms (Windows, macOS) each worker re-imports this module, and
# without the guard the pool would be created recursively and raise.
if __name__ == '__main__':
    pool = ProcessPoolExecutor(2)  # pool with at most 2 worker processes

    url_list = [
        'http://www.cnblogs.com/',
        'http://huaban.com/favorite/beauty/',
        'http://www.bing.com',
        'http://www.zhihu.com',
        'http://www.sina.com',
        'http://www.baidu.com',
        'http://www.autohome.com.cn',
    ]

    # Hand every URL to the pool; downloads run in separate processes.
    for url in url_list:
        pool.submit(task, url)

    # Block until all submitted tasks have finished.
    pool.shutdown(wait=True)
(2)编写方式二(多进程+回调函数处理)
'''
As each URL's download task finishes, its Future completes and the
add_done_callback hook fires done(), whose argument wraps the value
that task() returned.
'''
from concurrent.futures import ProcessPoolExecutor  # process pool
import requests


def task(url):
    """Worker function: download one URL and hand the response back."""
    response = requests.get(url)
    return response


def done(future, *args, **kwargs):
    """Completion callback: receives the Future wrapping task()'s result."""
    response = future.result()  # the value task() returned
    print(response.status_code, response.content)


# The __main__ guard is required for process pools: on spawn-based
# platforms (Windows, macOS) each worker re-imports this module, and
# without the guard the pool would be created recursively and raise.
if __name__ == '__main__':
    pool = ProcessPoolExecutor(7)  # pool with at most 7 worker processes

    url_list = [
        'http://www.cnblogs.com/',
        'http://huaban.com/favorite/beauty/',
        'http://www.bing.com',
        'http://www.zhihu.com',
        'http://www.sina.com',
        'http://www.baidu.com',
        'http://www.autohome.com.cn',
    ]

    for url in url_list:
        # Submit the download, then attach done() to run when it completes.
        v = pool.submit(task, url)
        v.add_done_callback(done)

    pool.shutdown(wait=True)
通过上述代码均可以完成对请求性能的提高,对于多线程和多进程的缺点是在IO阻塞时会造成了线程和进程的浪费,所以异步IO会是首选:
(1)asyncio模块
方式一:asyncio只支持TCP请求,不支持Http请求
import asyncio


# NOTE: the original used @asyncio.coroutine + `yield from`, which was
# removed in Python 3.11; native `async def` / `await` is the supported form.
async def task():
    """One task: print, wait 5 s without blocking the event loop, print."""
    print('before...task......')
    await asyncio.sleep(5)  # suspends this task, lets the loop run others
    print('end...task......')


async def main():
    # Two tasks run concurrently in a single thread: total time is
    # about 5 seconds, not 10.
    await asyncio.gather(task(), task())


asyncio.run(main())
方式二:asyncio通过自己封装http数据包一个线程完成异步io操作,支持Http请求
import asyncio


# NOTE: the original used @asyncio.coroutine + `yield from`, which was
# removed in Python 3.11; native `async def` / `await` is the supported form.
async def task(host, url='/'):
    """Fetch http://<host><url> by hand-building an HTTP/1.0 request
    over a raw asyncio TCP connection (asyncio itself speaks only TCP)."""
    print('开始请求', host, url)
    reader, writer = await asyncio.open_connection(host, 80)  # open TCP connection
    # Minimal HTTP request: request line, Host header, blank line.
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')
    writer.write(request_header_content)
    await writer.drain()  # wait until the send buffer is flushed
    text = await reader.read()  # read until the server closes (HTTP/1.0)
    print('获取结果', host, url, text)
    writer.close()


async def main():
    await asyncio.gather(
        task('www.cnblogs.com', '/xixi/'),
        task('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091'),
    )


asyncio.run(main())
方式三:asyncio+aiohttp
安装pip3 install aiohttp
import aiohttp  # aiohttp: builds/parses the HTTP messages
import asyncio  # asyncio: drives the event loop


# NOTE: two modernizations over the original —
#  * @asyncio.coroutine + `yield from` was removed in Python 3.11;
#  * the module-level coroutine aiohttp.request('GET', url) is gone in
#    aiohttp 3; a ClientSession is the supported client API.
async def fetch_async(url):
    """Download one URL through aiohttp and report the response."""
    print(url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # the context managers release the response/session for us
            print(url, response)


async def main():
    await asyncio.gather(
        fetch_async('http://www.baidu.com/'),
        fetch_async('http://www.sina.com/'),
    )


asyncio.run(main())
方式四:asyncio+requests
安装pip3 install requests
import asyncio
import requests  # requests: blocking HTTP client


# NOTE: the original used @asyncio.coroutine + `yield from`, which was
# removed in Python 3.11; native `async def` / `await` is the supported form.
async def task(func, *args):
    """Run a blocking HTTP call (e.g. requests.get) without stalling the
    event loop by pushing it onto the default thread-pool executor."""
    print(func, args)
    loop = asyncio.get_running_loop()
    # e.g. executes requests.get('http://www.cnblogs.com/xixi/') in a thread
    response = await loop.run_in_executor(None, func, *args)
    print(response.url, response.content)


async def main():
    # Each entry passes the callable plus its argument into task().
    await asyncio.gather(
        task(requests.get, 'http://www.cnblogs.com/xixi/'),
        task(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091'),
    )


asyncio.run(main())
(2)gevent模块:
方式一:gevent依赖greenlet协程模块+异步IO
安装pip3 install greenlet
安装pip3 install gevent
# monkey.patch_all() must run BEFORE importing requests: it rewrites the
# blocking socket/ssl primitives into cooperative (async-IO) versions, and
# gevent warns/misbehaves if ssl has already been imported by urllib3.
from gevent import monkey
monkey.patch_all()

import gevent
import requests


def task(method, url, req_kwargs):
    """Issue one HTTP request (now cooperative thanks to the patching)."""
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# Spawn one greenlet per request and wait for all of them to finish.
gevent.joinall([
    gevent.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
    gevent.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    gevent.spawn(task, method='get', url='https://github.com/', req_kwargs={}),
])
方式二:gevent(协程池,最多发多少个请求)+requests
安装pip3 install greenlet
安装pip3 install gevent
# monkey.patch_all() must run BEFORE importing requests: it rewrites the
# blocking socket/ssl primitives into cooperative (async-IO) versions, and
# gevent warns/misbehaves if ssl has already been imported by urllib3.
from gevent import monkey
monkey.patch_all()

import gevent
import requests
from gevent.pool import Pool


def task(method, url, req_kwargs):
    """Issue one HTTP request (now cooperative thanks to the patching)."""
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)


# A pool caps concurrency: at most 5 requests are in flight at once.
pool = Pool(5)
gevent.joinall([
    pool.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
    pool.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    pool.spawn(task, method='get', url='https://www.github.com/', req_kwargs={}),
])
方式三:
安装pip3 install grequests
import grequests  # gevent + requests wrapper that performs the downloads

# Build the (not yet sent) request objects.
request_list = [
    grequests.get('https://www.python.org', timeout=0.001),
    grequests.get('http://www.baidu.com/'),
    grequests.get('http://httpbin.org/status/500'),
]

# Send them all concurrently (at most 5 in flight) and gather the responses.
response_list = grequests.map(request_list, size=5)
print(response_list)
(3)Twisted
(4)Tornado
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

COUNT = 0  # number of requests still outstanding


def handle_response(future):
    """Runs when one fetch finishes; stops the IOLoop after the last one.

    NOTE: Tornado 6 removed the ``callback`` argument of ``fetch``; the
    call now returns a Future, so this callback receives the Future and
    must unwrap it with ``result()`` (which re-raises any HTTP error).
    """
    global COUNT
    COUNT -= 1
    try:
        response = future.result()
        print(response.body)
    except Exception as e:
        print("Error:", e)
    # Only stop the loop once every request has been answered.
    if COUNT == 0:
        ioloop.IOLoop.current().stop()


def func():
    """Kick off all fetches from inside the running IOLoop."""
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    global COUNT
    COUNT = len(url_list)
    http_client = AsyncHTTPClient()
    for url in url_list:
        print(url)
        # fetch() returns a Future; chain the completion callback onto it.
        http_client.fetch(HTTPRequest(url)).add_done_callback(handle_response)


ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()  # runs until handle_response() calls stop()
以上均是Python内置以及第三方模块提供异步IO请求模块,使用简便大大提高效率,而对于异步IO请求的本质则是【非阻塞Socket】+【IO多路复用】
3.自定义异步IO模块(自定义socket客户端)
1)标准HTTP请求本质,阻塞
import socket

sk = socket.socket()

# 1. connect — blocks (IO wait) until the TCP handshake completes
sk.connect(('www.baidu.com', 80,))
print('连接成功了...')

# 2. send a raw HTTP request; headers and body are separated by a blank line
# sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')  # GET request
sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')  # POST request

# 3. wait for the server's response — blocks (IO wait)
data = sk.recv(8096)
print(data)

# 4. close the connection
sk.close()
2)HTTP请求本质,非阻塞
import select
import socket

sk = socket.socket()
sk.setblocking(False)  # non-blocking: calls raise instead of waiting

# 1. connect — in non-blocking mode connect() raises BlockingIOError
#    immediately while the handshake continues in the background.
try:
    sk.connect(('www.baidu.com', 80,))
    print('连接成功了...')
except BlockingIOError as e:
    print(e)

# 2. wait until the socket is writable (handshake finished) before
#    sending. The original sent straight away, which raises OSError
#    because the connection is not yet established.
select.select([], [sk], [])
sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')

# 3. wait until data has arrived before reading. The original called
#    recv() immediately, which raises BlockingIOError on an empty buffer.
select.select([sk], [], [])
data = sk.recv(8096)
print(data)

# close the connection
sk.close()
3)自定义非阻塞select+socket完成异步IO,通过一个线程向很多地方把请求发出去
import socket
import select  # multiplex over many sockets in a single thread


class HttpRequest:
    """Bundles one socket with its host name and per-host callback."""

    def __init__(self, sk, host, callback):
        self.socket = sk
        self.host = host
        self.callback = callback

    def fileno(self):
        # select() accepts any object that exposes fileno()
        return self.socket.fileno()


class HttpResponse:
    """Splits a raw HTTP response into a header dict and a body."""

    def __init__(self, recv_data):
        self.recv_data = recv_data
        self.header_dict = {}
        self.body = None
        # parse immediately so header_dict/body are ready for callbacks
        self.initialize()

    def initialize(self):
        # Headers and body are separated by the first blank line.
        headers, body = self.recv_data.split(b'\r\n\r\n', 1)
        self.body = body
        header_list = headers.split(b'\r\n')
        for h in header_list:
            h_str = str(h, encoding='utf-8')
            v = h_str.split(':', 1)
            if len(v) == 2:  # skip the status line, keep "Name: value" pairs
                self.header_dict[v[0]] = v[1]


class AsyncRequest:
    """Single-threaded async HTTP client: non-blocking sockets + select()."""

    def __init__(self):
        self.conn = []        # requests still waiting for a response; empty => done
        self.connection = []  # requests whose connect has not completed yet

    def add_request(self, host, callback):
        """Start a non-blocking connect to host:80 and register it."""
        try:
            sk = socket.socket()
            sk.setblocking(0)  # non-blocking mode
            sk.connect((host, 80,))
        except BlockingIOError as e:
            # expected: non-blocking connect finishes in the background
            pass
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        """Event loop: send requests as sockets become writable, collect
        responses as they become readable, stop when all have finished."""
        while True:
            rlist, wlist, elist = select.select(self.conn, self.connection, self.conn, 0.05)

            # writable => connect completed; send the HTTP request once
            for w in wlist:
                print(w.host, '连接成功...')
                tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % (w.host,)
                w.socket.send(bytes(tpl, encoding='utf-8'))
                self.connection.remove(w)  # no longer waiting to connect

            # readable => response data arrived; drain the socket buffer
            for r in rlist:
                recv_data = bytes()
                while True:
                    try:
                        chunck = r.socket.recv(8096)
                    except Exception as e:
                        # BlockingIOError: buffer drained for now
                        break
                    if not chunck:
                        # BUGFIX: recv() returns b'' once the peer closes;
                        # without this check the loop spins forever on b''.
                        break
                    recv_data += chunck
                response = HttpResponse(recv_data)
                r.callback(response)
                r.socket.close()  # HTTP/1.0: done with this connection
                self.conn.remove(r)

            if len(self.conn) == 0:  # every request answered => stop looping
                break


def f1(response):
    print('保存到文件', response.header_dict)


def f2(response):
    print('保存到数据库', response.header_dict)


# Each entry pairs a host with the callback that consumes its response.
url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
    {'host': 'www.sina.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    # One non-blocking socket is created per host.
    req.add_request(item['host'], item['callback'])
req.run()