Requesting Baidu with Multithreading, Multiprocessing, Coroutines, and IO Multiplexing
I've recently been learning about multithreading, multiprocessing, coroutines, and IO multiplexing. Which of these is fastest for fetching data? Let's run a quick benchmark: each version makes the same 5 search requests to Baidu and we time the total.
Plain (blocking) version: 5 requests to Baidu
import socket
import time
import socks

socks.set_default_proxy(socks.HTTP, addr='192.168.105.71', port=80)  # configure the proxy through PySocks
socket.socket = socks.socksocket  # apply the proxy to every socket


def blocking(wd):
    sock = socket.socket()
    sock.connect(('www.baidu.com', 80))  # connect to Baidu
    request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format('/s?wd={}'.format(wd))  # build the HTTP request
    response = b''  # buffer for the response
    sock.send(request.encode())  # send the HTTP request
    chunk = sock.recv(1024)  # read up to 1024 bytes at a time
    while chunk:  # an empty chunk means the server has finished sending
        response += chunk  # append to the buffer
        chunk = sock.recv(1024)
    # print(response.decode())
    return response


def blocking_way():
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    for item in search_list:
        blocking(item)


if __name__ == '__main__':
    start_time = time.time()
    blocking_way()
    print('Total time for 5 requests to Baidu: {}s'.format(round(time.time() - start_time, 2)))
Typical result over several runs:
Total time for 5 requests to Baidu: 4.24s
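Since the requests run strictly one after another, the total is roughly the sum of five round-trip times. A minimal sketch (reusing blocking() and the proxy setup above) that times each request on its own to confirm this:

# Time each request individually; the five numbers should add up to
# roughly the 4.24s total measured above.
for item in ['python', 'java', 'C++', 'Ruby', 'Go']:
    t = time.time()
    blocking(item)
    print('{}: {}s'.format(item, round(time.time() - t, 2)))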
Multithreaded version
import socket
import time
import socks
from multiprocessing.pool import ThreadPool

socks.set_default_proxy(socks.HTTP, addr='192.168.105.71', port=80)  # configure the proxy through PySocks
socket.socket = socks.socksocket  # apply the proxy to every socket


def blocking(wd):
    sock = socket.socket()
    sock.connect(('www.baidu.com', 80))  # connect to Baidu
    request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format('/s?wd={}'.format(wd))  # build the HTTP request
    response = b''  # buffer for the response
    sock.send(request.encode())  # send the HTTP request
    chunk = sock.recv(1024)  # read up to 1024 bytes at a time
    while chunk:  # an empty chunk means the server has finished sending
        response += chunk  # append to the buffer
        chunk = sock.recv(1024)
    # print(response.decode())
    return response


def blocking_way():
    # multithreading
    pool = ThreadPool(5)  # thread pool with 5 worker threads
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    for i in search_list:
        pool.apply_async(blocking, args=(i,))  # submit a task to the pool
    pool.close()  # stop accepting new tasks
    pool.join()  # wait for all tasks to finish


if __name__ == '__main__':
    start_time = time.time()
    blocking_way()
    print('Total time for 5 requests to Baidu: {}s'.format(round(time.time() - start_time, 2)))
Typical result over several runs:
Total time for 5 requests to Baidu: 1.0s
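For reference, the same thread-pool idea can be written with the standard-library concurrent.futures module; this is a minimal sketch under the same setup, not benchmarked here. One design note: apply_async silently discards worker exceptions unless you keep the AsyncResult and call get(), whereas future.result() re-raises them.

from concurrent.futures import ThreadPoolExecutor

def blocking_way():
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    with ThreadPoolExecutor(max_workers=5) as executor:  # pool of 5 threads
        futures = [executor.submit(blocking, wd) for wd in search_list]
        for future in futures:
            future.result()  # blocks until done and re-raises any worker exception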
Multiprocess version
import socket
import time
import socks
from multiprocessing import Pool

socks.set_default_proxy(socks.HTTP, addr='192.168.105.71', port=80)  # configure the proxy through PySocks
socket.socket = socks.socksocket  # apply the proxy to every socket


def blocking(wd):
    sock = socket.socket()
    sock.connect(('www.baidu.com', 80))  # connect to Baidu
    request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format('/s?wd={}'.format(wd))  # build the HTTP request
    response = b''  # buffer for the response
    sock.send(request.encode())  # send the HTTP request
    chunk = sock.recv(1024)  # read up to 1024 bytes at a time
    while chunk:  # an empty chunk means the server has finished sending
        response += chunk  # append to the buffer
        chunk = sock.recv(1024)
    # print(response.decode())
    return response


def blocking_way():
    # multiprocessing
    pool = Pool(5)  # process pool with 5 worker processes
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    for i in search_list:
        pool.apply_async(blocking, args=(i,))  # submit a task to the pool
    pool.close()  # stop accepting new tasks
    pool.join()  # wait for all tasks to finish


if __name__ == '__main__':
    start_time = time.time()
    blocking_way()
    print('Total time for 5 requests to Baidu: {}s'.format(round(time.time() - start_time, 2)))
Typical result over several runs:
Total time for 5 requests to Baidu: 1.52s
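A design note on apply_async in this version: if a worker process raises (for example, the proxy is unreachable), the error is silently dropped because get() is never called. A minimal alternative sketch (Python 3.3+ for the Pool context manager) using pool.map, which blocks until all workers finish and re-raises the first worker exception:

def blocking_way():
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    with Pool(5) as pool:  # 5 worker processes
        pool.map(blocking, search_list)  # blocks until all finish; re-raises worker errors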
Coroutine version (gevent)
from gevent import monkey; monkey.patch_socket()  # must run first, before the socket module is used anywhere
import socket
import time
import socks
import gevent

socks.set_default_proxy(socks.HTTP, addr='192.168.105.71', port=80)  # configure the proxy through PySocks
socket.socket = socks.socksocket  # apply the proxy to every socket


def blocking(wd):
    sock = socket.socket()
    sock.connect(('www.baidu.com', 80))  # connect to Baidu
    request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format('/s?wd={}'.format(wd))  # build the HTTP request
    response = b''  # buffer for the response
    sock.send(request.encode())  # send the HTTP request
    chunk = sock.recv(1024)  # read up to 1024 bytes at a time
    while chunk:  # an empty chunk means the server has finished sending
        response += chunk  # append to the buffer
        chunk = sock.recv(1024)
    # print(response.decode())
    return response


def blocking_way():
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    tasks = [gevent.spawn(blocking, i) for i in search_list]  # one greenlet per keyword
    gevent.joinall(tasks)  # wait for all greenlets to finish


if __name__ == '__main__':
    start_time = time.time()
    blocking_way()
    print('Total time for 5 requests to Baidu: {}s'.format(round(time.time() - start_time, 2)))
Typical result over several runs:
Total time for 5 requests to Baidu: 1.02s
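Note the import order here: monkey.patch_socket() runs before import socks, so PySocks builds its socksocket on top of gevent's cooperative socket; swapping those lines would leave the requests blocking and effectively serial again. When you raise the request count well past 5, gevent.pool.Pool caps how many greenlets run at once. A minimal sketch, assuming the same setup as above:

from gevent.pool import Pool as GeventPool

def blocking_way():
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    pool = GeventPool(5)  # at most 5 greenlets in flight at a time
    for wd in search_list:
        pool.spawn(blocking, wd)
    pool.join()  # wait for every greenlet to finish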
IO multiplexing version (selectors)
import socks
import time
import socket
import selectors

socks.set_default_proxy(socks.HTTP, addr='192.168.105.71', port=80)  # configure the proxy through PySocks
socket.socket = socks.socksocket  # apply the proxy to every socket

selector = selectors.DefaultSelector()  # event selector
flag = True  # keeps the event loop running
times = 5  # decremented once per completed request; at 0, all 5 are done and the loop stops


class Crawler():
    def __init__(self, wd):
        self.response = b''  # buffer for the response
        self.wd = wd  # search keyword

    def fetch(self):
        '''Create a non-blocking client socket, start connecting to Baidu,
        and register the callback to run once the connection succeeds.'''
        client = socket.socket()
        client.setblocking(False)
        try:
            client.connect(('www.baidu.com', 80))  # non-blocking connect; the selector signals completion
        except BlockingIOError:
            pass
        selector.register(client, selectors.EVENT_WRITE, self.send_request)

    def send_request(self, client):
        '''Once connected, send the request to Baidu and register the callback
        for when the response arrives.'''
        selector.unregister(client)  # drop the old registration so the socket can be watched for a different event
        request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format('/s?wd={}'.format(self.wd))  # build the HTTP request
        client.send(request.encode())
        selector.register(client, selectors.EVENT_READ, self.get_response)  # call get_response when Baidu replies

    def get_response(self, client):
        '''Called every time data arrives, so no while loop is needed.'''
        global flag
        global times
        data = client.recv(1024)  # at most 1024 bytes per read; larger responses arrive over several events
        if data:
            self.response += data  # append to the buffer
        else:  # an empty read means the response is complete
            # print(self.response.decode())
            client.close()
            selector.unregister(client)
            times -= 1  # one more request finished
            if times == 0:  # all 5 requests done: stop the event loop
                flag = False


def loop():
    '''The event loop.'''
    while flag:
        events = selector.select()
        for key, mask in events:
            callback = key.data
            callback(key.fileobj)


if __name__ == '__main__':
    start_time = time.time()
    search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
    for item in search_list:
        crawler = Crawler(item)
        crawler.fetch()
    loop()
    print('Total time for 5 requests to Baidu: {}s'.format(round(time.time() - start_time, 2)))
Typical result over several runs:
Total time for 5 requests to Baidu: 1.17s
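selectors.DefaultSelector picks the most efficient multiplexing mechanism the platform offers (epoll on Linux, kqueue on BSD/macOS, falling back to select elsewhere), so the same code gets the best backend available. A quick way to check which one you got:

import selectors
print(type(selectors.DefaultSelector()).__name__)  # e.g. 'EpollSelector' on Linux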
Try raising the request count and running the benchmark a few more times!
Overall, the coroutine and multithreaded versions take the least time, which makes them the better fit for crawlers.
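As suggested above, here is a minimal sketch for scaling the benchmark up, reusing blocking() and ThreadPool from the multithreaded version; the worker count of 20 is an arbitrary choice for illustration:

search_list = ['python', 'java', 'C++', 'Ruby', 'Go'] * 20  # 100 requests in total

start_time = time.time()
pool = ThreadPool(20)  # more workers for the larger batch
for wd in search_list:
    pool.apply_async(blocking, args=(wd,))
pool.close()
pool.join()
print('Total time for {} requests: {}s'.format(len(search_list), round(time.time() - start_time, 2)))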