Web Crawlers (Part 2)
Performance
When writing a crawler, most of the time is spent on I/O: in single-process, single-thread mode every URL request blocks while waiting for the response, which slows down the whole run.
The serial baseline: each request must finish before the next one starts.

import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.bing.com']

for url in url_list:
    fetch_async(url)
Multi-threaded version using a thread pool:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.bing.com']

pool = ThreadPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
Thread pool plus a callback that runs when each task completes:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    # runs automatically when the future completes;
    # future.result() returns fetch_async's return value
    print(future.result())

url_list = ['http://www.github.com', 'http://www.bing.com']

pool = ThreadPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
Multi-process version using a process pool:

from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

url_list = ['http://www.github.com', 'http://www.bing.com']

# note: on Windows (and macOS with the spawn start method), module-level
# pool code should be guarded with `if __name__ == '__main__':`
pool = ProcessPoolExecutor(5)
for url in url_list:
    pool.submit(fetch_async, url)
pool.shutdown(wait=True)
Process pool plus a completion callback:

from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_async(url):
    response = requests.get(url)
    return response

def callback(future):
    print(future.result())

url_list = ['http://www.github.com', 'http://www.bing.com']

pool = ProcessPoolExecutor(5)
for url in url_list:
    v = pool.submit(fetch_async, url)
    v.add_done_callback(callback)
pool.shutdown(wait=True)
Any of the approaches above improves request throughput. The drawback of multi-threading and multi-processing is that threads and processes sit idle while blocked on I/O, so asynchronous I/O should be the first choice.
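For comparison, here is a minimal coroutine-based sketch of the same fetches. It assumes the third-party aiohttp package (not part of the original examples; install with `pip3 install aiohttp`) and Python 3.7+:

import asyncio
import aiohttp  # assumption: third-party package, not used in the original text

async def fetch_async(url, session):
    # the coroutine yields control while waiting on the network, so a
    # single thread can drive many requests concurrently
    async with session.get(url) as response:
        return await response.text()

async def main():
    url_list = ['http://www.github.com', 'http://www.bing.com']
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch_async(url, session) for url in url_list))
        for url, body in zip(url_list, results):
            print(url, len(body))

asyncio.run(main())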
A custom asynchronous I/O request module
Principle: non-blocking sockets + I/O multiplexing
import socket
import select


class HttpRequest:
    # wraps the socket together with its host and the matching callback
    def __init__(self, sk, host, callback):
        self.socket = sk
        self.host = host
        self.callback = callback

    def fileno(self):
        return self.socket.fileno()


class AsyncRequest:
    def __init__(self):
        self.conn = []        # watched for readable data
        self.connection = []  # watched for connection establishment

    def add_request(self, host, callback):
        sk = socket.socket()
        sk.setblocking(False)  # non-blocking: connect() just fires off the attempt
        try:
            sk.connect((host, 80))
        except BlockingIOError:
            # a non-blocking connect() raises while the handshake
            # is still in progress; this is expected
            pass
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        while True:
            # select() does not require real socket objects: anything with a
            # fileno() method returning a file descriptor works, which is why
            # we can pass wrappers that also carry the host and callback
            # (this is the key trick).
            # rlist: objects with data ready to read
            # wlist: sockets whose connection has been established
            rlist, wlist, elist = select.select(self.conn, self.connection, [], 0.05)
            for w in wlist:
                print('%s - connection established' % w.host)
                tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % w.host  # raw HTTP request
                w.socket.send(bytes(tpl, encoding='utf8'))          # send it
                self.connection.remove(w)  # stop watching for writability
            for r in rlist:
                print('%s - receiving data' % r.host)
                recv_data = bytes()
                while True:  # read in a loop
                    try:
                        chunk = r.socket.recv(8096)
                        if not chunk:  # b'' means the server closed the connection
                            break
                        recv_data += chunk
                    except BlockingIOError:
                        # a non-blocking recv() raises when no buffered data
                        # is left (treated here as end of response)
                        break
                r.callback(recv_data)  # response complete: hand it to the callback
                r.socket.close()
                self.conn.remove(r)
            if len(self.conn) == 0:
                # nothing left to read: stop the event loop
                break


def f1(data):
    print('write to file')


def f2(data):
    print('write to database')


url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'www.qq.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    req.add_request(item['host'], item['callback'])  # create socket, start connecting
req.run()  # event loop: send requests, read responses, run callbacks
Scrapy
Docs: http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html
# CentOS
pip3 install Incremental -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
pip3 install -i http://pypi.douban.com/simple/ Twisted --trusted-host pypi.douban.com
pip3 install -i http://pypi.douban.com/simple/ scrapy --trusted-host pypi.douban.com

# Windows
# first download Twisted-17.1.0-cp35-cp35m-win_amd64.whl
pip3 install Twisted-17.1.0-cp35-cp35m-win_amd64.whl
pip3 install scrapy
Then install pywin32 (needed by Scrapy on Windows).
No worries ~ after enough of this pain you get used to it.
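Once the install succeeds, a minimal spider can confirm the setup works. This is only an illustrative sketch, not from the original text: the spider name, file name, and start URL below are placeholders.

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'                        # placeholder spider name
    start_urls = ['http://cn.bing.com']  # placeholder URL

    def parse(self, response):
        # called once the page has been downloaded;
        # print the URL and the size of the response body
        print(response.url, len(response.body))

Save it as demo_spider.py and run it with `scrapy runspider demo_spider.py`.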