Configuring a user-agent middleware and an IP proxy middleware in Scrapy
Add the following to middlewares.py:
# Uses the fake_useragent library
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    # Randomly rotates the User-Agent header on every request

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        # Read the UA type ('random', 'chrome', 'firefox', ...) from the settings
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # Look up the attribute named by RANDOM_UA_TYPE on the UserAgent instance
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
# Uses the Alibaba Cloud IP proxy service
from myscrapy.aliproxy import get_proxy_ip


class ProxyMiddleware(object):
    # Route every outgoing request through a proxy fetched from the service

    def process_request(self, request, spider):
        # Scrapy expects a full proxy URL with scheme (e.g. 'http://ip:port');
        # prepend the scheme if the API returns a bare address
        request.meta['proxy'] = get_proxy_ip()
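Note that ProxyMiddleware above calls the proxy API once per outgoing request, which is slow and burns quota on a paid service. One option is to cache the proxy for a short window; a minimal sketch of that idea (the 60-second window and the class name CachedProxyMiddleware are illustrative assumptions, not part of the original setup):

import time

from myscrapy.aliproxy import get_proxy_ip


class CachedProxyMiddleware(object):
    # Illustrative variant: reuse one proxy for CACHE_SECONDS before
    # fetching a fresh one, instead of hitting the paid API per request.
    CACHE_SECONDS = 60  # assumed refresh window; tune to your proxy's lifetime

    def __init__(self):
        self._proxy = None
        self._fetched_at = 0.0

    def process_request(self, request, spider):
        now = time.time()
        if self._proxy is None or now - self._fetched_at > self.CACHE_SECONDS:
            self._proxy = get_proxy_ip()
            self._fetched_at = now
        request.meta['proxy'] = self._proxy

If you adopt this variant, register it in DOWNLOADER_MIDDLEWARES below in place of ProxyMiddleware.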
Enable the middlewares in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
    'myscrapy.middlewares.RandomUserAgentMiddleware': 0,
    'myscrapy.middlewares.ProxyMiddleware': 1,
}
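RandomUserAgentMiddleware reads RANDOM_UA_TYPE from the settings, so you can also pin the rotation to one browser family here; any attribute that fake_useragent exposes works ('random', 'chrome', 'firefox', 'ie', ...). For example:

# settings.py — which fake_useragent attribute to use; 'random' picks any browser
RANDOM_UA_TYPE = 'random'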
Wrapping the Alibaba Cloud IP proxy service:
Create a new file aliproxy.py in the project root with the content below. (Adapt the request to whatever proxy pool you actually use.)
import json
import urllib.request


def get_proxy_ip():
    host = 'http://zip.market.alicloudapi.com'
    path = '/devtoolservice/ipagency'
    appcode = 'xxxxxxxxxxxx'  # your Alibaba Cloud AppCode
    querys = 'foreigntype=0&protocol=0'
    url = host + path + '?' + querys

    request = urllib.request.Request(url)
    # The API authenticates via an 'APPCODE <code>' Authorization header
    request.add_header('Authorization', 'APPCODE ' + appcode)
    response = urllib.request.urlopen(request)
    content = response.read()
    if content:
        load = json.loads(content.decode('utf-8'))
        # The first result's 'address' field holds the proxy address
        return load['result'][0]['address']
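Before wiring the module into Scrapy, a quick manual check is to run it directly (assuming a valid AppCode has been filled in above):

if __name__ == '__main__':
    # Should print one proxy address fetched from the API
    print(get_proxy_ip())

For an end-to-end check of the whole setup, point any spider at https://httpbin.org/ip; the echoed IP in the response should match the proxy address rather than your own.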