scrapy设置代理的方法
方法一:
直接在spider文件中设置代理,通过Request的meta参数(meta['proxy'])传入代理地址
import scrapy


class MimvpSpider(scrapy.spiders.Spider):
    """Spider that requests the exist.php URLs on proxy.mimvp.com via a proxy.

    The proxy is chosen per request from the URL scheme and passed to
    Scrapy through ``meta['proxy']`` on each ``scrapy.Request``.
    """

    name = "mimvp"
    allowed_domains = ["mimvp.com"]
    start_urls = [
        "http://proxy.mimvp.com/exist.php",
        "https://proxy.mimvp.com/exist.php",
    ]

    # Proxy setup, approach 1: set the proxy directly on each Request.
    def start_requests(self):
        """Yield one Request per entry in ``start_urls`` with a proxy attached.

        URLs starting with ``http://`` and ``https://`` each get their own
        proxy address; any other URL gets an empty ``meta['proxy']`` value.
        """
        # Iterate the class-level list so the URLs are declared only once.
        for url in self.start_urls:
            meta_proxy = ""
            if url.startswith("http://"):
                meta_proxy = "http://180.96.27.12:88"  # proxy for http URLs
            elif url.startswith("https://"):
                meta_proxy = "http://109.108.87.136:53281"  # proxy for https URLs

            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={'proxy': meta_proxy},
            )

    def parse(self, response):
        """Print the requested URL and the returned body of ``response``."""
        mimvp_url = response.url  # URL that was requested for this response
        body = response.body  # returned page content

        print("mimvp_url : " + str(mimvp_url))
        print("body : " + str(body))
方法二:
利用中间件middlewares.py + settings.py配置
middlewares.py
# Proxy setup, approach 2: configure via middlewares.py + settings.py.
# mimvp custom by yourself
class ProxyMiddleware(object):
    """Downloader middleware that assigns a proxy to a request by URL scheme."""

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` based on the scheme of ``request.url``.

        Args:
            request: The outgoing request; its ``url`` is read and its
                ``meta`` mapping is updated in place.
            spider: The spider issuing the request (unused here).

        Returns:
            None. URLs that start with neither ``http://`` nor ``https://``
            are left untouched.
        """
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://180.96.27.12:88"  # proxy for http URLs
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "http://109.108.87.136:53281"  # proxy for https URLs

        # Proxy authentication (optional; needs `import base64` at file top).
        # b64encode returns bytes, so decode before joining with 'Basic '.
        # proxy_user_pass = "USERNAME:PASSWORD"
        # encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
        # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
settings.py
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# The smaller the order value, the earlier the middleware runs, so the
# custom ProxyMiddleware (100) runs before HttpProxyMiddleware (110).
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    'mimvp_proxy_python_scrapy.middlewares.ProxyMiddleware': 100,
}