Scrapy下载器中间件用法示例
1.爬虫文件httpbin.py
# -*- coding: utf-8 -*-
import scrapy


class HttpbinSpider(scrapy.Spider):
    """Spider that requests http://httpbin.org/get and logs the response."""

    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/get']

    def parse(self, response):
        """Log the response body and its status code at DEBUG level.

        Args:
            response: The response object passed to the spider; its
                ``text`` and ``status`` attributes are read.
        """
        self.logger.debug(response.text)
        # Hand the status to the logger as an argument so the message is
        # only formatted when DEBUG output is actually emitted.
        self.logger.debug('Status Code: %s', response.status)
2.中间件文件middlewares.py
不在settings.py文件中设置请求头,而是在下载器中间件中设置,也就是在请求(request)发出之前为其加上请求头。
另外,在收到响应(response)之后,将其状态码改为201。
import random

from scrapy import Request


class RandomUserAgentMiddleware:
    """Downloader middleware that randomizes the User-Agent header.

    It also overwrites the status of every response it sees with 201.
    """

    def __init__(self):
        """Create the pool of User-Agent strings to choose from."""
        self.user_agents = [
            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
        ]

    def process_request(self, request, spider):
        """Set the request's User-Agent header to a randomly chosen entry.

        Args:
            request: The outgoing request whose headers are modified in place.
            spider: Unused here.

        Returns:
            None (implicitly); the request itself is mutated.
        """
        chosen_agent = random.choice(self.user_agents)
        request.headers['User-Agent'] = chosen_agent

    def process_response(self, request, response, spider):
        """Overwrite the response status with 201 and return the response.

        Args:
            request: Unused here.
            response: The response whose ``status`` attribute is reassigned.
            spider: Unused here.

        Returns:
            The same response object, with ``status`` set to 201.
        """
        response.status = 201
        return response
3.配置文件settings.py
在配置文件中启用该下载器中间件:
# Enable the custom downloader middleware. The key is the middleware's
# import path; the integer is the value Scrapy's DOWNLOADER_MIDDLEWARES
# setting uses to order middlewares.
DOWNLOADER_MIDDLEWARES = {
    'scrapydownloadertest.middlewares.RandomUserAgentMiddleware': 543,
}