应对反爬虫的措施
1)动态修改User-Agent
2)动态修改ip
3)设置下载延迟:DOWNLOAD_DELAY = 0.5
1)在middlewares.py中新建一个类,并从fake_useragent中导入UserAgent类
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    """Downloader middleware that gives requests a random User-Agent header.

    The header is written with ``setdefault``, so a request that already
    carries a User-Agent keeps it; a random value is only filled in when the
    header is absent.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler passed to this hook."""
        return cls(crawler)

    def __init__(self, crawler):
        """Create the ``UserAgent`` object used as the source of header values.

        Args:
            crawler: The object forwarded by ``from_crawler``. It is accepted
                to keep the constructor signature but is not used here.
        """
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()

    def process_request(self, request, spider):
        """Fill in a random User-Agent on ``request`` if none is set yet.

        Args:
            request: The outgoing request whose headers are updated in place.
            spider: The spider issuing the request (unused).
        """
        # A fresh value is drawn for every request, even when the header
        # turns out to be present already and the value is discarded.
        random_agent = self.ua.random
        request.headers.setdefault(b'User-Agent', random_agent)

    def spider_opened(self, spider):
        """Do nothing; nothing in this class connects this hook to a signal."""
        return None
在settings设置DOWNLOADER_MIDDLEWARES
先把Scrapy自带的UserAgentMiddleware禁用(将其值设为None):
# Downloader middleware registration.
# The built-in User-Agent middleware is switched off by mapping it to None.
# It must be addressed by its current path: the legacy
# ``scrapy.contrib.downloadermiddleware.useragent`` location was relocated in
# Scrapy 1.0, and a path Scrapy does not recognise disables nothing, so the
# built-in middleware would set the header first and the random one (which
# only fills in a missing header) would never take effect.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'JobboleSpider.middlewares.RandomUserAgentMiddleware': 543,
}
2)动态修改IP:新建一个代理中间件,每次请求随机选取一个代理IP(该中间件同样需要在settings.py的DOWNLOADER_MIDDLEWARES中启用)
import random


class RandomProxyIPMiddleware(object):
    """Downloader middleware that assigns a random proxy to each request.

    The candidate proxies live in ``self.ip_list``; one is picked with
    ``random.choice`` and stored under ``request.meta['proxy']``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler passed to this hook."""
        return cls(crawler)

    def __init__(self, crawler):
        """Set up the list of proxy URLs to choose from.

        Args:
            crawler: The object forwarded by ``from_crawler``. It is accepted
                to keep the constructor signature but is not used here.
        """
        self.ip_list = [
            "http://180.125.196.155:8888",  # proxy address
        ]

    def process_request(self, request, spider):
        """Store a randomly chosen proxy in ``request.meta['proxy']``.

        Args:
            request: The outgoing request whose ``meta`` is updated in place.
            spider: The spider issuing the request (unused).

        Returns:
            None. When ``ip_list`` is empty the request is left untouched.
        """
        # random.choice raises IndexError on an empty sequence, which would
        # fail every request; with no proxies configured, leave meta alone.
        if not self.ip_list:
            return None
        request.meta['proxy'] = random.choice(self.ip_list)
        return None

    def spider_opened(self, spider):
        """Do nothing; nothing in this class connects this hook to a signal."""
        pass
3)在settings中设置延迟
# Scrapy download-delay setting, used here to throttle the crawl; per the
# Scrapy settings reference the value is expressed in seconds.
DOWNLOAD_DELAY = 0.5