Scrapy Middleware

Middleware flow (the default spider middleware template):

from scrapy import signals


class WxappSpiderMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response before it is handed to the spider
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results the spider returns, before they move on
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or process_spider_input) raises an exception
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the spider's start requests, before they are sent
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        # Called when the spider is opened
        spider.logger.info('Spider opened: %s' % spider.name)
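
To enable a spider middleware, register it in settings.py under SPIDER_MIDDLEWARES, the same pattern as the DOWNLOADER_MIDDLEWARES setting used later (the wxapp module path follows this example project; the number is the ordering priority):

SPIDER_MIDDLEWARES = {
   'wxapp.middlewares.WxappSpiderMiddleware': 543,
}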

Random request headers:


http://httpbin.org/user-agent    — check your own User-Agent

import random


# Request headers — a large list is available at
# http://useragentstring.com/pages/useragentstring.php?name=Chrome
class UserAgentDownloaderMiddleware(object):

    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2919.83 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2762.73 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
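
Note that process_request here returns None implicitly: in Scrapy's downloader-middleware contract, returning None means "continue processing this request" through the remaining middlewares and on to the downloader, while returning a Response or Request object would short-circuit that chain.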
        
 
settings configuration:
DOWNLOADER_MIDDLEWARES = {
   'wxapp.middlewares.UserAgentDownloaderMiddleware': 543,
}


spider configuration:

def parse(self, response):
    user_agent = json.loads(response.text)['user-agent']   # requires `import json`
    # Re-request the start URL; dont_filter=True disables the duplicate filter
    yield scrapy.Request(self.start_urls[0], dont_filter=True)
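
Putting the pieces together, a minimal verification spider might look like the sketch below (the spider name and log call are illustrative; any Scrapy project with the middleware enabled works):

import json

import scrapy


class UserAgentSpider(scrapy.Spider):
    # Hypothetical spider: httpbin.org/user-agent echoes back the User-Agent
    # header it received, so changing values confirm the middleware works.
    name = 'useragent'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        user_agent = json.loads(response.text)['user-agent']
        self.logger.info('Current User-Agent: %s' % user_agent)
        # Keep re-requesting the same URL to watch the header rotate
        yield scrapy.Request(self.start_urls[0], dont_filter=True)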
                         

IP proxy middleware:

When a CAPTCHA appears: 1. solve/recognize it   2. switch to another proxy

Proxy providers:
	Kuaidaili (快代理)

httpbin.org/ip      — shows the IP your requests currently come from (i.e. the proxy IP)


import random


class IPDownloaderMiddleware(object):
    # Prefer proxies that are high-anonymity + HTTPS-capable + stable
    PROXIES = [
        "http://ip:port",   # fill in real proxies; Scrapy expects a full URL with a scheme
        # ... (more proxies elided)
    ]

    def process_request(self, request, spider):
        # Route each request through a randomly chosen proxy
        proxy = random.choice(self.PROXIES)
        request.meta['proxy'] = proxy
        
        
settings configuration:

DOWNLOADER_MIDDLEWARES = {
   'wxapp.middlewares.IPDownloaderMiddleware': 543,
}
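
To confirm the proxy is actually in use, a throwaway spider against httpbin.org/ip (mentioned above) does the job; the names here are illustrative:

import json

import scrapy


class ProxyCheckSpider(scrapy.Spider):
    # Hypothetical spider: httpbin.org/ip returns the origin IP of the
    # request, which should now be the proxy's address, not your own.
    name = 'proxycheck'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/ip']

    def parse(self, response):
        origin = json.loads(response.text)['origin']
        self.logger.info('Requests are coming from: %s' % origin)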

Dedicated (private) proxy mode:

import base64


class IPDownloaderMiddleware(object):
    # Dedicated proxy: high-anonymity + HTTPS-capable + stable

    def process_request(self, request, spider):
        proxy = "ip:port"                  # your dedicated proxy
        user_password = "name:password"    # credentials from the provider
        b64_user = base64.b64encode(user_password.encode('utf-8'))
        request.meta['proxy'] = "http://" + proxy
        # Note the trailing space after "Basic" — without it the header is invalid
        request.headers['Proxy-Authorization'] = 'Basic ' + b64_user.decode('utf-8')
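
Alternatively, w3lib (installed with Scrapy) ships a helper that builds the same header and avoids the easy-to-miss space after "Basic"; the proxy and credentials below are the same placeholders as above:

from w3lib.http import basic_auth_header


class IPDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = "http://ip:port"
        # basic_auth_header returns b'Basic <base64 of name:password>'
        request.headers['Proxy-Authorization'] = basic_auth_header('name', 'password')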

Note:

Use extract() to pull content out of a selector (it returns a list of strings), then join and strip to get one clean string:
content = "".join(content).strip()
