Downloader middleware, selenium integration

Downloader middleware

"""
# The 2 kinds of middleware: downloader middleware and spider middleware

# 1 Write them in middlewares.py (the class names can be anything)
# 2 Enable them in settings.py
	SPIDER_MIDDLEWARES = {
   'cnblogs_crawl.middlewares.CnblogsCrawlSpiderMiddleware': 543,
}
	DOWNLOADER_MIDDLEWARES = {
   'cnblogs_crawl.middlewares.CnblogsCrawlDownloaderMiddleware': 543,
}

# 3 Downloader middleware hooks (see the sketch after this block)
	-process_request: (runs on the way out, before the request is downloaded)
        # - return None: keep processing this request; move on to the next middleware
        # - return a Response: this request is finished; hand the Response to the engine (you can fetch the page yourself and wrap it in a Response)
        # - return a Request: hand the Request back to the engine, which schedules it again
        # - raise an exception: process_exception runs
  -process_response: (runs on the way back, when the response comes in)
        # - return a Response object: keep processing this Response through the remaining middlewares
        # - return a Request object: hand it back to the engine for rescheduling
        # - or raise IgnoreRequest: process_exception runs
  -process_exception: (runs when an exception is raised)
        # - return None: continue processing this exception
        # - return a Response object: stops the process_exception() chain; the Response goes to the engine (and on to the spider)
        # - return a Request object: stops the process_exception() chain; the Request goes to the engine (rescheduled)
"""

class CnblogsCrawlDownloaderMiddleware(object):

  • Add cookies
"""

 def process_request(self, request, spider):

        # 1 Add cookies (request.cookies is the cookie dict sent to the site with this request)
        print(request.cookies)

        # Pull one from your cookie pool; you can overwrite the cookies here
        request.cookies = {'name': 'alen', 'age': 18}
        # Printing again shows the modified cookies
        print(request.cookies)

"""
  • Add a proxy
"""
# This assumes you have cloned the proxy pool project from GitHub and have it running locally

class CnblogsCrawlDownloaderMiddleware(object):


    def get_proxy(self):
        # ask the local proxy pool service for one proxy address
        import requests
        try:
            ret = requests.get('http://127.0.0.1:5010/get').json()['proxy']
        except:
            ret = requests.get('https://127.0.0.1:5010/get').json()['proxy']
        print(ret)
        return ret
   
    def process_request(self, request, spider):

        # 2 Add a proxy (Scrapy expects a full URL in meta['proxy'], so add the scheme if the pool returns a bare ip:port)
        proxy = self.get_proxy()
        if not proxy.startswith('http'):
            proxy = 'http://' + proxy
        request.meta['proxy'] = proxy
        print(request.meta['proxy'])
    
"""
  • Change the User-Agent
"""
    # also inside process_request
    from fake_useragent import UserAgent
    ua = UserAgent(verify_ssl=False)
    request.headers['User-Agent'] = ua.random
    print(request.headers)
"""

selenium integration

"""
# Open one Chrome browser when the spider starts, and reuse that single browser for all the crawling

# 1 Create the bro object in the spider
	bro = webdriver.Chrome(executable_path='/Users/liuqingzheng/Desktop/crawl/cnblogs_crawl/cnblogs_crawl/chromedriver')

# 2 Use it in the downloader middleware:
  spider.bro.get(request.url)
  text = spider.bro.page_source
  response = HtmlResponse(url=request.url, status=200, body=text.encode('utf-8'))
  return response
# 3 Close it, in the spider
    def close(self, reason):
        self.bro.close()
"""