selenium在scrapy中的使用流程
# 当前爬虫的所有请求复用同一个 selenium 浏览器实例（webdriver 作为类属性只创建一次）
1、在爬虫中初始化webdriver对象
import scrapy
from selenium import webdriver


class CnblogSpider(scrapy.Spider):
    """Spider for www.cnblogs.com that drives pages through selenium.

    The webdriver is created once as a class attribute, so every request
    handled by this spider reuses the same browser instance; the browser
    is shut down when the spider closes.
    """

    name = 'cnblog'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    # Initialize the webdriver once on the spider (shared by all requests).
    # NOTE(review): executable_path= is deprecated in selenium 4 — confirm
    # the installed version; on selenium 4 use Service(executable_path=...).
    bro = webdriver.Chrome(executable_path='../chromedriver.exe')

    def parse(self, response):
        # The middleware substitutes the selenium-rendered response, so by
        # the time parse() runs this is a normal scrapy Response.
        print(response.status)

    def close(self, reason):
        """Shut down the shared browser when the spider closes."""
        print("我结束了")
        # quit(), not close(): close() only closes the current window and
        # would leak the chromedriver process; quit() terminates the whole
        # browser session and the driver.
        self.bro.quit()
2、在中间件中使用(process_request)
def process_request(self, request, spider):
    """Fetch *request* with the spider's shared selenium webdriver and
    return the rendered page as an HtmlResponse.

    Returning a Response object from a downloader middleware short-circuits
    scrapy's own downloader: this response is handed straight to the spider.
    """
    # Local import keeps the middleware file free of a top-level dependency
    # on scrapy.http when the middleware is disabled.
    from scrapy.http import HtmlResponse

    # Reuse the single webdriver created on the spider (class attribute)
    # instead of launching a new browser per request.  Navigating to
    # request.url (rather than a hard-coded address) lets the middleware
    # serve every request scrapy schedules.
    spider.bro.get(request.url)
    print(spider.bro.page_source)

    # A downloader middleware must return a Response object (or None) —
    # build one from the selenium-rendered page source.
    return HtmlResponse(
        url=request.url,
        body=spider.bro.page_source.encode('utf-8'),
        encoding='utf-8',
        request=request,
    )
3、在settings.py中开启中间件
# Downloader middleware: enable our middleware so its process_request()
# runs for every request (lower number = applied closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    'cnblogs.middlewares.CnblogsDownloaderMiddleware': 543,
}