custom_settings
I. Definition
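In Scrapy, custom_settings is a class attribute on a Spider that overrides the project-wide settings.py for that spider only; it has to be defined at class level because it is read when the crawler is created, before __init__ runs. A minimal sketch (the spider name and the chosen keys here are only illustrative, taken from the settings used later in this post):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    # per-spider overrides: these take precedence over settings.py for this spider only
    custom_settings = {
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_DELAY': 0,
    }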
II. Configuration
1. middlewares
# ----- SeleniumMiddleware (downloader middleware; not enabled globally, only via custom_settings) -----
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from scrapy.http import HtmlResponse
from logging import getLogger
import time


class SeleniumMiddleware(object):
    # The spider object is passed into the middleware, so the Chrome-related
    # attributes created in its __init__ (browser, wait) are available here.
    def process_request(self, request, spider):
        """
        Fetch the page with Chrome.
        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse
        """
        print("chrome is getting page")
        # The 'usedSelenium' flag in request.meta decides whether this request is rendered with Selenium.
        usedSelenium = request.meta.get('usedSelenium', False)
        print("checking usedSelenium flag")
        if usedSelenium:
            try:
                spider.browser.get(request.url)
                # wait until the search box appears
                input = spider.wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[@class='nav-search-field ']/input"))
                )
                time.sleep(2)
                input.clear()
                input.send_keys("iphone 7s")
                # press Enter to run the search
                input.send_keys(Keys.RETURN)
                # wait until the search results appear
                searchRes = spider.wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, "//div[@id='resultsCol']"))
                )
            except Exception as err:
                print(f"chrome getting page error, Exception = {err}")
                return HtmlResponse(url=request.url, status=500, request=request)
            finally:
                time.sleep(3)
            # page fetched successfully: build a successful response (HtmlResponse is a subclass of Response)
            return HtmlResponse(url=request.url,
                                body=spider.browser.page_source,
                                request=request,
                                # page encoding
                                encoding="UTF-8",
                                status=200)
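The per-spider settings in the next section also register video_spider.middlewares.ProxiesMiddleware at priority 400, whose code is not shown in this post. A minimal sketch of what such a proxy middleware might look like, assuming a PROXIES list is defined in settings.py (the setting name and the random-choice strategy are assumptions, not the project's actual implementation):

import random

class ProxiesMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        # read the proxy pool once from the crawler settings (assumed key: PROXIES)
        return cls(crawler.settings.getlist('PROXIES'))

    def __init__(self, proxies):
        self.proxies = proxies or []

    def process_request(self, request, spider):
        # attach a randomly chosen proxy to every outgoing request
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)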
2. settings / create a custom_settings.py in the same directory as settings.py
# -*- coding: utf-8 -*-
custom_settings_for_spider1 = {
    'LOG_LEVEL': 'INFO',
    'DOWNLOAD_DELAY': 0,
    'COOKIES_ENABLED': False,  # enabled by default
    'DOWNLOADER_MIDDLEWARES': {
        # proxy middleware
        'video_spider.middlewares.ProxiesMiddleware': 400,
        # SeleniumMiddleware
        'video_spider.middlewares.SeleniumMiddleware': 543,
        # disable Scrapy's default user-agent middleware
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    },
}
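The spider in the next section also reads several Selenium-related values via get_project_settings(), so the project-level settings.py has to define them. A minimal sketch with placeholder values (the key names follow the spider code below; WINDOW_WIDTH is assumed as the counterpart of WINDOW_HEIGHT):

# settings.py (project level)
SELENIUM_TIMEOUT = 30   # page-load timeout passed to set_page_load_timeout
LOAD_IMAGE = True       # whether Chrome should load images
WINDOW_HEIGHT = 900     # browser window height
WINDOW_WIDTH = 900      # browser window width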
3. Import custom_settings in the spider file
import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Scrapy signal-related imports
from scrapy.utils.project import get_project_settings
from scrapy import signals
from pydispatch import dispatcher
# settings
from ..custom_settings import *
from ..items import ShanbaySpiderItem


class ShanbaySpider(scrapy.Spider):
    name = 'shanbay'
    allowed_domains = ['shanbay.com']
    # start_urls = ['http://shanbay.com/']
    custom_settings = custom_settings_for_spider1

    # Initialise Chrome inside the spider, so the middleware can reach it through the spider object
    def __init__(self, timeout=30, isLoadImage=True, windowHeight=None, windowWidth=None):
        # read configuration from settings.py
        print("starting browser")
        self.mySetting = get_project_settings()
        self.timeout = self.mySetting['SELENIUM_TIMEOUT']
        self.isLoadImage = self.mySetting['LOAD_IMAGE']
        self.windowHeight = self.mySetting['WINDOW_HEIGHT']
        self.windowWidth = self.mySetting['WINDOW_WIDTH']
        # initialise the Chrome driver
        self.browser = webdriver.Chrome()
        print("browser launched")
        if self.windowHeight and self.windowWidth:
            self.browser.set_window_size(900, 900)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, 25)
        # initialise the parent class, so different spider files can run with different behaviour
        super(ShanbaySpider, self).__init__()
        # connect the spider_closed signal to CloseHandle, which quits Chrome
        dispatcher.connect(receiver=self.CloseHandle,
                           signal=signals.spider_closed
                           )

    # signal handler: quit the Chrome browser
    def CloseHandle(self, spider):
        print("CloseHandle: enter")
        self.browser.quit()

    # ------------ spider execution starts here --------------
    # --- requests ---
    def start_requests(self):
        for i in range(29):
            page = 540709 + i * 3
            url_base = 'https://www.shanbay.com/wordlist/187711/' + str(page) + '/?page={}'
            for x in range(10):
                url = url_base.format(x + 1)
                yield Request(
                    url,
                    meta={'usedSelenium': True, 'dont_redirect': True},
                    callback=self.parse,
                    errback=self.error,
                )

    def error(self, failure):
        pass

    def parse(self, response):
        html_contents = response.xpath('/html/body/div[3]/div/div[1]/div[2]/div/table/tbody/tr//*/text()')
        item = ShanbaySpiderItem()

        for result in html_contents:
            item['Chinese'] = result.extract()
            print(item['Chinese'])
            yield item
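parse() fills a ShanbaySpiderItem that has a single Chinese field, so items.py must declare that item class. A minimal sketch matching the field name used above:

# items.py
import scrapy

class ShanbaySpiderItem(scrapy.Item):
    # text extracted from each table cell on the word-list page
    Chinese = scrapy.Field()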