Integrating Selenium with Scrapy: Scraping Recommended Products from the Taobao Homepage
Scrapy's main strengths are its efficiency and asynchronous downloading, so forcing a Selenium integration is of limited value in most cases, simply because Selenium is slow. It is still worth doing when a page only renders its content in a real browser.
Case study: extracting the titles of the recommended products on the Taobao homepage.
Spider class: taobao.py
from scrapy.http import HtmlResponse
from scrapy.spiders import Spider


class TaobaoSpider(Spider):
    name = "tb"
    allowed_domains = ["taobao.com"]
    start_urls = ["https://taobao.com"]

    def parse(self, response: HtmlResponse, **kwargs):
        # with open("tb.html", "w", encoding="utf-8") as f:
        #     f.write(response.text)
        # Because of the custom Selenium downloader middleware, this response
        # contains the page source rendered by Selenium.
        selector_list = response.xpath('//div[@class="tb-recommend-content-item"]')
        for selector in selector_list:
            title = selector.xpath('./a/div[@class="info-wrapper"]/div[@class="title"]/text()').get()
            print(f"{title=}")
settings.py
# Keep these values in settings.py so they are easy to change.

##### Selenium integration settings #####
DRIVER_PATH = r'C:\Users\Administrator\Desktop\chromedriver.exe'
# JS patch that hides the usual Selenium/WebDriver fingerprints
STEALTH_JS = r'C:\Users\Administrator\Desktop\stealth.min.js'

DOWNLOADER_MIDDLEWARES = {
    # Enable the custom Selenium downloader middleware
    "scrapy_demo.middlewares.SeleniumDownLoaderMiddleware": 543,
}
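The middleware below attaches to an already running Chrome instance through debuggerAddress 127.0.0.1:9333, so Chrome has to be launched beforehand with remote debugging enabled on that port. A minimal sketch of doing that from Python; the Chrome executable path and the user-data directory are assumptions, adjust them to your machine:

# Sketch: start Chrome with remote debugging so the middleware can attach to it.
# The executable path and --user-data-dir below are assumptions; change as needed.
import subprocess

subprocess.Popen([
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    "--remote-debugging-port=9333",
    r"--user-data-dir=C:\selenium\chrome-profile",
])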
SeleniumMiddleware.py (downloader middleware)
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait


class SeleniumDownLoaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        driver_path = crawler.settings["DRIVER_PATH"]
        stealth_js = crawler.settings["STEALTH_JS"]
        # This class method instantiates the middleware itself.
        s = cls(driver_path=driver_path, stealth_js=stealth_js)
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def __init__(self, driver_path, stealth_js, *args, **kwargs):
        # Read the configuration
        # settings = get_project_settings()
        # Load the chromedriver executable
        # service = ChromeService(executable_path=settings["DRIVER_PATH"])
        service = ChromeService(executable_path=driver_path)
        # Configure browser options
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")
        # Attach to an existing browser started with remote debugging enabled
        options.add_experimental_option("debuggerAddress", "127.0.0.1:9333")
        # Create the browser instance
        self.driver = webdriver.Chrome(service=service, options=options)
        # Anti-detection: inject stealth.min.js before every new document loads
        # with open(settings["STEALTH_JS"]) as f:
        with open(stealth_js) as f:
            js = f.read()
        self.driver.execute_cdp_cmd(
            cmd="Page.addScriptToEvaluateOnNewDocument",
            cmd_args={
                "source": js
            }
        )
        # Implicit wait of 5 seconds
        self.driver.implicitly_wait(5)

    def __del__(self):
        self.driver.quit()

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # Open the URL in the browser
        self.driver.get(request.url)
        # self.driver.get(
        #     "https://s.taobao.com/search?commend=all&ie=utf8&initiative_id=tbindexz_20170306&page=1&q=%E9%9B%B6%E9%A3%9F&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ssid=s5-e")

        # #### Simulate whatever interactions are needed here ...
        # self.driver.find_element(By.PARTIAL_LINK_TEXT, "天猫超").click()
        # self.driver.switch_to.window(self.driver.window_handles[-1])

        """
        1. Record the current page height.
        2. Loop:
           2-1. Scroll to the very bottom.
           2-2. If the new height is greater than the previous one,
                more data has been loaded below; remember the new height
                and compare again on the next pass.
                Otherwise the bottom of the page has been reached.
        """
        # Scroll automatically to trigger lazy loading of more items
        self.load_data_by_scroll(self.driver)
        # Pause to inspect the result
        # time.sleep(5)
        # Return the rendered page as the response
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            request=request, encoding="utf-8", status=200)

    def load_data_by_scroll(self, driver: WebDriver):
        js = 'return document.body.scrollHeight;'
        # Record the current height
        check_height = driver.execute_script(js)
        while True:
            # Scroll to the bottom; if more content loads, document.body.scrollHeight grows
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                # Check whether document.body.scrollHeight has grown since the last pass.
                # WebDriverWait.until() raises TimeoutException if the condition is still
                # false after the given timeout (5 seconds), which means no more data is
                # being loaded, so we break out of the loop.
                WebDriverWait(driver, 5, 0.2).until(lambda x: x.execute_script(js) > check_height)
                # Remember the new height for the next comparison
                check_height = driver.execute_script(js)
            except Exception:
                break

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
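With the middleware enabled in settings.py, the spider can be started with the usual scrapy crawl tb command, or launched programmatically. A minimal sketch using Scrapy's CrawlerProcess with the project settings:

# Sketch: run the "tb" spider programmatically with the project's settings.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("tb")   # spider name defined in TaobaoSpider.name
process.start()       # blocks until the crawl finishes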