Using Splash and Selenium with Scrapy
Getting Started with Splash
# Splash, like Selenium, simulates a browser rendering pages so dynamic sites can be scraped
# Docs: https://splash.readthedocs.io/en/stable/
# Install Splash via Docker: https://splash.readthedocs.io/en/stable/install.html#linux-docker (see Docker notes)
# docker pull scrapinghub/splash
# Start Splash:
# docker run -it -p 8050:8050 --rm scrapinghub/splash
# https://splash.readthedocs.io/en/stable/faq.html?highlight=scrapy#python-scrapy
# pip3 install scrapy-splash

import requests
from fake_useragent import UserAgent
from urllib.parse import quote

headers = {
    'User-Agent': UserAgent().chrome
}

# Drive Splash with a Lua script to fetch the rendered page content (see Lua notes)
def main():
    url = r"https://www.baidu.com"
    lua_script = '''function main(splash, args)
        splash:go('{url}')
        splash:wait(1)
        return splash:html()
    end'''.format(url=url)
    response = requests.get(r"http://localhost:8050/execute?lua_source={}".format(quote(lua_script)), headers=headers)
    response.encoding = 'utf-8'
    print(response.text)

# Have Splash render the URL directly via the render.html endpoint
def test1():
    url = r"https://www.baidu.com"
    response = requests.get(r"http://localhost:8050/render.html?url={}".format(quote(url)), headers=headers)
    response.encoding = 'utf-8'
    print(response.text)

if __name__ == '__main__':
    test1()
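The example above URL-encodes the Lua script into the query string with quote(). The Splash HTTP API also accepts the same arguments as a JSON POST body, which avoids the encoding step for longer scripts; a minimal sketch against the same localhost:8050 instance (not part of the original notes, just the documented JSON interface, so treat it as an assumption):

import requests

def main_post():
    lua_script = '''function main(splash, args)
        splash:go(args.url)      -- read the target URL from the request arguments
        splash:wait(1)
        return splash:html()
    end'''
    # POST the script and its arguments as JSON instead of packing them into the URL
    response = requests.post("http://localhost:8050/execute",
                             json={"lua_source": lua_script, "url": "https://www.baidu.com"})
    response.encoding = 'utf-8'
    print(response.text)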
Splash in Scrapy
# Splash configuration
# Splash server URL
SPLASH_URL = 'http://localhost:8050/'
# Downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 200,
    "scrapy_splash.SplashMiddleware": 201,
    "myproject.middlewares.MyprojectDownloaderMiddleware": 202
}
# Dedup filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Splash-aware HTTP cache
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
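The scrapy-splash README additionally registers a spider middleware that deduplicates Splash arguments between requests; if the project follows that README, one more setting goes next to the ones above:

# Spider middleware recommended by the scrapy-splash README
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}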
import scrapy
from scrapy_splash import SplashRequest


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # start_urls = ["https://www.cnblogs.com/watermeloncode/"]

    def parse(self, response):
        print(response.text)

    def parse1(self, response):
        print(response.text)

    def start_requests(self):
        lua_script = '''
        function main(splash, args)
            splash:go('{url}')
            splash:wait(1)
            return splash:html()
        end'''.format(url=r"https://www.cnblogs.com/watermeloncode/")
        # To run the Lua code, pass endpoint='execute' and args={'lua_source': lua_script}
        # yield SplashRequest(r"https://www.cnblogs.com/watermeloncode/", callback=self.parse, endpoint='execute', args={'lua_source': lua_script, 'wait': 1})
        yield SplashRequest(r"https://www.cnblogs.com/watermeloncode/", callback=self.parse, endpoint='render.html', args={'wait': 1})
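Instead of splicing the URL into the Lua source with str.format, the script can read it from args, since SplashRequest forwards the request URL (and anything else passed in args) to the script; a sketch of start_requests rewritten that way, assuming the same spider as above:

    def start_requests(self):
        lua_script = '''
        function main(splash, args)
            splash:go(args.url)      -- args.url is the URL given to SplashRequest
            splash:wait(args.wait)
            return splash:html()
        end'''
        yield SplashRequest(r"https://www.cnblogs.com/watermeloncode/",
                            callback=self.parse,
                            endpoint='execute',
                            args={'lua_source': lua_script, 'wait': 1})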
Selenium in Scrapy
1. Configure the downloader middleware

from scrapy.http import HtmlResponse

class MyprojectDownloaderMiddleware:
    def process_request(self, request, spider):
        if spider.name == 'guazi':
            url = request.url
            # Drive the browser's GET request here
            spider.chrome.get(url)
            html = spider.chrome.page_source
            # Return an HtmlResponse here; the downloader will not issue the request again,
            # because the rendered page has already been attached to this response.
            return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')

2. Create and close the browser from the Spider

import scrapy
from scrapy import signals
from selenium import webdriver

class GuaziSpider(scrapy.Spider):
    name = "guazi"
    allowed_domains = ["guazi.com"]
    start_urls = ["https://www.guazi.com/"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Every Spider instance is created here
        # Remember to call the parent method to build the Spider object first
        spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Register a shutdown hook: spider.spider_closed runs when the spider_closed signal fires
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        # Attach a browser object to the Spider
        spider.chrome = webdriver.Chrome()
        return spider

    def spider_closed(self, spider):
        # Runs on the signals.spider_closed signal: close the browser
        spider.chrome.quit()
        print('close selenium')

    def parse(self, response):
        print(response.text)
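The from_crawler above opens a visible Chrome window. For an unattended crawl it is common to run the driver headless instead; a minimal sketch of building such a driver (an assumption on top of these notes, with chromedriver on PATH), which could replace the bare webdriver.Chrome() call:

from selenium import webdriver

def build_headless_chrome():
    # Headless Chrome: no window is shown, page_source still works as in the middleware above
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    return webdriver.Chrome(options=options)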