Scrapy + Selenium crawler example: scraping NetEase News (网易新闻)
1. Objective
Crawl the four sections (Domestic, International, Military, Aviation) of the NetEase News homepage (https://news.163.com/) and write each article's body text to a file named in the format 网易新闻/<section name>/<news title>.txt.
2. Analysis
Inspecting the homepage, the section pages, and the detail pages shows that only the headline lists inside the four target sections are loaded dynamically by JavaScript, so that part is fetched with Selenium + ChromeDriver.
The overall strategy for combining Selenium + ChromeDriver with Scrapy is:
- In the spider class, define one class attribute holding the browser instance and another initialized to an empty list [] for storing the URLs of the four sections.
- In the spider class, also override the parent class's static method close(spider, reason) so the browser is shut down when the crawl finishes.
- Define a downloader middleware whose process_request(self, request, spider) method intercepts the requests for those four section URLs, loads them with Selenium + ChromeDriver, builds a response object manually from the rendered page, and returns that response (see the sketch right after this list).
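The core mechanism, shown here as a minimal sketch (the class name is illustrative, and it assumes the spider exposes the browser object and the section URLs as class attributes, as the real spider below does): when process_request returns a Response object, Scrapy skips its own downloader and hands that response straight to the spider callback.

from scrapy.http import HtmlResponse


class SeleniumInterceptSketch:
    '''Illustrative skeleton only; the project's real middleware is in section 3.'''

    def process_request(self, request, spider):
        # Only the four dynamically rendered section pages are intercepted
        if request.url in spider.model_urls:
            spider.bro.get(request.url)  # let ChromeDriver execute the page's JS
            # Returning a Response here short-circuits Scrapy's downloader:
            # this response is delivered directly to the spider callback
            return HtmlResponse(
                url=spider.bro.current_url,
                body=spider.bro.page_source,
                encoding='utf-8',
                request=request,
            )
        return None  # every other request goes through the normal download path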
3. Implementation
- items.py

import scrapy


class WangyiproItem(scrapy.Item):
    path = scrapy.Field()     # directory where the article will be saved
    title = scrapy.Field()    # news title
    content = scrapy.Field()  # news body text
- wangyi.py

import scrapy
import os
from wangyiPro.items import WangyiproItem
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['163.com']
    start_urls = ['https://news.163.com/']
    model_urls = []           # URLs of the four sections: Domestic, International, Military, Aviation
    bro = webdriver.Chrome()  # shared browser instance

    def parse(self, response):
        '''Parse the homepage.'''
        # Create the top-level directory 网易新闻
        if not os.path.exists('网易新闻'):
            os.mkdir('网易新闻')
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        index_list = [2, 3, 5, 6]  # indexes of the four target sections
        for index in index_list:
            li = li_list[index]
            # URL and name of each section
            model_url = li.xpath('./a/@href').extract_first()
            model_name = li.xpath('./a/text()').extract_first()
            self.model_urls.append(model_url)
            # Path where this section's articles will be stored
            path = '网易新闻/' + model_name
            # Create a subdirectory named after the section under 网易新闻
            if not os.path.exists(path):
                os.mkdir(path)
            yield scrapy.Request(url=model_url, callback=self.parse_model, meta={'path': path})

    def parse_model(self, response):
        '''Parse a section page.'''
        path = response.meta.get('path')
        div_list = response.xpath('//div[contains(@class,"data_row")]')
        for div in div_list:
            detail_title = div.xpath('.//h3/a/text()').extract_first()
            detail_url = div.xpath('.//h3/a/@href').extract_first()
            # Instantiate one item per detail page
            item = WangyiproItem()
            item['title'] = detail_title
            item['path'] = path
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        '''Parse a detail page.'''
        item = response.meta.get('item')
        content_list = response.css('.post_body p::text').extract()
        content = ''.join(content_list)
        item['content'] = content
        yield item

    @staticmethod
    def close(spider, reason):
        '''Close the browser when the spider finishes.'''
        spider.bro.quit()
Note: never instantiate the item object ahead of time in the parse method; create a separate item for each detail page!
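To see why, here is a hypothetical sketch of the broken variant (not part of the project): if a single item created in parse_model were passed to every detail request via meta, all requests would hold a reference to the same object, and because Scrapy schedules them concurrently, later loop iterations would overwrite the fields before earlier detail pages are parsed.

# Anti-pattern (hypothetical; would live inside WangyiSpider, imports as in wangyi.py)
def parse_model(self, response):
    item = WangyiproItem()                    # created once ...
    item['path'] = response.meta.get('path')
    for div in response.xpath('//div[contains(@class,"data_row")]'):
        item['title'] = div.xpath('.//h3/a/text()').extract_first()  # ... overwritten every loop
        detail_url = div.xpath('.//h3/a/@href').extract_first()
        # Every request carries the SAME item object, so by the time parse_detail
        # runs, 'title' usually holds whatever the last iteration wrote.
        yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})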
- middlewares.py

from fake_useragent import UserAgent
from time import sleep
from scrapy.http import HtmlResponse


class RandomUserAgentMiddlerware:

    def process_request(self, request, spider):
        '''Attach a random User-Agent to every request.'''
        ua = UserAgent()
        request.headers['User-Agent'] = ua.random


class WangyiproDownloaderMiddleware:

    def process_request(self, request, spider):
        '''Intercept the requests for the four section pages.'''
        if request.url in spider.model_urls:
            bro = spider.bro
            bro.get(request.url)
            sleep(3)
            # Keep scrolling to the bottom until the page height stops growing
            current_height = bro.execute_script('return document.body.scrollHeight;')
            while True:
                bro.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                sleep(3)
                new_height = bro.execute_script('return document.body.scrollHeight;')
                if new_height == current_height:
                    break
                current_height = new_height
            try:
                # After reaching the bottom, some sections need a click on the
                # "load more" button to load the remaining headlines
                bro.find_element_by_css_selector('.post_addmore > span').click()
                sleep(2)
            except Exception:
                pass
            # Return a manually built response; Scrapy skips its own downloader
            return HtmlResponse(url=bro.current_url, body=bro.page_source,
                                request=request, encoding='utf-8')
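A compatibility note: find_element_by_css_selector exists in Selenium 3 but was removed in Selenium 4, so on a current Selenium install the "load more" click above needs the By-based locator API. A minimal equivalent sketch (the helper name is made up for illustration):

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def click_load_more(bro):
    '''Click the "load more" button if it is present; bro is the shared Chrome driver.'''
    try:
        # Selenium 4 replacement for bro.find_element_by_css_selector('.post_addmore > span')
        bro.find_element(By.CSS_SELECTOR, '.post_addmore > span').click()
    except NoSuchElementException:
        pass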
- pipelines.py

class WangyiproPipeline:

    def process_item(self, item, spider):
        file_name = item['path'] + '/' + item['title'] + '.txt'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(item['content'])
        return item
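One caveat the pipeline above does not handle: a news title can contain characters such as '/' that are invalid in file names, which would make open() fail. A minimal, hypothetical helper (not part of the project) that could be applied to item['title'] before building file_name:

import re


def safe_filename(title):
    '''Hypothetical helper: replace characters that are not allowed in file names.'''
    return re.sub(r'[\\/:*?"<>|]', '_', title or '').strip() or 'untitled'

# Possible use inside process_item (sketch):
# file_name = item['path'] + '/' + safe_filename(item['title']) + '.txt'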
- settings.py

BOT_NAME = 'wangyiPro'

SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 2  # download delay

# Enable the downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.RandomUserAgentMiddlerware': 600,
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Enable the item pipeline
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}

# Log level
LOG_LEVEL = 'ERROR'
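With the settings in place, the crawl is started from the project root with the standard command scrapy crawl wangyi (matching the spider's name attribute); the articles are then written to 网易新闻/<section name>/<news title>.txt as described in section 1.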
4. Crawl results