scrapy之盗墓笔记三级页面爬取
#今日目标 **scrapy之盗墓笔记三级页面爬取** 今天要爬取的是盗墓笔记小说,由分析该小说的主要内容在三级页面里,故需要我们 一一解析 *代码实现* daomu.py ``` import scrapy from ..items import DaomuItem class DaomuSpider(scrapy.Spider): name = 'daomu' allowed_domains = ['daomubiji.com'] start_urls = ['http://www.daomubiji.com/'] # 解析一级页面的parse函数 def parse(self, response): # link_list: ['http://xxx/dao-mu-bi-ji-1','','',''] link_list = response.xpath('//ul[@class="sub-menu"]/li/a/@href').extract() for link in link_list: # 交给调度器 yield scrapy.Request( url = link, callback = self.parse_two_html ) # 解析二级页面函数(圈名 章节数 章节名 链接) def parse_two_html(self,response): # 基准xpath article_list = response.xpath('//article') for article in article_list: # 创建item对象 item = DaomuItem() # info_list: ['七星鲁王','第一章','血尸'] info_list = article.xpath('./a/text()').get().split() if len(info_list) == 3: item['volume_name'] = info_list[0] item['zh_num'] = info_list[1] item['zh_name'] = info_list[2] else: item['volume_name'] = info_list[0] item['zh_name'] = info_list[1] item['zh_num'] = '' # 提取链接并发给调度器入队列 item['zh_link'] = article.xpath('./a/@href').get() yield scrapy.Request( url = item['zh_link'], # meta参数: 传递item对象到下一个解析函数 meta = {'item':item}, callback = self.parse_three_html ) # 解析三级页面(小说内容)函数 def parse_three_html(self,response): # 获取上一个函数传递过来的item对象 item = response.meta['item'] # content_list: ['段落1','段落2','',''] content_list = response.xpath( '//article[@class="article-content"]//p/text()' ).extract() item['zh_content'] = '\n'.join(content_list) yield item ``` items.py ``` import scrapy class DaomuItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() # 卷名 volume_name = scrapy.Field() # 章节数 zh_num = scrapy.Field() # 章节名称 zh_name = scrapy.Field() # 章节链接 zh_link = scrapy.Field() # 小说内容 zh_content = scrapy.Field() ``` pipelines.py ``` class DaomuPipeline(object): def process_item(self, item, spider): filename = '/home/tarena/daomu/{}_{}_{}'.format( item['volume_name'], item['zh_num'], item['zh_name'] ) with open(filename,'w') as f: f.write(item['zh_content']) return item ```