爬取多页电影信息及简介
爬取标题,评分,排名,摘要,剧情简介
使用 Feed exports 保存爬取的数据
代码:
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import ScrapyDoubanItem


class DoubanSpider(scrapy.Spider):
    """Crawl the Douban Top 250 list pages and every movie's detail page.

    ``parse`` walks the paginated list, picking up each movie's one-line
    quote and detail-page link; ``parse_detail`` reads the title, score,
    rank and plot summary from the detail page and yields one
    ``ScrapyDoubanItem`` per movie.
    """

    name = 'douban'
    # allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Schedule one detail request per listed movie, then the next page.

        Args:
            response: Response for a Top 250 list page.

        Yields:
            A ``scrapy.Request`` for each movie's detail page, carrying the
            movie's quote in ``meta['abstract_detail']`` (``None`` when the
            entry has no quote), followed by a request for the next list
            page when a "next" link is present.
        """
        # Resolve the quote relative to each entry instead of pairing two
        # independent page-wide lists by index: when an entry has no quote
        # the lists differ in length, which mismatches later movies and
        # eventually raises IndexError.
        for header in response.xpath('//div[@class="hd"]'):
            detail_page = header.xpath('./a/@href').extract_first()
            if not detail_page:
                continue
            # NOTE(review): assumes the quote element lives under the same
            # parent container as this "hd" block -- confirm against the
            # live page markup.
            abstract_detail = header.xpath(
                '..//*[@class="inq"]/text()'
            ).extract_first()
            yield scrapy.Request(
                detail_page,
                callback=self.parse_detail,
                meta={'abstract_detail': abstract_detail},
            )

        next_page = response.xpath(
            '//span[@class="next"]/a/@href'
        ).extract_first()
        if next_page:
            # urljoin resolves the href against the current page URL, so it
            # works for query-only, path-relative and absolute links alike.
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Build the item for a single movie from its detail page.

        Args:
            response: Response for a movie detail page; ``response.meta``
                must contain ``'abstract_detail'`` as set by ``parse``.

        Yields:
            A ``ScrapyDoubanItem`` with ``title``, ``score``, ``rank``,
            ``abstract_detail`` and ``describe`` filled in.
        """
        item = ScrapyDoubanItem()
        item['title'] = response.xpath(
            '//*[@property="v:itemreviewed"]/text()'
        ).extract_first()
        item['score'] = response.xpath(
            '//*[@class="ll rating_num"]/text()'
        ).extract_first()
        # extract() returns every matching text node, so ``rank`` is stored
        # as a list, unlike the scalar fields around it.
        item['rank'] = response.xpath(
            '//*[@class="top250-no"]/text()'
        ).extract()
        item['abstract_detail'] = response.meta['abstract_detail']
        # Only the first text node of the summary element is kept.
        item['describe'] = response.xpath(
            '//*[@property="v:summary"]/text()'
        ).extract_first()
        yield item