Scrapy 实现爬取多页数据 多层url数据爬取 meta传递 链接为相对路径
提取电影网站的片名,导演,影片播放地址。
items.py
import scrapy


class MovieItem(scrapy.Item):
    """One scraped movie record with the three fields the spider fills in."""

    # Title text of the entry, read from the listing page.
    name = scrapy.Field()
    # Text of a link on the detail page; the accompanying article calls this
    # the director despite the field name -- TODO confirm against the site.
    actor = scrapy.Field()
    # Absolute URL of the playback page.
    link = scrapy.Field()
spider.py
import scrapy

from movie.items import MovieItem


class MovieproSpider(scrapy.Spider):
    """Crawl the paginated movie listing and each entry's detail page.

    ``parse`` handles a listing page: it creates one ``MovieItem`` per entry,
    stores the title in it and hands the item to ``parse_detail`` through the
    request's ``meta``. ``parse_detail`` completes the item with data from the
    detail page and yields it.
    """

    name = 'moviePro'
    allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    # Index of the listing page most recently scheduled; the start URL is page 1.
    page = 1
    # Highest listing-page index to request. Equivalent to the former
    # ``page <= 10`` guard, which also scheduled pages 2 through 11.
    last_page = 11
    # URL template for every listing page after the first.
    page_url = 'https://www.4567tv.tv/frim/index1-%s.html'

    def parse(self, response):
        """Yield one detail-page request per entry, then the next listing page."""
        for li in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]'):
            name = li.xpath('./div/div/h4/a/text()').extract_first()
            href = li.xpath('./div/div/h4/a/@href').extract_first()
            if href is None:
                # Without a link there is no detail page to visit; skip the
                # entry instead of failing on ``str + None``.
                self.logger.warning(
                    'Listing entry without detail link on %s', response.url
                )
                continue

            item = MovieItem()
            item['name'] = name
            # The href is a relative path; resolve it against the page URL.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                meta={'item': item},
            )

        if self.page < self.last_page:
            self.page += 1
            yield scrapy.Request(
                url=self.page_url % self.page,
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Fill in the remaining fields of the item passed via ``meta``."""
        # The item is not created here: it is the one ``parse`` attached to
        # the request, already carrying the title from the listing page.
        item = response.meta['item']
        item['actor'] = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[3]/a/text()'
        ).extract_first()
        link = response.xpath(
            '/html/body/div[1]/div/div/div/div[1]/a/@href'
        ).extract_first()
        # The link is a relative path, so rebuild the absolute URL; keep the
        # item (with link set to None) when the page has no such link.
        item['link'] = response.urljoin(link) if link is not None else None
        yield item
settings.py
# Encoding of the exported feed file. Set explicitly because, per the
# original note, the Chinese text otherwise came out garbled.
# The listing used to assign 'utf-8' and then immediately override it with
# 'gb18030'; only the last assignment takes effect, so a single line is kept.
# NOTE(review): switch to 'utf-8' if the consumer of the file expects UTF-8.
FEED_EXPORT_ENCODING = 'gb18030'
运行
scrapy crawl moviePro -o mov.csv
结果