Scrapy——將爬取圖片下載到本地
1. Spider程序:
import scrapy
import json

from UnsplashImageSpider.items import ImageItem


class UnsplashImageSpider(scrapy.Spider):
    """Crawl Unsplash's JSON photo API page by page, yielding one ImageItem per photo.

    Each API page returns a JSON array of photo objects; for every photo the
    spider emits an ImageItem carrying the photo id and its download link,
    then schedules the next page.
    """

    # Spider name used by `scrapy crawl unsplash_image`.
    name = 'unsplash_image'
    allowed_domains = ['unsplash.com']
    # First page of Unsplash's internal photo API (12 photos per page).
    start_urls = ['https://unsplash.com/napi/photos?page=1&per_page=12']

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original override did not call super().__init__(),
        # which skips scrapy.Spider's setup (name/argument handling).
        # Forwarding *args/**kwargs keeps the constructor backward-compatible.
        super().__init__(*args, **kwargs)
        # Page counter for building the next-page URL.
        self.page_index = 1

    def parse(self, response):
        """Parse one API page: yield an ImageItem per photo, then request the next page."""
        # The endpoint returns a JSON array, not HTML.
        photo_list = json.loads(response.text)
        for photo in photo_list:
            item = ImageItem()
            item['image_id'] = photo['id']
            item['download'] = photo['links']['download']
            yield item

        self.page_index += 1
        # NOTE(review): there is no stop condition — the spider paginates
        # until it is interrupted or the server stops returning data.
        next_link = 'https://unsplash.com/napi/photos?page='\
            + str(self.page_index) + '&per_page=12'
        yield scrapy.Request(next_link, callback=self.parse)
2. 在Pipeline中使用urllib.request包直接下載圖片:
from os import makedirs
from urllib.request import urlopen


class UnsplashimagespiderPipeline(object):
    """Scrapy item pipeline that downloads each scraped image into ``images/``."""

    def process_item(self, item, spider):
        """Download the image referenced by *item* and save it as images/<id>.jpg.

        Network or filesystem failures are reported and swallowed so one bad
        image does not abort the crawl. The item is always returned so later
        pipelines still receive it (required by Scrapy's pipeline contract).
        """
        # Each item carries the image id and its download URL.
        print('----------' + item['image_id'])
        # "?force=true" asks Unsplash to serve the raw file as an attachment.
        real_url = item['download'] + "?force=true"
        try:
            # Fetch the image bytes; the context manager closes the connection.
            with urlopen(real_url) as result:
                data = result.read()
            # Create the target directory on first use so open() cannot fail
            # on a missing path (the original assumed images/ already existed).
            makedirs('images', exist_ok=True)
            # 'wb' (not 'wb+') — we only write, never read back.
            with open('images/' + item['image_id'] + '.jpg', 'wb') as f:
                f.write(data)
        except OSError as e:
            # BUG FIX: the original used a bare `except:` plus
            # '下载图片出现错误' % item['image_id'], a %-format with no
            # placeholder that itself raised TypeError. Catch only OS/URL
            # errors (URLError/HTTPError subclass OSError) and format properly.
            print('下载图片出现错误: %s (%s)' % (item['image_id'], e))
        # Scrapy pipelines must return the item for downstream pipelines.
        return item