Scrapy 1.6 下载图片（教程笔记）
# 创建项目 scrapy startproject downimg
# settings.py (around line 67): enable Scrapy's built-in ImagesPipeline.
ITEM_PIPELINES = {
    # 'downimg.pipelines.DownimgPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# Directory where downloaded images are stored.
# NOTE: fixed typo -- the original read "MAGES_STORE", a setting Scrapy
# silently ignores, so images would not land in the intended directory.
IMAGES_STORE = 'images'
# Name of the item field holding the list of image URLs to download.
IMAGES_URLS_FIELD = 'src'
# items.py -- add the item class below.
class DownImagesItem(scrapy.Item):
    """Item carrying the image URLs handed to ImagesPipeline."""

    # List of full image URLs; must match IMAGES_URLS_FIELD in settings.py.
    src = scrapy.Field()
# Spider file: spiders/img_spider.py
import scrapy

from ..items import DownImagesItem


class ImgSpider(scrapy.Spider):
    """Collect sky-photo thumbnail URLs from ivsky.com listing pages.

    Yields one ``DownImagesItem`` per image so the configured
    ``ImagesPipeline`` downloads them, then follows pagination.
    """

    # Spider name used by `scrapy crawl img`.
    name = 'img'
    # The spider will not follow links outside this domain.
    allowed_domains = ['ivsky.com']
    # Listing page to start crawling from.
    start_urls = ['https://www.ivsky.com/tupian/tiankong_t811/']

    def parse(self, response):
        """Yield an item per thumbnail on the page, then follow "next".

        Parameters
        ----------
        response : scrapy.http.Response
            A listing page under /tupian/tiankong_t811/.
        """
        # All thumbnail <img> src values on the listing page
        # (presumably protocol-relative, e.g. "//img.ivsky.com/...").
        for src in response.css("ul.pli li a img::attr(src)").getall():
            item = DownImagesItem()
            # response.urljoin resolves protocol-relative and relative URLs
            # against the page URL -- more robust than concatenating "https:".
            item['src'] = [response.urljoin(src)]
            yield item
        # Follow the "next page" link, if any.
        next_url = response.css("div.pagelist a.page-next::attr(href)").get()
        if next_url:
            # urljoin replaces the hand-built 'https://www.ivsky.com' + href.
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse)
# 执行爬虫 # scrapy.cfg 目录下执行 scrapy crawl img