scrapy 下载图片 from cuiqingcai
import scrapy


class MzituScrapyItem(scrapy.Item):
    """Container for one scraped photo set.

    All three fields are plain ``scrapy.Field()`` declarations with no
    metadata attached.
    """

    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    url = scrapy.Field()
官方文档:
https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete#scrapy.pipelines.images.ImagesPipeline.item_completed
https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete
默认下载下来的图片没有按套图分类,查看起来很不方便,所以再重写一下 ImagesPipeline 中的 file_path 方法!
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

# Characters removed from folder names. The ASCII double quote is the one
# Windows actually rejects; the typographic quote is kept so that names
# cleaned by the earlier version of this pattern still come out the same.
_ILLEGAL_FOLDER_CHARS = re.compile(r'[?\\*|"“<>:/]')


class MzituScrapyPipeline(ImagesPipeline):
    """Image pipeline that stores every photo set in its own sub-folder."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path for one downloaded image.

        :param request: the download request issued for a single image
        :param response: unused
        :param info: unused
        :param item: the item being processed, when the caller supplies it;
            otherwise the item is read from ``request.meta['item']``
        :return: ``full/<cleaned item name>/<last URL segment>``
        """
        if item is None:
            item = request.meta['item']
        # Clean the name so the directory can be created on Windows.
        folder = strip(item['name'])
        image_guid = request.url.split('/')[-1]
        return u'full/{0}/{1}'.format(folder, image_guid)

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        Each request carries the item and the item's page URL (as
        ``referer``) in its ``meta`` dict.

        :param item: the item returned by the spider
        :param info: unused
        :return: generator of ``Request`` objects
        """
        # The referer is the same for every image of the item.
        referer = item['url']
        for img_url in item['image_urls']:
            yield Request(img_url, meta={'item': item, 'referer': referer})

    def item_completed(self, results, item, info):
        """Drop the item when none of its images was downloaded.

        :param results: iterable of ``(ok, detail)`` pairs, one per image
        :param item: the item whose downloads have finished
        :param info: unused
        :return: the unchanged item
        :raises DropItem: if no pair in ``results`` has a truthy ``ok``
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    # def process_item(self, item, spider):
    #     return item


def strip(path):
    """Remove characters that are illegal in Windows folder names.

    :param path: the folder name to clean (converted with ``str()``)
    :return: the name with every ``? \\ * | " “ < > : /`` removed
    """
    return _ILLEGAL_FOLDER_CHARS.sub('', str(path))


if __name__ == "__main__":
    # The backslash is escaped explicitly; the string value is unchanged.
    a = '我是一个?\\*|“<>:/错误的字符串'
    print(strip(a))
写一个中间件来处理图片下载的防盗链:
class MeiZiTu(object):
    """Downloader middleware that sets a referer header on requests."""

    def process_request(self, request, spider):
        """Copy ``request.meta['referer']`` into the request headers.

        Requests whose meta has no ``referer`` entry, or an empty one, are
        left untouched.

        :param request: the outgoing request; its ``headers`` may be modified
        :param spider: the spider object (unused)
        :return: None
        """
        referer = request.meta.get('referer')
        if referer:
            request.headers['referer'] = referer
最后一步设置ImagesPipeline的存储目录!
在settings.py中写入:
# Directory where the images pipeline stores downloaded files.
# Backslashes are escaped explicitly so the literal has no invalid "\m"
# escape sequence; the resulting value is unchanged.
IMAGES_STORE = 'F:\\mzitu\\'
在settings.py中写入以下配置。
# Images expire after a delay of 30 days.
IMAGES_EXPIRES = 30

# Enable the project's image pipeline.
ITEM_PIPELINES = {"mzitu_scrapy.pipelines.MzituScrapyPipeline": 300}

# Enable the project's downloader middleware.
DOWNLOADER_MIDDLEWARES = {"mzitu_scrapy.middlewares.MeiZiTu": 543}