scrapy 下载图片 from cuiqingcai

import scrapy


class MzituScrapyItem(scrapy.Item):
    """Item describing one image gallery scraped by the spider."""

    # Gallery title; the image pipeline uses it as the sub-folder name.
    name = scrapy.Field()
    # URLs of every image that belongs to the gallery.
    image_urls = scrapy.Field()
    # URL of the gallery page; sent as the referer when fetching images.
    url = scrapy.Field()

  

官方文档:

https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete#scrapy.pipelines.images.ImagesPipeline.item_completed

https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete

 

默认下载的图片没有按套图分类,很难查看,所以再重写一下 ImagesPipeline 中的 file_path 方法!

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re


class MzituScrapyPipeline(ImagesPipeline):
    """Image pipeline that files every gallery's images under its own folder."""

    def file_path(self, request, response=None, info=None, item=None):
        """Build the storage path (relative to IMAGES_STORE) for one image.

        :param request: the download request of a single image
        :param response: not used here; accepted to match the parent method
        :param info: not used here; accepted to match the parent method
        :param item: the item being processed. Newer Scrapy releases pass it
            as a keyword argument; when it is missing, the item attached to
            ``request.meta`` by ``get_media_requests`` is used instead.
        :return: ``full/<sanitised gallery name>/<last segment of the URL>``
        """
        if item is None:
            # Older Scrapy versions do not hand the item over directly.
            item = request.meta['item']
        # Remove characters Windows rejects in directory names.
        folder = strip(item['name'])
        image_name = request.url.split('/')[-1]
        return u'full/{0}/{1}'.format(folder, image_name)

    def get_media_requests(self, item, info):
        """Yield one download request per image URL of the item.

        :param item: the item returned by the spider
        :param info: not used here; accepted to match the parent method
        :return: generator of requests whose ``meta`` carries the item and
            the gallery page URL under the ``referer`` key
        """
        for img_url in item['image_urls']:
            referer = item['url']
            yield Request(img_url, meta={'item': item,
                                         'referer': referer})

    def item_completed(self, results, item, info):
        """Drop the item when none of its images could be downloaded.

        :param results: iterable of ``(ok, detail)`` pairs, one per request;
            for successful downloads ``detail['path']`` is the stored path
        :param item: the item whose downloads have finished
        :param info: not used here; accepted to match the parent method
        :return: the unchanged item
        :raises DropItem: if no download succeeded
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

def strip(path):
    """Remove characters that are not allowed in Windows folder names.

    The removed characters are ``? \\ * | " < > : /``. The typographic
    opening quote ``“`` is removed as well, as it always has been.

    :param path: folder name to clean; converted with ``str()`` first
    :return: the name with every listed character deleted
    """
    # The ASCII double quote is the one Windows actually forbids; the
    # pattern previously listed only the typographic quote.
    return re.sub(r'[?\\*|"“<>:/]', '', str(path))




if __name__ == "__main__":
    # Manual check of strip(). A raw string keeps the backslash literal
    # instead of relying on the invalid escape sequence ``\*``.
    a = r'我是一个?\*|“<>:/错误的字符串'
    print(strip(a))

  

写一个中间件来处理图片下载的防盗链:

class MeiZiTu(object):
    """Downloader middleware that works around image hotlink protection."""

    def process_request(self, request, spider):
        """Copy the referer kept in ``request.meta`` into the request headers.

        :param request: the outgoing request
        :param spider: the spider object (not used)
        :return: None
        """
        page_url = request.meta.get('referer')
        if not page_url:
            # No referer was attached to this request: leave it untouched.
            return None
        request.headers['referer'] = page_url
        return None

  

最后一步设置ImagesPipeline的存储目录!

在settings.py中写入:

# Root directory in which the images pipeline stores downloaded files.
# Both backslashes are escaped explicitly: ``\m`` is not a valid escape.
IMAGES_STORE = 'F:\\mzitu\\'

 

在settings.py中写入以下配置。

# Image expiration delay, in days.
IMAGES_EXPIRES = 30

# Enable the custom image pipeline; 300 is its order value.
ITEM_PIPELINES = {
    'mzitu_scrapy.pipelines.MzituScrapyPipeline': 300,
}

 

# Enable the referer-setting downloader middleware; 543 is its order value.
DOWNLOADER_MIDDLEWARES = {
    'mzitu_scrapy.middlewares.MeiZiTu': 543,
}
 
posted @ 2018-10-26 19:10  CrossPython  阅读(271)  评论(0)  编辑  收藏  举报