Scrapy爬取妹子图保存到不同目录下
在 settings.py 中进行设置
# Enable the custom image pipeline.
ITEM_PIPELINES = {
    'mztu.pipelines.ImagesPipelinse': 300,
}

# Default storage directory for downloaded images. This setting must be
# defined, otherwise images cannot be downloaded.
# A raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\s" or "\P".
IMAGES_STORE = r"E:\study\Python\scrapy\mztu\imges"

# Expiry time for the image pipeline (see the Scrapy IMAGES_EXPIRES setting).
IMAGES_EXPIRES = 90

# Thumbnail generation (disabled).
# IMAGES_THUMBS = {
#     'small': (50, 50),
#     'big': (270, 270),
# }
spider目录
# -*- coding: utf-8 -*-
import scrapy

from mztu.items import MztuItem


class ZimdgSpider(scrapy.Spider):
    """Crawl the mzitu.com "xinggan" listing and yield one item per picture.

    Flow: listing page -> album page (``parse``) -> every numbered page of
    the album (``parse_item``) -> image URL and title (``parse_ponse``).
    """

    name = 'zimdg'
    allowed_domains = ['mzitu.com']
    # One listing URL per page number.
    # NOTE(review): range(118) starts at page 0 -- confirm that /page/0/ is a
    # valid listing URL on the target site.
    start_urls = ['http://www.mzitu.com/xinggan/page/{}/'.format(str(x)) for x in range(118)]

    def parse(self, response):
        """Yield a request for every album linked from a listing page."""
        for entry in response.xpath("//div[@class='postlist']/ul/li"):
            href = entry.xpath('./a/@href').extract_first()
            if not href:
                # Skip list entries without a link instead of failing the
                # whole listing page with an IndexError.
                continue
            yield scrapy.Request(href, callback=self.parse_item)

    def parse_item(self, response):
        """Yield one request per numbered page of the album.

        Each request carries its own ``MztuItem`` in ``meta['meta_1']`` with
        ``Referer`` set to the page URL.
        """
        page_count = self._page_count(response)
        # Strip a trailing slash so the page suffix never produces "//N".
        base_url = response.url.rstrip('/')
        for page in range(1, page_count + 1):
            # A fresh item per request: requests complete asynchronously, so
            # sharing one mutable item would let callbacks overwrite each
            # other's Referer, title and image URL.
            item = MztuItem()
            item['Referer'] = '{}/{}'.format(base_url, page)
            yield scrapy.Request(
                item['Referer'],
                meta={'meta_1': item},
                callback=self.parse_ponse,
            )

    @staticmethod
    def _page_count(response):
        """Return the highest page number shown in the album's pager, or 1."""
        labels = response.xpath('//div[@class="pagenavi"]/a/span/text()').extract()
        # The pager mixes numeric labels with "previous"/"next" captions, so
        # take the largest number rather than relying on a fixed position.
        numbers = [int(label) for label in labels if label.strip().isdecimal()]
        return max(numbers) if numbers else 1

    def parse_ponse(self, response):
        """Fill in the image URL and title on the item passed via meta."""
        item = response.meta['meta_1']
        # Image address.
        image_url = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract_first()
        # The image's alt text is used as the title (target directory name).
        title = response.xpath('//div[@class="main-image"]/p/a/img/@alt').extract_first()
        if not image_url or not title:
            self.logger.warning("No main image found on %s", response.url)
            return
        item["title"] = title
        item["imge_url"] = image_url
        yield item
items
import scrapy


class MztuItem(scrapy.Item):
    """Fields collected for a single picture."""

    # Directory (the title is used as the per-album directory name).
    title = scrapy.Field()

    # Address of the image.
    imge_url = scrapy.Field()

    # Value sent as the Referer request header.
    Referer = scrapy.Field()

    # Path of the image file on disk.
    image_Path = scrapy.Field()

    # Image name (field currently disabled).
    # nickname = scrapy.Field()
pipelines管道
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import re
# shutil is used to move the downloaded file into its final directory.
import shutil

import scrapy
from scrapy.exceptions import DropItem
# Scrapy's built-in image download pipeline.
from scrapy.pipelines.images import ImagesPipeline
# Access to the project settings.
from scrapy.utils.project import get_project_settings

# Characters that are not allowed in a Windows file or directory name.
_INVALID_DIR_CHARS = re.compile(r'[\\/:*?"<>|]')


def _safe_dirname(title):
    """Return *title* with characters that are invalid in a directory name replaced."""
    cleaned = _INVALID_DIR_CHARS.sub('_', title).strip().rstrip('.')
    return cleaned or '_'


class ImagesPipelinse(ImagesPipeline):
    """Download each item's image and file it under a directory named after its title."""

    # Value of IMAGES_STORE read from the project settings.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        The Referer header is sent as an anti-anti-crawler measure.
        """
        yield scrapy.Request(item["imge_url"], headers={'Referer': item['Referer']})

    def item_completed(self, result, item, info):
        """Move the downloaded image into ``IMAGES_STORE/<title>/``.

        Sets ``item['image_Path']`` to the final location and returns the
        item. Raises ``DropItem`` when no image was downloaded successfully.
        """
        downloaded = [x["path"] for ok, x in result if ok]
        if not downloaded:
            # Without this guard a failed download surfaces as an IndexError.
            raise DropItem("Image download failed for %s" % item.get('imge_url'))
        relative_path = downloaded[0]

        # Per-title target directory; created when it does not exist yet.
        target_dir = os.path.join(self.IMAGES_STORE, _safe_dirname(item['title']))
        os.makedirs(target_dir, exist_ok=True)

        # Keep only the file name of the stored path, whichever separator the
        # stored path uses.
        target = os.path.join(target_dir, os.path.basename(relative_path))

        # Move the file from the default download location to the target directory.
        shutil.move(os.path.join(self.IMAGES_STORE, relative_path), target)
        item['image_Path'] = target
        return item
这里实现了将图片保存到不同的目录下,主要使用的函数是 shutil.move(),它将图片从默认下载路径移动到指定目录下。