# Regex for absolute http/https image URLs that end in .jpg, .jpeg or .png.
# "https?" accepts exactly "http" or "https" (never "httpss://..."), and the
# escaped dot before the extension requires a real file suffix, so a path such
# as ".../pngs/list" cannot be truncated into a bogus ".../png" match.
ptt = r"https?://[a-zA-Z0-9./-]+\.(?:jpg|jpeg|png)"
先爬取到图片 url,然后把 url yield 到 pipelines 中
定义图片下载专用的 pipeline;类中 3 个方法的名称是固定的(它们是对 ImagesPipeline 方法的重写),注意图片的命名
class DownloadImagesPipeline(ImagesPipeline):
    """Image pipeline that downloads the URL(s) stored in ``item['image_urls']``.

    The three method names below are fixed: they override the hooks of the
    same name on ``ImagesPipeline``.
    """

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` per image URL found on the item.

        ``item['image_urls']`` may hold a single URL string or an iterable
        of URL strings.
        """
        image_urls = item['image_urls']
        # A bare string is one URL, not an iterable of characters.
        if isinstance(image_urls, str):
            image_urls = [image_urls]
        for image_url in image_urls:
            yield Request(image_url)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to the image store) for a request.

        The name is the SHA-1 hex digest of the request URL, so every image
        gets its own file instead of all of them overwriting one fixed name,
        and the name never contains characters taken from the original URL.
        ``item`` is accepted (and ignored) for Scrapy versions that pass it
        as a keyword argument.
        """
        import hashlib

        image_guid = hashlib.sha1(request.url.encode("utf-8")).hexdigest()
        return f'full/{image_guid}.jpg'

    def item_completed(self, results, item, info):
        """Store the downloaded file paths on the item and return it.

        ``results`` is iterated as ``(ok, data)`` pairs; the ``'path'`` value
        of every successful pair is collected into ``item['image_paths']``.

        Raises:
            DropItem: if no image was downloaded successfully.
        """
        from scrapy.exceptions import DropItem

        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # DropItem is the pipeline-level signal for discarding an item;
            # it is still an Exception subclass.
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
在 settings.py 中添加如下配置:
# Directory containing this settings file; made absolute so IMAGES_STORE does
# not depend on the current working directory.
project_dir = os.path.dirname(os.path.abspath(__file__))
# Directory where downloaded images are stored.
IMAGES_STORE = os.path.join(project_dir, "images")

# Minimum image size settings.
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100

# Item pipelines run in ascending order of their value. The image download
# pipeline gets the lower number so it explicitly runs before
# SpiderjdPipeline instead of relying on the tie-breaking of equal values.
ITEM_PIPELINES = {
    'SpiderJD.pipelines.DownloadImagesPipeline': 200,  # enable the image download pipeline
    'SpiderJD.pipelines.SpiderjdPipeline': 300,
}
运行爬虫,即可将图片下载到指定目录。