Scrapy image crawling: multi-level, multi-page, saving into separate folders, replacing the full folder
Recording the complete spider code here; I have already finished crawling the test site.
items.py
```python
import scrapy


class DemoItem(scrapy.Item):
    # define the fields for your item here like:
    # the page's topic/title, used as the folder name in place of "full"
    folder_name = scrapy.Field()
    # img_name = scrapy.Field()  # name of the image; unused since the site provides none
    img_url = scrapy.Field()  # the image URL
```
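By default Scrapy's images pipeline stores every file as `<IMAGES_STORE>/full/<sha1-of-url>.jpg`; keeping the column title in `folder_name` is what lets the `file_path` override further down replace that `full` directory with one folder per column. For illustration, a populated item has this shape (values are made up):

```python
from demo.items import DemoItem

# Illustrative values only -- this is the shape of one yielded item:
item = DemoItem()
item['folder_name'] = 'some-column-title'       # becomes the directory name
item['img_url'] = ['http://example.com/1.jpg']  # must be a list for the pipeline
```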
spider.py
```python
# -*- coding: utf-8 -*-
import scrapy
from demo.items import DemoItem


class LogosSpider(scrapy.Spider):
    name = 'logos'
    allowed_domains = ['tttt8.net']
    # start_urls = ['http://www.tttt8.net/category/legbaby/']
    # start_urls = ['http://www.tttt8.net/category/ugirls/']
    # start_urls = ['http://www.tttt8.net/category/kelagirls/']
    start_urls = ['http://www.tttt8.net/category/xiurenwang/micatruisg/']
    # page is used to build the pagination links for the first-level pages
    page = 1

    def parse(self, response):
        # grab the list of all columns on the page
        li_list = response.xpath('//*[@id="post_container"]/li')
        for li in li_list:
            item = DemoItem()
            # only the column title is extracted here; it names the storage folder later
            item['folder_name'] = li.xpath('./div[2]/h2/a/text()').extract_first()
            # extract the second-level page link for the second-level callback
            next_plink = li.xpath('./div[1]/a/@href').extract_first()
            # pass the item built on the first-level page down via meta
            yield scrapy.Request(url=next_plink, callback=self.parse2, meta={'item': item})
        # first-level pagination; other approaches exist (see the sketch after
        # this file), but I find this construction simpler
        page_list = response.xpath('//div[@class="pagination"]/a/@href').extract()
        # the last link points at the last page
        last_page = page_list[-1]
        # pull the page number out of a URL like .../page/7/; splitting is safer
        # than last_page[-2], which breaks on two-digit page numbers
        max_num = int(last_page.rstrip('/').split('/')[-1])
        # build the next pagination URL
        if self.page <= max_num:
            self.page += 1
            new_page_url = self.start_urls[0] + 'page/' + str(self.page) + '/'
            yield scrapy.Request(url=new_page_url, callback=self.parse)

    def parse2(self, response):
        # receive the item from the first-level page; the paging requests below
        # need it too, otherwise the item is lost and an error is raised
        item = response.meta['item']
        p_list = response.xpath('//*[@id="post_content"]/p/img')
        # extract the image links and hand them to the pipeline for download
        for img in p_list:
            img_url = img.xpath('./@src').extract_first()
            # the brackets are required: the image download machinery expects a list
            item['img_url'] = [img_url]
            yield item
        # second-level pagination; yield a request per page link
        next_page_list = response.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/a/@href').extract()
        for next_page in next_page_list:
            # meta must be passed here too, otherwise the second-level paging
            # errors out -- this took me a long time to discover
            yield scrapy.Request(url=next_page, callback=self.parse2, meta={'item': item})
```
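As the comment in `parse` says, there are other ways to paginate. A minimal sketch of one common alternative: follow the "next" link directly instead of computing the maximum page number. The spider and the XPath selector here are assumptions for illustration, not part of the project:

```python
import scrapy


class FollowNextSpider(scrapy.Spider):
    """Hypothetical variant showing 'next link' pagination only."""
    name = 'follow_next'  # hypothetical spider, not part of the project

    def parse(self, response):
        # ... extract items as in LogosSpider.parse ...
        # The XPath below is an assumption about the pagination markup.
        next_href = response.xpath(
            '//div[@class="pagination"]/a[@class="next"]/@href').extract_first()
        if next_href:
            # response.follow resolves relative URLs and reuses this callback
            yield response.follow(next_href, callback=self.parse)
```

This avoids maintaining a `page` counter on the spider and stops naturally when no "next" link is found.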
settings.py
```python
# -*- coding: utf-8 -*-


BOT_NAME = 'demo'
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'

# storage path and header
IMAGES_STORE = r'D:\pics'  # raw string so the backslash is not treated as an escape
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

DOWNLOAD_DELAY = 0.2
# turn off robots.txt compliance
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,
}
# the item field that holds the image URLs to download
IMAGES_URLS_FIELD = 'img_url'
```
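The images pipeline also honors a few optional built-in settings that can be useful on a site like this; the values below are illustrative, not what I used:

```python
# Optional image-pipeline settings (values illustrative, all built into Scrapy):
IMAGES_EXPIRES = 90      # days before an already-downloaded image is refetched
IMAGES_MIN_WIDTH = 100   # skip images narrower than 100 px
IMAGES_MIN_HEIGHT = 100  # skip images shorter than 100 px
```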
pipelines.py
```python
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DemoPipeline(ImagesPipeline):
    # boilerplate override, no changes needed
    def get_media_requests(self, item, info):
        for img_url in item['img_url']:
            # renaming happens in file_path after download, so pass the item down via meta
            yield scrapy.Request(img_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        folder_name = item['folder_name']
        # img_name = item['img_name']  # unused: the images have no names on the site
        # since the image has no name, take the last segment of the URL as the filename
        image_guid = request.url.split('/')[-1]
        img_name = image_guid
        # {0} is the folder, {1} is the file
        filename = u'{0}/{1}'.format(folder_name, img_name)
        return filename

    # boilerplate override, no changes needed
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item
```
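One caveat worth guarding against: `folder_name` comes straight from a page title, and titles can contain characters that Windows forbids in directory names. A minimal sketch of a sanitizing step; `safe_name` is a hypothetical helper, not part of the original code:

```python
import re


def safe_name(name):
    # Hypothetical helper: replace the characters Windows forbids in
    # directory names (\ / : * ? " < > |) and trim stray whitespace.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# Inside file_path one would then build the path as:
# filename = u'{0}/{1}'.format(safe_name(folder_name), img_name)
```

With all four files in place, the spider runs with `scrapy crawl logos` from the project root.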
Result: each column's images end up in their own folder under D:\pics, replacing the default full directory.