Crawler: downloading images (using Scrapy's built-in ImagesPipeline class)
Target page for the image crawl: http://699pic.com/collectInfo/273785
Reference used while troubleshooting: https://blog.csdn.net/loner_fang/article/details/81111879
Problem code 1
imgpro.py
# -*- coding: utf-8 -*-
import scrapy

from tupian.items import TupianItem
from scrapy.selector.unified import SelectorList


class ImgproSpider(scrapy.Spider):
    name = 'imgpro'
    # allowed_domains = ['699pic.com']
    start_urls = ['http://699pic.com/collectInfo/273785']

    def parse(self, response):
        li_ul = response.xpath('/html/body/div[11]/div/ul')
        # print(type(li_ul))
        for li in li_ul:
            img_src = li.xpath('./li/a/img/@src').extract()
            # print(img_src)
            if img_src:
                item = TupianItem()
                item['src'] = img_src
                yield item
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# class TupianPipeline(object):
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class TupianPipeline(ImagesPipeline):

    # issue the download request for a media resource;
    # item is the item submitted by the spider
    def get_media_requests(self, item, info):
        # BUG: item['src'] is a list of URLs, but the url argument of
        # scrapy.Request must be a single string
        yield scrapy.Request(item['src'])

    # set the file name under which the media data is stored
    def file_path(self, request, response=None, info=None):
        name = request.url.split('/')[-1]
        print("downloading......", name)
        return name

    # hand the item on to the next pipeline class in line
    def item_completed(self, results, item, info):
        return item
Error type
Analysis of the causes:
1. When parsing the response in the spider, the XPath that extracts the image download addresses initially read the @src attribute. After the fix it reads @data-original instead, as the corrected spider listing below shows. Stepping through the code and checking the page source revealed that @src in the raw source is always http://static.699pic.com/images/blank.png, a placeholder; the src seen in the browser's Elements panel is the value after JavaScript rendering, which is why @data-original was used in the end. This confirms that the response a crawler receives is the raw page source: before scraping, verify that the source and the Elements panel agree on an element and its value, and only use the Elements version directly when they match (a scrapy shell check for this is sketched after this list).
2. get_media_requests() has to iterate over every URL in image_urls (here the item's src list) and yield a Request for each, so that the scheduler and downloader can fetch the images. Once downloading finishes, the results are passed to item_completed() as a list of 2-tuples of (success flag, file info).
The faulty line was yield scrapy.Request(item['src']): it hands the whole list to scrapy.Request, whose url argument must be a single string. The fix is to loop over the list and yield one Request per URL, as in the corrected pipelines.py below.
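Before the corrected listings, the rendering mismatch from point 1 can be double-checked in scrapy shell, which sees the same raw source as the spider does. A minimal sketch; the placeholder URL is the one found in the analysis above, and the outputs shown are illustrative rather than captured from a real session:

$ scrapy shell "http://699pic.com/collectInfo/273785"
>>> response.xpath('//li/a/img/@src').extract_first()
'http://static.699pic.com/images/blank.png'  # placeholder in the raw source
>>> response.xpath('//li/a/img/@data-original').extract_first()
# the real image URL, which the Elements panel only shows in src after JS runs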
imgpro.py (corrected)
# -*- coding: utf-8 -*-
import scrapy

from tupian.items import TupianItem
from scrapy.selector.unified import SelectorList


class ImgproSpider(scrapy.Spider):
    name = 'imgpro'
    # allowed_domains = ['699pic.com']
    start_urls = ['http://699pic.com/collectInfo/273785']

    def parse(self, response):
        li_ul = response.xpath('/html/body/div[11]/div/ul')
        # print(type(li_ul))
        for li in li_ul:
            img_src = li.xpath('./li/a/img/@data-original').extract()
            # print(img_src)
            if img_src:
                item = TupianItem()
                item['src'] = img_src
                yield item
pipelines.py (corrected)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# class TupianPipeline(object):
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline
import scrapy


class TupianPipeline(ImagesPipeline):

    # issue one download request per media resource;
    # item is the item submitted by the spider
    def get_media_requests(self, item, info):
        # fixed: iterate over the list and yield one Request per URL
        for image_url in item['src']:
            yield scrapy.Request(image_url)

    # set the file name under which the media data is stored
    def file_path(self, request, response=None, info=None):
        name = request.url.split('/')[-1]
        print("downloading......", name)
        return name

    # hand the item on to the next pipeline class in line
    def item_completed(self, results, item, info):
        return item
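The item_completed() override above simply forwards the item. Since results arrives as the list of 2-tuples described in point 2, it can also be used to drop items whose images all failed to download. A minimal sketch following the result format documented for Scrapy's media pipelines; the class name is hypothetical and not part of the original project:

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CheckedTupianPipeline(ImagesPipeline):

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples;
        # on success, file_info is a dict with 'url', 'path' and 'checksum'
        image_paths = [file_info['path'] for ok, file_info in results if ok]
        if not image_paths:
            raise DropItem("no image was downloaded for this item")
        return item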
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TupianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    src = scrapy.Field()
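Note that the stock ImagesPipeline reads URLs from item['image_urls'] and writes results to item['images'] by default; the custom src field only works here because get_media_requests is overridden. If you would rather keep the stock pipeline, Scrapy's documented field-mapping settings offer an alternative. A sketch of that variant, not what this project does:

# settings.py -- alternative to overriding get_media_requests
IMAGES_URLS_FIELD = 'src'       # field the pipeline reads URLs from
IMAGES_RESULT_FIELD = 'images'  # field the results are written back to
# the item would then also need: images = scrapy.Field()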
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for tupian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tupian'

SPIDER_MODULES = ['tupian.spiders']
NEWSPIDER_MODULE = 'tupian.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tupian (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

IMAGES_STORE = './imgsLib'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tupian.middlewares.TupianSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tupian.middlewares.TupianDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'tupian.pipelines.TupianPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
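Besides IMAGES_STORE, the images pipeline accepts a few optional settings that are often worth enabling; these are documented Scrapy options that the original project does not set:

# optional extras for the images pipeline (documented Scrapy settings)
IMAGES_THUMBS = {
    'small': (50, 50),    # also save thumbnails under thumbs/small/ etc.
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 110   # silently skip images smaller than this
IMAGES_MIN_WIDTH = 110
IMAGES_EXPIRES = 90       # days before an already-downloaded image is refetched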