Crawling desktop wallpaper images with the Scrapy framework
Target data: ZOL desktop wallpapers — every image in each album across the 19 pages of the [风景] (scenery) [1920*1080] category.
items.py

import scrapy


class Zol2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()   # URL of the full-size image, filled in by the spider
    images = scrapy.Field()       # download results written back by ImagesPipeline

    image_title = scrapy.Field()  # sequential number, used as the saved file name
pipelines.py

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    # num = 1
    def get_media_requests(self, item, info):
        # item["image_urls"] holds a single URL string (see the spider below)
        image_url = item["image_urls"]
        if image_url:
            # self.num + 1
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        # name each downloaded image after the item's image_title, e.g. desk/1.jpg
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
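The block between the "deprecation warning" comments is copied verbatim from the old ImagesPipeline source and only exists to keep long-deprecated file_key()/image_key() overrides working; the actual customization is the final return statement. A trimmed-down sketch of the same pipeline, assuming no such legacy overrides are present (a simplification for illustration, not the exact code that produced the results below):

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # item["image_urls"] is a single URL string here, not the usual list
        image_url = item["image_urls"]
        if image_url:
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        # save as desk/<image_title>.jpg relative to IMAGES_STORE
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])

Either way, the downloaded files end up under IMAGES_STORE (set in settings.py) as desk/1.jpg, desk/2.jpg, and so on.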
middlewares.py

from scrapy import signals
from zol2.useragents import agents


class Zol2SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Zol2DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
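middlewares.py imports agents from zol2.useragents but the generated template classes never use it; presumably the intent was to rotate user agents per request. A minimal sketch of such a downloader middleware, assuming agents is simply a list of user-agent strings (this class does not appear in the published project):

import random

from zol2.useragents import agents  # assumed: a plain list of user-agent strings


class RandomUserAgentMiddleware(object):
    """Set a randomly chosen User-Agent header on every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(agents)

To take effect it would also have to be registered in DOWNLOADER_MIDDLEWARES; as published, the project simply relies on the single fixed USER_AGENT in settings.py.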
settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for zol2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zol2'

SPIDER_MODULES = ['zol2.spiders']
NEWSPIDER_MODULE = 'zol2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'zol2.pipelines.Zol2Pipeline': 300,
}
IMAGES_STORE = "/home/pyvip/env_spider/zol2/zol2/images"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
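If the random user-agent middleware sketched above were used, it would be registered here as well, along the lines of the snippet below (hypothetical, not part of the published settings):

DOWNLOADER_MIDDLEWARES = {
    'zol2.middlewares.RandomUserAgentMiddleware': 543,  # hypothetical class, see middlewares.py note
}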
pazol2.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zol2.items import Zol2Item


class Pazol2Spider(CrawlSpider):
    name = 'pazol2'
    # allowed_domains = ['desk.zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/fengjing/1920x1080/']
    front_url = "http://desk.zol.com.cn"
    num = 1

    rules = (
        # 1. follow the pagination links (pages 1-19 of the category)
        Rule(LinkExtractor(allow=r'/fengjing/1920x1080/[0-1]?[0-9]?.html'), follow=True),
        # 2. enter every picture page of each album
        Rule(LinkExtractor(allow=r'/bizhi/\d+_\d+_\d+.html',
                           restrict_xpaths=("//div[@class='main']/ul[@class='pic-list2 clearfix']/li",
                                            "//div[@class='photo-list-box']")),
             follow=True),
        # 3. follow each picture's 1920*1080 button to the full-size image page
        Rule(LinkExtractor(allow=r'/showpic/1920x1080_\d+_\d+.html'), callback='get_img', follow=True),
    )

    def get_img(self, response):
        item = Zol2Item()
        # the showpic page contains only the full-size image
        item['image_urls'] = response.xpath("//body/img[1]/@src").extract_first()
        # number the images sequentially; the pipeline uses this as the file name
        item['image_title'] = str(self.num)
        self.num += 1
        yield item
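One detail worth noting: image_urls is assigned a single string here, while the stock ImagesPipeline expects a list under that field. This only works because get_media_requests is overridden in pipelines.py to handle a bare string. With the unmodified pipeline the assignment would need to be roughly:

item['image_urls'] = [response.xpath("//body/img[1]/@src").extract_first()]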
Crawl results

A total of 4,517 images were crawled, taking 108 minutes.
Set as the desktop wallpaper library, rotating every half hour. Very satisfying.