scrapy之360图片爬取
# 今日目标

**scrapy之360图片爬取**

今天要爬取的是360美女图片,首先分析页面得知网页是动态加载,故需要先找到网页链接规律, 然后调用ImagesPipeline类实现图片爬取

*代码实现*

so.py

```python
# -*- coding: utf-8 -*-
import json

import scrapy

from ..items import SoItem


class SoSpider(scrapy.Spider):
    """Spider that reads the image-list endpoint and emits one item per image.

    The listing is loaded dynamically, so instead of parsing HTML the spider
    requests the paged endpoint directly and decodes the response as JSON.
    """

    name = 'so'
    # Must match the host requested in start_requests() (was misspelled as
    # 'imaeg.os.com', which does not match the host actually requested).
    allowed_domains = ['image.so.com']

    # Overridden so the paged URLs are generated here instead of coming
    # from a start_urls list.
    def start_requests(self):
        """Yield one request per listing page (5 pages, ``sn`` stepping by 30)."""
        url = 'http://image.so.com/zjl?ch=beauty&sn={}&listtype=new&temp=1'
        # Build the addresses of 5 pages and hand them to the scheduler.
        for page in range(5):
            sn = page * 30
            yield scrapy.Request(
                url=url.format(sn),
                callback=self.parse_image,
                dont_filter=False,
            )

    def parse_image(self, response):
        """Decode the JSON body and yield a ``SoItem`` for every image link.

        Entries without a ``qhimg_url`` value, and responses without a
        ``list`` key, are skipped instead of raising ``KeyError``.
        """
        data = json.loads(response.text)
        # Extract the image links.
        for img in data.get('list') or []:
            link = img.get('qhimg_url')
            if not link:
                continue
            item = SoItem()
            item['img_link'] = link
            yield item
```
items.py

```python
import scrapy


class SoItem(scrapy.Item):
    """Item produced by the spider; holds the link of one image."""

    # define the fields for your item here like:
    # Image link
    img_link = scrapy.Field()
```

pipelines.py

```python
# Import Scrapy's image pipeline class
from scrapy.pipelines.images import ImagesPipeline
import scrapy


# 1. Subclass ImagesPipeline
# 2. Override the method inside the class
class SoPipeline(ImagesPipeline):
    """Image pipeline that downloads the URL stored in ``item['img_link']``."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image link."""
        # Send the image link to the scheduler.
        yield scrapy.Request(url=item['img_link'], dont_filter=False)
```

settings.py

```python
# Obey robots.txt rules (disabled for this crawl)
ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 10

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0',
}

# Enable the image pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'So.pipelines.SoPipeline': 300,
}

IMAGES_STORE = '/home/ccc/image/'  # personal save path
```