Scraping Images

An image-scraping example: Jandan (http://jandan.net).

spider.py:

import scrapy
from jiandan.items import JiandanItem


class JiandanSpider(scrapy.Spider):
    name = 'jiandan'
    allowed_domains = ['jandan.net']
    start_urls = ['http://jandan.net/ooxx']

    def parse(self, response):
        item = JiandanItem()
        # Extract every image URL on the page
        item['image_urls'] = response.xpath('//img//@src').extract()
        yield item
        # Follow the "previous page" link to keep paginating
        new_url = response.xpath('//a[@class="previous-comment-page"]//@href').extract_first()
        if new_url:
            # The href may be relative or protocol-relative, so resolve it first
            yield scrapy.Request(response.urljoin(new_url), callback=self.parse)
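
It is worth sanity-checking the two XPath expressions before wiring up the pipeline; Scrapy's interactive shell makes this quick (the commands below are a suggested check, not output captured from the original post):

scrapy shell "http://jandan.net/ooxx"
>>> response.xpath('//img//@src').extract()[:3]
>>> response.xpath('//a[@class="previous-comment-page"]//@href').extract_first()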

items.py:

import scrapy


class JiandanItem(scrapy.Item):
    # define the fields for your item here like:
    image_urls = scrapy.Field()  # the image links collected by the spider

pipelines.py:

import os
import urllib.request

from jiandan import settings


class JiandanPipeline(object):

    def process_item(self, item, spider):
        # Store images under <IMAGES_STORE>/<spider name>/
        dir_path = os.path.join(settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            # Protocol-relative links (//...) need a scheme before urlopen can fetch them
            if image_url.startswith('//'):
                image_url = 'http:' + image_url
            file_name = image_url.split('/')[-1]  # use the last URL segment as the file name
            file_path = os.path.join(dir_path, file_name)
            if os.path.exists(file_path):  # skip images we already have
                continue
            with open(file_path, 'wb') as file_writer:
                conn = urllib.request.urlopen(image_url)  # download the image
                file_writer.write(conn.read())
        return item
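
A design note: Scrapy ships a built-in ImagesPipeline that consumes exactly this image_urls field and handles downloading, deduplication, and storage itself (it requires Pillow). A minimal sketch of swapping it in for the hand-rolled pipeline above changes only settings.py and the item:

# in settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'E:/'  # the built-in pipeline reads the same setting

# in items.py: ImagesPipeline stores its download results in an 'images' field
images = scrapy.Field()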

settings.py:

BOT_NAME = 'jiandan'

SPIDER_MODULES = ['jiandan.spiders']
NEWSPIDER_MODULE = 'jiandan.spiders'


ITEM_PIPELINES = {
    'jiandan.pipelines.JiandanPipeline': 1,
}
IMAGES_STORE = 'E:/'  # root folder for downloaded images; the trailing slash matters on a Windows drive
DOWNLOAD_DELAY = 0.25  # be polite: pause between requests
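
If jandan.net rejects Scrapy's default user agent, setting a browser-like one in settings.py usually helps; the value below is only an illustrative example, not something from the original post:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # illustrative value, adjust as needed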

To run the crawl, execute this from the project root:

scrapy crawl jiandan
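
Alternatively, the crawl can be driven from a main.py via Scrapy's CrawlerProcess API; a minimal sketch, assuming it is run from the project root so get_project_settings() can locate settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Project settings include SPIDER_MODULES, so the spider can be located by name
process = CrawlerProcess(get_project_settings())
process.crawl('jiandan')
process.start()  # blocks until the crawl finishes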