scrapy--dbmeinv

Posted on 2018-08-01 15:08 by eilinge

This is my first time sharing one of my spider projects: scraping images from the Douban beauty pages. It is fairly simple but practical; the goal is mainly to show the approach and help you strengthen your own scraping skills. I hope it helps!

Alright, let's get to the point.

First, a look at the results, to encourage you to get started right away.

1. Spider: Dbmeinv.py

# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.linkextractors import LinkExtractor

from dbmeinv.items import DbmeinvItem


class DbmeinvSpider(scrapy.Spider):
    name = 'Dbmeinv'
    allowed_domains = ['www.dbmeinv.com']
    start_urls = ['https://www.dbmeinv.com/index.htm?cid=6',
                  'https://www.dbmeinv.com/index.htm?cid=7',
                  'https://www.dbmeinv.com/index.htm?cid=3']

    def parse(self, response):
        # extract links to the image detail pages on the current page
        le = LinkExtractor(restrict_css='ul.thumbnails')

        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_images)

        # extract the next-page link
        le1 = LinkExtractor(restrict_css='li.next_page')
        link1 = le1.extract_links(response)

        if link1:
            yield scrapy.Request(link1[0].url, callback=self.parse)

    def parse_images(self, response):
        meinv = DbmeinvItem()
        
        # This site is a bit tricky: the photo's <img src> sits under three
        # different parent tags depending on the page. If the spider errors
        # out on a page, open it and check which of the three layouts it uses.
        if response.xpath('//div[@class="image-wrapper"]/img/@src').extract():
            url1 = response.xpath('//div[@class="image-wrapper"]/img/@src').extract()[0]
            meinv['images_url'] = url1
            image_name = re.findall(r'large/(.+?\.jpg)',url1)
            meinv['images'] = image_name[0]

        if response.xpath('//div[@class="panel-body markdown"]//img/@src'):
            url2 = response.xpath('//div[@class="panel-body markdown"]//img/@src').extract()[0]
            meinv['images_url'] = url2
            image_name = re.findall(r'large/(.+?\.jpg)', url2)
            meinv['images'] = image_name[0]

        if response.xpath('//div[@class="topic-detail panel panel-default"]//img/@src'):
            url3 = response.xpath('//div[@class="topic-detail panel panel-default"]//img/@src').extract()[1]
            meinv['images_url'] = url3
            image_name = re.findall(r'large/(.+?\.jpg)', url3)
            meinv['images'] = image_name[0]

        yield meinv    
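
Since each detail page matches exactly one of the three layouts, the three branches can also be folded into a table-driven loop. A minimal sketch, behavior-equivalent under the assumption (taken from the original code) that the third layout's photo is the second img on the page; parse_images_compact is a hypothetical name, not part of the project:

    # inside DbmeinvSpider: (xpath, index) pairs mirror the three layouts above
    IMG_XPATHS = [
        ('//div[@class="image-wrapper"]/img/@src', 0),
        ('//div[@class="panel-body markdown"]//img/@src', 0),
        ('//div[@class="topic-detail panel panel-default"]//img/@src', 1),
    ]

    def parse_images_compact(self, response):
        meinv = DbmeinvItem()
        for xpath, index in self.IMG_XPATHS:
            srcs = response.xpath(xpath).extract()
            if len(srcs) > index:
                url = srcs[index]
                meinv['images_url'] = url
                # same regex as above; like the original, assumes 'large/...jpg' is present
                meinv['images'] = re.findall(r'large/(.+?\.jpg)', url)[0]
        yield meinv

Either way, run the spider from the project root with scrapy crawl Dbmeinv.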

2. Items: items.py

import scrapy

class DbmeinvItem(scrapy.Item):

    images_url = scrapy.Field()
    images     = scrapy.Field()
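
Note that Scrapy's stock ImagesPipeline looks for fields named image_urls and images by default (a list of URLs in, download results out). Because this project uses its own field names, the pipeline in the next step overrides get_media_requests to tell Scrapy where the URL lives. For comparison, a hypothetical default-field item (not used in this project) would need no override at all:

class DbmeinvDefaultItem(scrapy.Item):
    image_urls = scrapy.Field()   # list of image URLs; read by the stock ImagesPipeline
    images = scrapy.Field()       # filled by ImagesPipeline with the download results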

3. Pipelines: pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline  # ImagesPipeline downloads and stores the images for us
from scrapy.exceptions import DropItem

class DbmeinvPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):      # request the URL in images_url for download
        yield scrapy.Request(item['images_url'])

    def item_completed(self, results, item, info):    # verify that the image was actually stored
        images_paths = [x['path'] for ok,x in results if ok]

        if not images_paths:
            raise DropItem("Item contains no images")

        return item
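
For reference, the results argument of item_completed is a list of (success, info) 2-tuples, one per request yielded by get_media_requests. A successful download looks roughly like this (the values are illustrative; 'path' is relative to IMAGES_STORE, and the checksum-based file name under full/ is Scrapy's default):

results = [
    (True, {
        'url': 'https://www.dbmeinv.com/.../example.jpg',   # hypothetical URL
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',
    }),
]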


class DuplicatesPipeline(object):            # deduplicates items by image file name
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['images'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['images'])
            return item
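
A quick way to see the deduplication outside of a crawl (a standalone sketch; the item values are made up):

pipeline = DuplicatesPipeline()
first = DbmeinvItem(images_url='https://example.org/a.jpg', images='a.jpg')
second = DbmeinvItem(images_url='https://example.org/a.jpg', images='a.jpg')

pipeline.process_item(first, spider=None)    # passes through and remembers 'a.jpg'
pipeline.process_item(second, spider=None)   # raises DropItem: duplicate image name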

4. Settings: settings.py

IMAGES_STORE = r'C:\Users\Desktop\dbmeinv'     # directory where downloaded images are stored (Scrapy saves them under a full/ subfolder)

USER_AGENT = {       # pool of browser User-Agent strings to avoid HTTP 403; see the rotation note after the settings
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
}

CONCURRENT_REQUESTS = 16    # up to 16 concurrent requests
DOWNLOAD_DELAY = 0.2        # wait 0.2 s between consecutive requests to the same site
ROBOTSTXT_OBEY = False      # ignore robots.txt
COOKIES_ENABLED = False     # disable cookies

ITEM_PIPELINES = {      # pipeline priorities (0-1000): lower numbers run first
    'dbmeinv.pipelines.DbmeinvPipeline': 1,
    'dbmeinv.pipelines.DuplicatesPipeline': 200,
}
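
One caveat about the User-Agent pool above: Scrapy's built-in USER_AGENT setting expects a single string, so a set like this is not rotated automatically. To rotate per request you need a small downloader middleware. A minimal sketch, assuming it lives in dbmeinv/middlewares.py (RandomUserAgentMiddleware is a hypothetical name, not part of the original project):

# middlewares.py
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = list(user_agents)

    @classmethod
    def from_crawler(cls, crawler):
        # reuse the USER_AGENT pool defined in settings.py
        return cls(crawler.settings.get('USER_AGENT'))

    def process_request(self, request, spider):
        # pick a fresh User-Agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

Enable it in settings.py and disable the stock middleware:

DOWNLOADER_MIDDLEWARES = {
    'dbmeinv.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}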

If you run into any problems, feel free to ask. Let's improve together!