爬虫_自己写的笔记

scrapy

custom_settings

class XxSpider(scrapy.Spider):
    """Example spider overriding project-wide settings for itself only."""

    name = 'xxxx'

    # Per-spider override: only GifPipeline (priority 400) runs for this
    # spider, regardless of the project's ITEM_PIPELINES setting.
    custom_settings = {
        'ITEM_PIPELINES': {
            'cl.pipelines.GifPipeline': 400,
        },
    }
使用custom_settings
# 基类

class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    # Spider identifier and optional per-spider settings overrides.
    name = None
    custom_settings = None

    @classmethod
    def update_settings(cls, settings):
        # Merge this spider's custom_settings (if any) into the crawler
        # settings at 'spider' priority so they win over project settings.
        overrides = cls.custom_settings or {}
        settings.setdict(overrides, priority='spider')
原理_针对每一个spider进行更新

 

支持gif的pipeline

import requests
import os
import hashlib
import time

class GifPipeline(object):
    """Item pipeline that downloads every URL in item["img_urls"] to disk.

    Unlike Scrapy's stock ImagesPipeline it writes the raw bytes untouched,
    so animated GIFs survive re-encoding.
    """

    def __init__(self, images_store, download_delay):
        # NOTE(review): the IMAGES_STORE directory is assumed to already
        # exist -- confirm with the deployment setup.
        self.images_store = images_store
        # Seconds to wait between downloads (re-uses Scrapy's DOWNLOAD_DELAY).
        self.download_delay = download_delay

    def process_item(self, item, spider):
        """Download each image in item["img_urls"]; return the item unchanged."""
        for url in item["img_urls"]:
            # Derive the extension from the URL *path* only.  The old code
            # looked for "." anywhere in the URL, so a host like "site.com"
            # with an extension-less path produced "com/image" as the
            # suffix -- a filename containing a path separator.
            path = url.split("?", 1)[0].split("#", 1)[0]
            tail = path.rsplit("/", 1)[-1]
            suffix = tail.rsplit(".", 1)[-1] if "." in tail else "jpg"
            # Hash the URL so identical images get stable, collision-safe names.
            url_hash = hashlib.sha1(bytes(url, encoding="utf-8")).hexdigest()
            # NOTE(review): item["title"] may contain os.sep characters --
            # verify it is sanitized upstream.
            file_name = '{}{}.{}'.format(item["title"], url_hash, suffix)
            response = requests.get(url, stream=True)
            # Write in chunks so large GIFs are never held fully in memory
            # (reading response.content would defeat stream=True).
            with open(os.path.join(self.images_store, file_name), mode='wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
            time.sleep(self.download_delay)
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Pipeline factory: pull configuration from the crawler's settings."""
        download_delay = crawler.settings.get('DOWNLOAD_DELAY')
        images_store = crawler.settings.get('IMAGES_STORE')
        return cls(images_store, download_delay)

自定义pipeline,做了一些修改,支持gif
自己写的支持gif的pipeline,粗糙
一、items.py




import scrapy


class HupuGifItem(scrapy.Item):
    # Item holding scraped image URLs and the local paths they were saved to.
    # define the fields for your item here like:
    # name = scrapy.Field()
    hupu_image_url = scrapy.Field()  # list of image URLs (presumably protocol-relative -- verify against spider)
    images = scrapy.Field()  # local file paths, populated by HupuGifPipeline.process_item

二、pipelines.py



# -*- coding: utf-8 -*-

from scrapy.pipelines.images import ImagesPipeline
from hupu_gif import settings
import requests
import os


class HupuGifPipeline(ImagesPipeline):
    """Downloads item['hupu_image_url'] images by hand (via requests) so the
    GIF bytes are stored as-is, and records the saved paths in item['images'].
    """

    def process_item(self, item, spider):
        # Bug fix: the original initialised `images` only inside the
        # membership test but used it unconditionally below, raising
        # NameError for items without 'hupu_image_url'.  Skip such items.
        if 'hupu_image_url' not in item:
            return item

        images = []

        # One sub-directory per spider under IMAGES_STORE.
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['hupu_image_url']:
            us = image_url.split('/')[-1]
            file_path = '%s/%s' % (dir_path, us)
            images.append(file_path)
            if os.path.exists(file_path):
                continue  # already downloaded on a previous run
            # NOTE(review): the 'http:' prefix assumes protocol-relative
            # URLs ("//host/path") -- confirm against the spider's output.
            with open(file_path, 'wb') as handle:
                response = requests.get('http:' + image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)

        item['images'] = images
        return item
--------------------- 
作者:Lee007008 
来源:CSDN 
原文:https://blog.csdn.net/qaz2170/article/details/61417514 
版权声明:本文为博主原创文章,转载请附上博文链接!

别人写的,不明白这里类字段images是什么用的,这个file_path的列表,推测是为后面的pipeline做后续做持久化
别人写的支持gif的pipeline,同样粗糙

 

其他

防盗链

img标签不让src外链的东西

解决:img的src为自己的后台,再由后台发请求获取数据。可能也需要一些cookie,referer的字段

 

请求头

user-agent: 当前用户使用的设备
Referer: "xxx" # 从什么url转过来的
content-type: application/json,Content-Type:application/x-www-form-urlencoded
host
  requests.get('www....')


cookies关键

 

posted @ 2018-11-05 11:29  fat39  阅读(201)  评论(0编辑  收藏  举报