Web scraping: my own notes
scrapy
custom_settings
class XxSpider(scrapy.Spider):
    name = 'xxxx'
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'cl.pipelines.XxxxxPipeline': 400,
            'cl.pipelines.GifPipeline': 400,
        }
    }
# Base class
class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from
    this class.
    """

    name = None
    custom_settings = None

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')
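So update_settings merges custom_settings into the crawler settings at 'spider' priority, which means a per-spider value beats the project-wide one from settings.py. A minimal sketch (hypothetical spider name and URL) to illustrate:

# A per-spider override: this spider crawls with DOWNLOAD_DELAY = 5
# regardless of what settings.py says; other spiders are unaffected.
import scrapy

class SlowSpider(scrapy.Spider):
    name = 'slow'
    start_urls = ['https://example.com']
    custom_settings = {
        'DOWNLOAD_DELAY': 5,  # merged at 'spider' priority by update_settings
    }

    def parse(self, response):
        # self.settings already reflects the merged value here
        self.logger.info('delay: %s', self.settings.get('DOWNLOAD_DELAY'))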
A pipeline that supports GIFs
import requests
import os
import hashlib
import time


class GifPipeline(object):
    def __init__(self, images_store, download_delay):
        # if not os.path.exists('imgs'):
        #     os.makedirs('imgs')
        self.download_delay = download_delay
        self.images_store = images_store

    def process_item(self, item, spider):
        for url in item["img_urls"]:
            print(url)
            # keep the original extension (e.g. .gif); default to jpg
            suffix = "jpg" if "." not in url else url.rsplit(".", 1)[-1]
            _file_name = hashlib.sha1(bytes(url, encoding="utf-8")).hexdigest()
            title = item["title"]
            file_name = '{}{}.{}'.format(title, _file_name, suffix)
            response = requests.get(url, stream=True)
            with open(os.path.join(self.images_store, file_name), mode='wb') as f:
                f.write(response.content)
            time.sleep(self.download_delay)
        return item

    @classmethod
    def from_crawler(cls, crawler):
        """Called once at startup to create the pipeline object."""
        download_delay = crawler.settings.get('DOWNLOAD_DELAY')
        images_store = crawler.settings.get('IMAGES_STORE')
        return cls(images_store, download_delay)

A custom pipeline; I made some modifications so that it supports GIFs.
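For the pipeline above to work, the settings it reads in from_crawler must actually be defined. A minimal settings.py sketch (the values and the project name 'cl' are assumptions carried over from the example above):

# settings.py
IMAGES_STORE = 'imgs'    # the directory must exist before the spider runs
DOWNLOAD_DELAY = 0.5     # also reused by GifPipeline between image downloads
ITEM_PIPELINES = {
    'cl.pipelines.GifPipeline': 400,
}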
1. items.py

import scrapy


class HupuGifItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    hupu_image_url = scrapy.Field()
    images = scrapy.Field()

2. pipelines.py

# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from hupu_gif import settings
import requests
import os


class HupuGifPipeline(ImagesPipeline):
    def process_item(self, item, spider):
        if 'hupu_image_url' in item:
            images = []
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['hupu_image_url']:
                us = image_url.split('/')[-1]
                file_path = '%s/%s' % (dir_path, us)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue
                with open(file_path, 'wb') as handle:
                    response = requests.get('http:' + image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            item['images'] = images
        return item

Source: Lee007008 (CSDN), https://blog.csdn.net/qaz2170/article/details/61417514

This is someone else's code. I don't understand what the item field images is for here (it ends up as the list of file_paths); my guess is that it is passed along so a later pipeline can do the persistence, see the sketch below.
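If that guess is right, a later pipeline in ITEM_PIPELINES would consume item['images']. A minimal sketch of such a follow-up pipeline (the class name and manifest file are made up for illustration, not from the original post):

import json

class ImageManifestPipeline(object):
    """Hypothetical follow-up pipeline: records the saved file paths
    from item['images'] into a JSON-lines manifest."""

    def open_spider(self, spider):
        self.f = open('image_manifest.jsonl', 'a', encoding='utf-8')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        if item.get('images'):
            self.f.write(json.dumps({'images': item['images']},
                                    ensure_ascii=False) + '\n')
        return item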
Miscellaneous
Hotlink protection
Some sites won't let an img tag's src hotlink their resources from an external page.
Solution: point the img's src at your own backend, and have the backend send the request to fetch the actual data. You may also need to supply some cookie and Referer fields; a sketch of this follows below.
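A minimal sketch of that backend-proxy idea, assuming Flask and placeholder header values; the point is that your backend, not the browser, sends the Referer/cookies the image host checks:

# Hypothetical image proxy. The page embeds <img src="/img?u=...">,
# which hits our backend; the backend fetches the real image while
# presenting the Referer the target site expects.
from flask import Flask, request, Response
import requests

app = Flask(__name__)

@app.route('/img')
def img():
    url = request.args.get('u')
    upstream = requests.get(
        url,
        headers={
            'Referer': 'https://example.com/',  # whatever the image host checks
            'User-Agent': 'Mozilla/5.0',
        },
        # cookies={'session': '...'},  # add if the host also checks cookies
        stream=True,
    )
    return Response(upstream.iter_content(8192),
                    content_type=upstream.headers.get('Content-Type'))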
Request headers
User-Agent: identifies the device/browser the user is on
Referer: "xxx"  # the URL the request navigated from
Content-Type: application/json vs. Content-Type: application/x-www-form-urlencoded (JSON body vs. form body)
Host
requests.get('www....')
Cookies are crucial.
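Putting the headers above together, a sketch with placeholder values showing how to set them with requests, and where the two Content-Type values come from:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0',            # pretend to be a real browser
    'Referer': 'https://example.com/list',  # the page we "came from"
    'Host': 'example.com',
}
cookies = {'sessionid': 'xxx'}  # placeholder; often copied from the browser

# GET with headers + cookies
resp = requests.get('https://example.com/detail',
                    headers=headers, cookies=cookies)

# POST: json= sends Content-Type: application/json;
# data= with a dict sends Content-Type: application/x-www-form-urlencoded
requests.post('https://example.com/api', json={'k': 'v'})
requests.post('https://example.com/api', data={'k': 'v'})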