【Scrapy】settings

settings.py

# Scrapy settings for demospider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'demospider'  # Project name

SPIDER_MODULES = ['demospider.spiders']
NEWSPIDER_MODULE = 'demospider.spiders'

LOG_ENABLED = True  # Enable logging
LOG_FILE = 'log.txt'  # Log file name; None sends log output to the console
LOG_LEVEL = 'ERROR'  # Log level, default DEBUG; valid values: CRITICAL, ERROR, WARNING, INFO, DEBUG

# URL deduplication filter class
#DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'

# Allowed status codes and retry settings
RETRY_ENABLED = True  # Enable the retry middleware (default: True)
#RETRY_TIMES = 2  # Number of retry attempts, default 2
#RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 520, 400, 403, 404]  # Status codes to retry; this list overrides Scrapy's built-in default
#RETRY_PRIORITY_ADJUST = -1  # Priority of a retried request relative to the original, default -1

# Disable redirects
#REDIRECT_ENABLED = False

# Client User-Agent request header
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

# Whether to obey robots.txt rules
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests handled by the downloader (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5   # Download delay; applied per domain by default, or per IP when CONCURRENT_REQUESTS_PER_IP is non-zero
RANDOMIZE_DOWNLOAD_DELAY = True  # Randomize the delay between requests
#DOWNLOAD_TIMEOUT = 3  # Downloader timeout, default 180 seconds
#DOWNLOAD_MAXSIZE = 0  # Maximum size of downloaded content in bytes; default is 1073741824 (1024 MB), 0 disables the limit
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16  # Maximum concurrent requests per domain
#CONCURRENT_REQUESTS_PER_IP = 16  # Maximum concurrent requests per IP

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
#COOKIES_DEBUG = True  # Log all cookies sent and received

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Default headers used for Scrapy HTTP requests
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
   'Accept-Encoding': 'gzip, deflate, br',
   'Accept-Language': 'zh-CN,zh;q=0.9',
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
 }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 530,
    'demospider.middlewares.DemospiderSpiderMiddleware': 542,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'demospider.middlewares.DemospiderDownloaderMiddleware': 410,
    'demospider.middlewares.UserAgentDownloaderMiddleware': 420,
    'demospider.middlewares.TooManyRequestsRetryMiddleware': 430,
    #'demospider.middlewares.ProxyDownloaderMiddleware': 440,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The values range from 0 to 1000 and set the pipeline order; lower values run first
ITEM_PIPELINES = {
    #'demospider.pipelines.DedupPipeline': 300,
    #'demospider.pipelines.ValidatePipeline': 310,
    #'scrapy.pipelines.images.ImagesPipeline': 320,  
    #'scrapy.pipelines.files.FilesPipeline': 321,  
    #'demospider.pipelines.ImagesDownloadPipeline': 322,   
    #'demospider.pipelines.FilesDownloadPipeline': 323,
    'demospider.pipelines.MongodbPipeline': 340,
    #'demospider.pipelines.JsonPipeline': 350,
    #'demospider.pipelines.TxtPipeline': 360,
    #'demospider.pipelines.TsvPipeline': 370,
    #'demospider.pipelines.CsvPipeline': 380,
    #'demospider.pipelines.ExcelPipeline': 390,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True  # Enable AutoThrottle (default: disabled)
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5   # Initial delay
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60  # Maximum delay (the minimum delay is DOWNLOAD_DELAY)
# The average number of requests Scrapy should be sending in parallel to
# each remote server; it cannot exceed CONCURRENT_REQUESTS_PER_DOMAIN or
# CONCURRENT_REQUESTS_PER_IP
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True  # Enable HTTP caching (default: disabled)
#HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"  # Cache policy: every request is cached; repeat requests are served straight from the cache
#HTTPCACHE_EXPIRATION_SECS = 0  # Cache expiration time in seconds
#HTTPCACHE_DIR = 'httpcache'  # Cache directory
#HTTPCACHE_IGNORE_HTTP_CODES = []  # HTTP status codes that are never cached
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'  # Cache storage backend

# Maximum crawl depth; the current depth is available via response.meta['depth']; 0 means no limit
# DEPTH_LIMIT = 3

# Last in, first out: depth-first crawling
# DEPTH_PRIORITY = 0    # 0 means depth-first / LIFO (default)
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'

# First in, first out: breadth-first crawling
# DEPTH_PRIORITY = 1    # 1 means breadth-first / FIFO
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

IMAGES_STORE = 'D:/Images'  # Directory where downloaded images are stored
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'
#IMAGES_EXPIRES = 30  # Expiration in days; images downloaded within this window are not re-downloaded
#IMAGES_MIN_HEIGHT = 500  # Images smaller than these dimensions are ignored and not downloaded
#IMAGES_MIN_WIDTH = 900
#IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}  # Generate thumbnails; each downloaded image yields 3 local files (1 original, 2 thumbnails)

FILES_STORE = 'D:/Files'  # Directory where downloaded files are stored
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'
#FILES_EXPIRES = 30  # Expiration in days

#FEED_URI = 'D:/Files'  # e.g. ftp://user:password@ftp.example.com/scraping/feeds/%(name)s/%(time)s.json
#FEED_EXPORT_ENCODING = 'utf-8'  # Encoding of the exported file
#FEED_FORMAT = 'csv'  # Export format
#FEED_EXPORT_FIELDS = []  # List of fields to export
#FEED_EXPORT_INDENT = 0  # Indent width; 0 (default) or negative puts each item on a new line, None gives the most compact output
#FEED_STORE_EMPTY = False  # Whether to export empty feeds (default: False)
# Feed storage backends
FEED_STORAGES = {
    #'': 'scrapy.extensions.feedexport.FileFeedStorage',
    #'file': 'scrapy.extensions.feedexport.FileFeedStorage',  # Feeds stored on the local filesystem
    #'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',  # Standard output
    #'s3': 'scrapy.extensions.feedexport.S3FeedStorage',  # Feeds stored on Amazon S3
    #'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',  # Feeds stored on an FTP server
}
# Serialization formats (item exporters)
FEED_EXPORTERS = {
    #'json': 'scrapy.exporters.JsonItemExporter',
    #'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    #'jl': 'scrapy.exporters.JsonLinesItemExporter',
    #'csv': 'scrapy.exporters.CsvItemExporter',
    #'xml': 'scrapy.exporters.XmlItemExporter',
    #'marshal': 'scrapy.exporters.MarshalItemExporter',
    #'pickle': 'scrapy.exporters.PickleItemExporter',
}

# MongoDB settings
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DB = 'db_demospider'
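
The ITEM_PIPELINES entry above enables demospider.pipelines.MongodbPipeline, and MONGO_URI / MONGO_DB are the settings it is expected to read. The pipeline code itself is not shown in this post; the following is only a minimal sketch of what such a pipeline could look like, closely following the MongoDB example in the Scrapy item-pipeline documentation (the class body and collection layout are assumptions):

import pymongo


class MongodbPipeline:
    """Hypothetical sketch; the real demospider.pipelines.MongodbPipeline is not shown in the post."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined at the end of settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # One collection per spider; the item is stored as a plain dict
        self.db[spider.name].insert_one(dict(item))
        return item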

 

settings.py holds the default settings for the whole project.
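
DOWNLOADER_MIDDLEWARES above also references a project-specific TooManyRequestsRetryMiddleware whose code is likewise not shown here. As an illustration only, a middleware with that name would typically subclass Scrapy's built-in RetryMiddleware and back off when a server answers 429 Too Many Requests; the sketch below assumes that behavior:

import time

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message


class TooManyRequestsRetryMiddleware(RetryMiddleware):
    """Hypothetical sketch: wait and retry when the server answers HTTP 429."""

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status == 429:
            # Honour Retry-After if present (assumed to be in seconds), else wait 10 s.
            # time.sleep() pauses the whole crawler, which is usually acceptable here
            # because the server is explicitly asking us to slow down.
            delay = int(response.headers.get('Retry-After', 10))
            time.sleep(delay)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response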

Some settings can also be overridden for an individual spider:

# -*- coding: utf-8 -*-
import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'
    allowed_domains = ['www.demo.com']
    start_urls = ['https://www.demo.com/index.html']

    # Overrides the corresponding values in settings.py; note the dict syntax differs from settings.py
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
    }

    def parse(self, response):
        pass
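
custom_settings must be a class attribute, because Scrapy reads it before the spider instance is created. At runtime the merged settings are available on the spider as self.settings, and single values can also be overridden for one run from the command line, e.g. scrapy crawl demo -s LOG_LEVEL=INFO. A small sketch of reading a setting at runtime:

import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'

    def parse(self, response):
        # self.settings holds the merged result of settings.py,
        # custom_settings and any -s command-line overrides
        delay = self.settings.getfloat('DOWNLOAD_DELAY')
        self.logger.info('effective DOWNLOAD_DELAY: %s', delay)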

 
