【Scrapy】settings
settings.py
# Scrapy settings for demospider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'demospider'  # Project name

SPIDER_MODULES = ['demospider.spiders']
NEWSPIDER_MODULE = 'demospider.spiders'

LOG_ENABLED = True  # Enable logging
LOG_FILE = 'log.txt'  # Log file name; None means log to standard error
LOG_LEVEL = 'ERROR'  # Log level, default DEBUG; options are CRITICAL, ERROR, WARNING, INFO and DEBUG

# URL deduplication filter
#DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'

# Retry settings
RETRY_ENABLED = True  # Enable the retry middleware, default True
#RETRY_TIMES = 2  # Number of retries, default 2
#RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 520, 400, 403, 404]  # Status codes to retry; the default covers 500, 502, 503, 504, 522, 524 and 408 (plus 429 in recent versions)
#RETRY_PRIORITY_ADJUST = -1  # Priority of a retry request relative to the original, default -1

# Disable redirects
#REDIRECT_ENABLED = False

# Client User-Agent request header
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

# Whether to obey robots.txt
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests performed by the downloader, default 16
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5  # Delay in seconds between requests to the same site; applied per IP instead of per domain when CONCURRENT_REQUESTS_PER_IP is set
RANDOMIZE_DOWNLOAD_DELAY = True  # Randomize the delay (0.5 to 1.5 times DOWNLOAD_DELAY)
#DOWNLOAD_TIMEOUT = 3  # Downloader timeout, default 180 seconds
#DOWNLOAD_MAXSIZE = 0  # Maximum response size the downloader will accept, in bytes; default 1073741824 (1024 MB)

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16  # Maximum number of concurrent requests per domain
#CONCURRENT_REQUESTS_PER_IP = 16  # Maximum number of concurrent requests per IP

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
#COOKIES_DEBUG = True  # Log all cookies sent and received

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Default headers used for Scrapy HTTP requests
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 530,
    'demospider.middlewares.DemospiderSpiderMiddleware': 542,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'demospider.middlewares.DemospiderDownloaderMiddleware': 410,
    'demospider.middlewares.UserAgentDownloaderMiddleware': 420,
    'demospider.middlewares.TooManyRequestsRetryMiddleware': 430,
    #'demospider.middlewares.ProxyDownloaderMiddleware': 440,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The number is a priority between 0 and 1000; the lower the number, the earlier the pipeline runs
ITEM_PIPELINES = {
    #'demospider.pipelines.DedupPipeline': 300,
    #'demospider.pipelines.ValidatePipeline': 310,
    #'scrapy.pipelines.images.ImagesPipeline': 320,
    #'scrapy.pipelines.files.FilesPipeline': 321,
    #'demospider.pipelines.ImagesDownloadPipeline': 322,
    #'demospider.pipelines.FilesDownloadPipeline': 323,
    'demospider.pipelines.MongodbPipeline': 340,
    #'demospider.pipelines.JsonPipeline': 350,
    #'demospider.pipelines.TxtPipeline': 360,
    #'demospider.pipelines.TsvPipeline': 370,
    #'demospider.pipelines.CsvPipeline': 380,
    #'demospider.pipelines.ExcelPipeline': 390,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True  # Enable automatic throttling (disabled by default)
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60  # Maximum delay (the minimum delay is DOWNLOAD_DELAY)
# The average number of requests Scrapy should be sending in parallel to
# each remote server; should not exceed CONCURRENT_REQUESTS_PER_DOMAIN or CONCURRENT_REQUESTS_PER_IP
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True  # Enable HTTP caching (disabled by default)
#HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'  # Cache policy: every request is cached, and subsequent requests are served from the cache
#HTTPCACHE_EXPIRATION_SECS = 0  # Cache expiration time in seconds
#HTTPCACHE_DIR = 'httpcache'  # Directory where the cache is stored
#HTTPCACHE_IGNORE_HTTP_CODES = []  # HTTP status codes that are never cached
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'  # Cache storage backend

# Maximum allowed crawl depth; the current depth can be read from response.meta['depth']; 0 means no limit
#DEPTH_LIMIT = 3

# Last in, first out: depth-first crawling
#DEPTH_PRIORITY = 0  # 0 means depth-first (LIFO, the default)
#SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
#SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'

# First in, first out: breadth-first crawling
#DEPTH_PRIORITY = 1  # 1 means breadth-first (FIFO)
#SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
#SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

IMAGES_STORE = 'D:/Images'  # Directory where downloaded images are stored
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'
#IMAGES_EXPIRES = 30  # Expiration in days; images fetched within this period are not re-downloaded
#IMAGES_MIN_HEIGHT = 500  # Images smaller than these dimensions are ignored and not downloaded
#IMAGES_MIN_WIDTH = 900
#IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}  # Generate thumbnails; each download produces 3 files (1 original, 2 thumbnails)

FILES_STORE = 'D:/Files'  # Directory where downloaded files are stored
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'
#FILES_EXPIRES = 30  # Expiration in days

#FEED_URI = 'D:/Files'  # e.g. ftp://user:password@ftp.example.com/scraping/feeds/%(name)s/%(time)s.json
#FEED_EXPORT_ENCODING = 'utf-8'  # Encoding of the exported file
#FEED_FORMAT = 'csv'  # Export data format
#FEED_EXPORT_FIELDS = []  # List of fields to export
#FEED_EXPORT_INDENT = 0  # Number of spaces used for indentation; default 0 (each item on a new line), None selects the most compact output
#FEED_STORE_EMPTY = False  # Whether to export empty feeds, default False

# Feed storage backends
FEED_STORAGES = {
    #'': 'scrapy.extensions.feedexport.FileFeedStorage',
    #'file': 'scrapy.extensions.feedexport.FileFeedStorage',  # Feeds are stored in the local filesystem
    #'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',  # Standard output
    #'s3': 'scrapy.extensions.feedexport.S3FeedStorage',  # Feeds are stored on Amazon S3
    #'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',  # Feeds are stored on an FTP server
}

# Serialization formats
FEED_EXPORTERS = {
    #'json': 'scrapy.exporters.JsonItemExporter',
    #'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    #'jl': 'scrapy.exporters.JsonLinesItemExporter',
    #'csv': 'scrapy.exporters.CsvItemExporter',
    #'xml': 'scrapy.exporters.XmlItemExporter',
    #'marshal': 'scrapy.exporters.MarshalItemExporter',
    #'pickle': 'scrapy.exporters.PickleItemExporter',
}

# MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DB = 'db_demospider'
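The only pipeline enabled above, demospider.pipelines.MongodbPipeline, is a custom class rather than something shipped with Scrapy, and its code is not shown in this post. A minimal sketch of what it might look like, following the usual Scrapy pipeline pattern: it assumes pymongo is installed, reads the MONGO_URI and MONGO_DB settings defined above, and uses the spider name as the collection name (that last choice is an assumption, not stated in the original).

pipelines.py (sketch)
import pymongo
from itemadapter import ItemAdapter


class MongodbPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        # Connect once when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Store each item in a collection named after the spider (an assumption)
        self.db[spider.name].insert_one(ItemAdapter(item).asdict())
        return item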
settings.py holds the default settings for the whole project.
Individual settings can also be overridden for a single spider:
# -*- coding: utf-8 -*-
import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'
    allowed_domains = ['www.demo.com']
    start_urls = ['https://www.demo.com/index.html']

    # custom_settings overrides the values in settings.py for this spider only;
    # note that the syntax (a dict on the spider class) differs from settings.py
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
    }

    def parse(self, response):
        pass
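To confirm which value actually took effect after the override, the spider can read the merged settings at runtime through self.settings; -s options passed on the command line take precedence over custom_settings, which in turn takes precedence over settings.py. A small sketch based on the spider above (the logged key is just an illustration):

import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'
    allowed_domains = ['www.demo.com']
    start_urls = ['https://www.demo.com/index.html']

    custom_settings = {
        'ROBOTSTXT_OBEY': True,
    }

    def parse(self, response):
        # self.settings holds the merged result: command line > custom_settings > settings.py
        self.logger.info('ROBOTSTXT_OBEY = %s', self.settings.getbool('ROBOTSTXT_OBEY'))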