scrapy源码解读(二)命令行工具
一、前言
在使用scrapy框架开发爬虫时,scrapy命令是必不可少的。例如:使用startproject命令创建一个爬虫项目,使用genspider命令创建一个具体的爬虫,使用crawl命令运行编写完成的爬虫,等等。
scrapy的命令分为全局命令和项目命令。顾名思义,全局命令是不管什么时候都能使用,项目命令只能用于具体已存在的项目上。
二、全局命令
- startproject(创建爬虫项目,一个项目下有一个或多个爬虫 -- scrapy startproject project_name [project_dir])
- genspider(进入爬虫项目目录后创建爬虫;项目目录为project_dir,未指定project_dir时为project_name -- cd project_dir && scrapy genspider spider_name domain_name)
- settings(获取项目配置 -- scrapy settings --get BOT_NAME)
- runspider(运行爬虫 -- scrapy runspider myspider.py)
- shell(scrapy交互终端,在不启动爬虫的情况下调试代码 -- scrapy shell http://www.baidu.com)
- fetch(对指定url发起get请求,并将返回的内容输出到控制台 -- scrapy fetch http://www.baidu.com)
- view(用不到,忽略)
- version(查看scrapy版本 -- scrapy version)
三、项目命令(局部命令)
- crawl(启动爬虫 -- scrapy crawl spider_name)
- check(用不到,忽略)
- list(列出当前项目中的所有爬虫 -- scrapy list)
- edit(修改spider模块的代码,很少用 -- scrapy edit spider_name)
- parse(用不到。。)
- bench(用不到。。)
四、命令源码
以上为scrapy命令行工具的基本介绍,不明白的请对具体命令自行百度。本文的重点是介绍命令的源码。如有错误之处还请指正,scrapy源码解读系列文章会一直更新。
- startproject
import re
import os
import string
from importlib import import_module  # kept from the original import list
from importlib.util import find_spec
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


# Template files to render once the template tree has been copied. Each entry
# is a tuple of path components relative to the project directory;
# ``${project_name}`` is substituted with the real project name in run().
TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

# Names that _copytree() skips while copying the template tree.
IGNORE = ignore_patterns('*.pyc', '.svn')


class Command(ScrapyCommand):
    """``scrapy startproject``: create a new project from the project template."""

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "<project_name> [project_dir]"

    def short_desc(self):
        return "Create new project"

    def _is_valid_name(self, project_name):
        """Check that *project_name* is usable as the project's module name.

        The name must start with a letter or underscore, contain only word
        characters, and must not clash with a module that can already be
        found on the import path. A message is printed when it is rejected.

        :param project_name: project name given on the command line
        :return: True if the name is acceptable, False otherwise
        """
        def _module_exists(module_name):
            # Look the module up without importing it, so that validating a
            # name never executes that module's code as a side effect.
            try:
                return find_spec(module_name) is not None
            except ValueError:
                # find_spec() raises ValueError when the module is already in
                # sys.modules but has no usable __spec__; it exists either way.
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):  # the name is already taken
            print('Error: Module %r already exists' % project_name)
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """Recursively copy every entry of directory *src* into *dst*.

        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.

        More info at:
        https://github.com/scrapy/scrapy/pull/2005

        :param src: source directory
        :param dst: destination directory; created if it does not exist
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        os.makedirs(dst, exist_ok=True)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                self._copytree(srcname, dstname)  # recurse into sub-directory
            else:
                copy2(srcname, dstname)  # copy file content and metadata
        copystat(src, dst)  # copy the directory's own metadata

    def run(self, args, opts):
        """Create the project.

        Steps: validate the argument count, refuse to overwrite an existing
        ``scrapy.cfg``, validate the project name, copy the template tree
        into the project directory, render the template files listed in
        TEMPLATES_TO_RENDER, then print a success message.

        :param args: ``[project_name]`` or ``[project_name, project_dir]``
        :param opts: parsed command-line options (not used here)
        :raises UsageError: if the number of arguments is not 1 or 2
        """
        if len(args) not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]  # defaults to a directory named after the project

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1  # invalid project name: exit with an error status
            return

        self._copytree(self.templates_dir, abspath(project_dir))
        # The template tree ships the package as "module"; rename it.
        move(join(project_dir, 'module'), join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            # string.Template fills in the ${project_name} path component.
            tplfile = join(project_dir,
                           string.Template(path).substitute(project_name=project_name))
            # Render the template file with the project_name / ProjectName
            # placeholders substituted.
            render_templatefile(tplfile, project_name=project_name,
                                ProjectName=string_camelcase(project_name))
        print("New Scrapy project '%s', using template directory '%s', "
              "created in:" % (project_name, self.templates_dir))
        print(" %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print(" cd %s" % project_dir)
        print(" scrapy genspider example example.com")

    @property
    def templates_dir(self):
        """Directory holding the project template tree.

        Uses the TEMPLATES_DIR setting when set, otherwise the ``templates``
        directory inside the installed scrapy package.
        """
        _templates_base_dir = self.settings['TEMPLATES_DIR'] or \
            join(scrapy.__path__[0], 'templates')
        return join(_templates_base_dir, 'project')
- genspider
import os
import shutil
import string
import subprocess
import sys

from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one

    An empty name does not start with a letter either, so it is also
    prefixed (yielding ``"a"``) instead of raising IndexError.
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if not module_name or module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):
    """``scrapy genspider``: generate a spider module from a template."""

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <name> <domain>"

    def short_desc(self):
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        """Register the optional arguments accepted by ``genspider``.

        :param parser: option parser to add the options to
        """
        ScrapyCommand.add_options(self, parser)
        # -l: list the templates that can be used to generate the spider file
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="List available templates")
        # -e: open the generated spider for editing afterwards
        parser.add_option("-e", "--edit", dest="edit", action="store_true",
                          help="Edit spider after creating it")
        # -d: print the given template to standard output
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                          help="Dump template to standard output")
        # -t: template to use; defaults to "basic" (basic.tmpl)
        parser.add_option("-t", "--template", dest="template", default="basic",
                          help="Uses a custom template.")
        # --force: overwrite the spider if it already exists
        parser.add_option("--force", dest="force", action="store_true",
                          help="If the spider already exists, overwrite it with the template")

    def run(self, args, opts):
        """Entry point of the command.

        :param args: positional arguments, ``<name> <domain>`` (not needed
            with ``--list`` or ``--dump``)
        :param opts: parsed command-line options
        :raises UsageError: if a spider is to be generated and the number of
            positional arguments is not exactly 2
        """
        if opts.list:  # -l: list the templates and stop
            self._list_templates()
            return
        if opts.dump:  # -d: print the requested template and stop
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:  # scrapy genspider spider_name domain_name
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:
            # the spider module must not be named like the project
            print("Cannot create a spider with the same name as your project")
            return

        try:
            # A successful load means a spider with this name already exists.
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            pass
        else:
            # if spider already exists and not --force then halt
            if not opts.force:
                print("Spider %r already exists in module:" % name)
                print(" %s" % spidercls.__module__)
                return
        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)
            if opts.edit:
                # Pass an argument list instead of a shell string: the spider
                # name is never interpreted by a shell, and call() returns the
                # child's real exit code rather than os.system()'s raw wait
                # status (which wraps to 0 in sys.exit on POSIX).
                self.exitcode = subprocess.call(
                    [sys.executable, '-m', 'scrapy', 'edit', name])

    def _genspider(self, module, name, domain, template_name, template_file):
        """Generate the spider module, based on the given template.

        The template is copied to ``<spiders_dir>/<module>.py`` and then
        rendered in place with the template variables built below.

        :param module: sanitized module name (file name without ``.py``)
        :param name: spider name as given on the command line
        :param domain: domain given on the command line
        :param template_name: template name, used only in the printed message
        :param template_file: path of the template file to copy
        """
        tvars = {
            'project_name': self.settings.get('BOT_NAME'),
            'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
            'module': module,
            'name': name,
            'domain': domain,
            'classname': '%sSpider' % ''.join(
                s.capitalize() for s in module.split('_')),
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            # No spiders module configured: write into the current directory.
            spiders_module = None
            spiders_dir = "."
        spider_file = "%s.py" % join(spiders_dir, module)
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print("Created spider %r using template %r " % (name, template_name),
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print("in module:\n %s.%s" % (spiders_module.__name__, module))

    def _find_template(self, template):
        """Return the path of the spider template named *template*.

        :param template: template name without the ``.tmpl`` extension
        :return: the template file path, or None (after printing a hint) if
            no such file exists in the templates directory
        """
        template_file = join(self.templates_dir, '%s.tmpl' % template)
        if exists(template_file):
            return template_file
        print("Unable to find template: %s\n" % template)
        print('Use "scrapy genspider --list" to see all available templates.')
        return None

    def _list_templates(self):
        """Print the names of all ``.tmpl`` files in the templates directory."""
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print(" %s" % splitext(filename)[0])

    @property
    def templates_dir(self):
        """Directory holding the spider templates.

        Uses the TEMPLATES_DIR setting when set, otherwise the ``templates``
        directory inside the installed scrapy package.
        """
        _templates_base_dir = self.settings['TEMPLATES_DIR'] or \
            join(scrapy.__path__[0], 'templates')
        return join(_templates_base_dir, 'spiders')
- crawl
from scrapy.commands import ScrapyCommand
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run a spider"

    def add_options(self, parser):
        """Register the ``crawl``-specific options on *parser*."""
        ScrapyCommand.add_options(self, parser)
        # -a NAME=VALUE: argument passed to the spider (repeatable)
        parser.add_option(
            "-a", dest="spargs", action="append", default=[],
            metavar="NAME=VALUE",
            help="set spider argument (may be repeated)",
        )
        # -o FILE: write the scraped items to FILE (e.g. a json file)
        parser.add_option(
            "-o", "--output", metavar="FILE", action="append",
            help="dump scraped items into FILE (use - for stdout)",
        )
        # -t FORMAT: output format used together with -o
        parser.add_option(
            "-t", "--output-format", metavar="FORMAT",
            help="format to use for dumping items with -o",
        )

    def process_options(self, args, opts):
        """Post-process the parsed options.

        Converts the ``-a NAME=VALUE`` list into a dict and, when ``-o`` was
        given, stores the resulting feed configuration in the FEEDS setting
        at 'cmdline' priority.

        :raises UsageError: if a ``-a`` value is not of the form NAME=VALUE
        """
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if not opts.output:
            return
        feeds = feed_process_params_from_cli(
            self.settings, opts.output, opts.output_format)
        self.settings.set('FEEDS', feeds, priority='cmdline')

    def run(self, args, opts):
        """Run the spider named by the single positional argument.

        :param args: exactly one element, the spider name
        :param opts: parsed options; ``opts.spargs`` is passed to the spider
        :raises UsageError: if no spider name or more than one is given
        """
        if not args:  # scrapy crawl spider_name
            raise UsageError()
        if len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        process = self.crawler_process
        # Schedule the crawl. Per the original notes this goes through the
        # crawl method of the Crawler class in scrapy/crawler.py, which
        # creates the spider and the execution engine and opens the spider;
        # the value returned here is a twisted Deferred.
        crawl_defer = process.crawl(spname, **opts.spargs)

        # The deferred already carries a result and that result is an
        # Exception subclass: scheduling failed, so do not start the reactor.
        outcome = getattr(crawl_defer, 'result', None)
        if outcome is not None and issubclass(outcome.type, Exception):
            self.exitcode = 1
            return

        process.start()  # start the reactor event loop

        if process.bootstrap_failed or getattr(process, 'has_exception', False):
            self.exitcode = 1  # the crawl ended with an error
- bench
import sys
import time
import subprocess
from urllib.parse import urlencode

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor


class Command(ScrapyCommand):
    """``scrapy bench``: run a quick crawl benchmark against a local server."""

    default_settings = {
        'LOG_LEVEL': 'INFO',
        'LOGSTATS_INTERVAL': 1,
        'CLOSESPIDER_TIMEOUT': 10,
    }

    def short_desc(self):
        # Quick benchmark of the crawl speed on this machine. The speed of a
        # real spider still depends on its own logic and on how much it
        # scrapes.
        return "Run quick benchmark test"

    def run(self, args, opts):
        """Start the benchmark server and crawl it with _BenchSpider."""
        with _BenchServer():
            self.crawler_process.crawl(_BenchSpider, total=100000)
            self.crawler_process.start()


class _BenchServer:
    """Context manager that runs ``scrapy.utils.benchserver`` in a subprocess.

    ``__enter__`` does the set-up (starting the server process) and
    ``__exit__`` the clean-up (stopping it); the ``with`` statement calls
    ``__exit__`` even when the body raises.
    """

    def __enter__(self):
        from scrapy.utils.test import get_testenv
        pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
        self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
                                     env=get_testenv())
        # Block until the child has written its first line of output.
        # NOTE(review): presumably the server's "ready" signal -- confirm.
        self.proc.stdout.readline()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        # Release our end of the pipe explicitly instead of leaving it open
        # until the Popen object is garbage-collected.
        self.proc.stdout.close()
        time.sleep(0.2)


class _BenchSpider(scrapy.Spider):
    """A spider that follows all links; used as the benchmark workload."""
    name = 'follow'
    total = 10000
    show = 20
    baseurl = 'http://localhost:8998'
    link_extractor = LinkExtractor()

    def start_requests(self):
        """Return the single seed request, with total/show as query args."""
        qargs = {'total': self.total, 'show': self.show}
        url = '{}?{}'.format(self.baseurl, urlencode(qargs, doseq=True))
        return [scrapy.Request(url, dont_filter=True)]

    def parse(self, response):
        """Yield a request for every link extracted from *response*."""
        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
- settings
import json

from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings

# The settings command prints the value of a single setting, e.g. BOT_NAME
# from settings.py:
#     scrapy settings --get BOT_NAME
# Besides --get there are typed variants such as --getbool (boolean) and
# --getlist (list). When the type of the value is not known, --get can be
# used for everything.


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Get settings values"

    def add_options(self, parser):
        """Register one ``--<getter> SETTING`` option per supported getter."""
        ScrapyCommand.add_options(self, parser)
        getter_help = (
            ("get", "print raw setting value"),
            ("getbool", "print setting value, interpreted as a boolean"),
            ("getint", "print setting value, interpreted as an integer"),
            ("getfloat", "print setting value, interpreted as a float"),
            ("getlist", "print setting value, interpreted as a list"),
        )
        for getter, help_text in getter_help:
            parser.add_option("--" + getter, dest=getter, metavar="SETTING",
                              help=help_text)

    def run(self, args, opts):
        """Print the requested setting using the first getter option given.

        Options are checked in the order --get, --getbool, --getint,
        --getfloat, --getlist; nothing is printed when none was supplied.
        """
        settings = self.crawler_process.settings

        if opts.get:
            value = settings.get(opts.get)
            # Nested settings objects are printed as JSON.
            if isinstance(value, BaseSettings):
                value = json.dumps(value.copy_to_dict())
            print(value)
            return

        for getter in ("getbool", "getint", "getfloat", "getlist"):
            setting_name = getattr(opts, getter)
            if setting_name:
                print(getattr(settings, getter)(setting_name))
                return
- 未完待续