scrapy cmdline实现
#scrapy/cmdline.py
1 from __future__ import with_statement
2
3 import sys
4 import os
5 import optparse
6 import cProfile
7 import inspect
8
9 import scrapy
10 from scrapy.crawler import CrawlerProcess
11 from scrapy.xlib import lsprofcalltree
12 from scrapy.conf import settings
13 from scrapy.command import ScrapyCommand
14 from scrapy.exceptions import UsageError, ScrapyDeprecationWarning
15 from scrapy.utils.misc import walk_modules
16 from scrapy.utils.project import inside_project
17
def _iter_command_classes(module_name):
    """Yield every ScrapyCommand subclass defined directly inside any module
    found by recursively walking *module_name*."""
    # TODO: add a `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for member in vars(module).itervalues():
            if not inspect.isclass(member):
                continue
            # Require obj.__module__ == module.__name__ so classes merely
            # imported into a module (e.g. the ScrapyCommand base itself)
            # are not yielded again for every importer.
            if issubclass(member, ScrapyCommand) and \
                    member.__module__ == module.__name__:
                yield member
27
def _get_commands_from_module(module, inproject):
    """Return a {command_name: command_instance} dict for every command class
    found under *module*; commands that require a project are skipped when
    running outside one."""
    commands = {}
    for cls in _iter_command_classes(module):
        if not inproject and cls.requires_project:
            continue
        # The command name is the last segment of its defining module's
        # dotted path, e.g. scrapy.commands.crawl -> "crawl".
        commands[cls.__module__.split('.')[-1]] = cls()
    return commands
35
def _get_commands_dict(inproject):
    """Collect the built-in commands plus any declared via the
    COMMANDS_MODULE setting."""
    commands = _get_commands_from_module('scrapy.commands', inproject)
    user_module = settings['COMMANDS_MODULE']
    if user_module:
        # User-supplied commands may shadow built-ins of the same name.
        commands.update(_get_commands_from_module(user_module, inproject))
    return commands
42
43 def _pop_command_name(argv):
44 i = 0
45 for arg in argv[1:]:
46 if not arg.startswith('-'):
47 del argv[i]
48 return arg
49 i += 1
50
51 def _print_header(inproject):
52 if inproject:
53 print "Scrapy %s - project: %s\n" % (scrapy.__version__, \
54 settings['BOT_NAME'])
55 else:
56 print "Scrapy %s - no active project\n" % scrapy.__version__
57
58 def _print_commands(inproject):
59 _print_header(inproject)
60 print "Usage:"
61 print " scrapy <command> [options] [args]\n"
62 print "Available commands:"
63 cmds = _get_commands_dict(inproject)
64 for cmdname, cmdclass in sorted(cmds.iteritems()):
65 print " %-13s %s" % (cmdname, cmdclass.short_desc())
66 print
67 print 'Use "scrapy <command> -h" to see more info about a command'
68
69 def _print_unknown_command(cmdname, inproject):
70 _print_header(inproject)
71 print "Unknown command: %s\n" % cmdname
72 print 'Use "scrapy" to see available commands'
73 if not inproject:
74 print
75 print "More commands are available in project mode"
76
def _check_deprecated_scrapy_ctl(argv, inproject):
    """Check if Scrapy was called using the deprecated scrapy-ctl command and
    warn in that case, also creating a scrapy.cfg if it doesn't exist.
    """
    used_scrapy_ctl = any('scrapy-ctl' in token for token in argv)
    if not used_scrapy_ctl:
        return
    import warnings
    warnings.warn("`scrapy-ctl.py` command-line tool is deprecated and will be removed in Scrapy 0.11, use `scrapy` instead",
        ScrapyDeprecationWarning, stacklevel=3)
    if not inproject:
        return
    # scrapy.cfg belongs one directory above the settings package.
    settings_dir = os.path.dirname(settings.settings_module.__file__)
    projpath = os.path.abspath(os.path.dirname(settings_dir))
    cfg_path = os.path.join(projpath, 'scrapy.cfg')
    if os.path.exists(cfg_path):
        return
    with open(cfg_path, 'w') as f:
        for line in ("# generated automatically - feel free to edit",
                     "[settings]",
                     "default = %s" % settings.settings_module.__name__):
            f.write(line + os.linesep)
94
95 def _run_print_help(parser, func, *a, **kw):
96 try:
97 func(*a, **kw)
98 except UsageError, e:
99 if str(e):
100 parser.error(str(e))
101 if e.print_help:
102 parser.print_help()
103 sys.exit(2)
104
def execute(argv=None):
    """Command-line entry point: resolve the command named on *argv*,
    parse its options and run it.

    Defaults to sys.argv.  Exits the process: 0 after printing the
    command listing, 2 for an unknown command or usage error, otherwise
    the command's own exitcode.

    Fix: removed a stray debug `print cmd` statement and the
    commented-out `#print cmds` left over from debugging.
    """
    if argv is None:
        argv = sys.argv

    # Install the crawler before command lookup so commands can rely on it.
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    _check_deprecated_scrapy_ctl(argv, inproject) # TODO: remove for Scrapy 0.11
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    # Per-command default settings are merged before options are parsed.
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
136
137 def _run_command(cmd, args, opts):
138 if opts.profile or opts.lsprof:
139 _run_command_profiled(cmd, args, opts)
140 else:
141 cmd.run(args, opts)
142
143 def _run_command_profiled(cmd, args, opts):
144 if opts.profile:
145 sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
146 if opts.lsprof:
147 sys.stderr.write("scrapy: writing lsprof stats to %r\n" % opts.lsprof)
148 loc = locals()
149 p = cProfile.Profile()
150 p.runctx('cmd.run(args, opts)', globals(), loc)
151 if opts.profile:
152 p.dump_stats(opts.profile)
153 k = lsprofcalltree.KCacheGrind(p)
154 if opts.lsprof:
155 with open(opts.lsprof, 'w') as f:
156 k.output(f)
157
# Allow invoking this module directly as a script.
if __name__ == '__main__':
    execute()
scrapy cmdline的实现方法比较直接,基本步骤为:
1. 递归遍历commands文件夹下所有模块和包的子模块,找出每个模块里面属于ScrapyCommand的子类,然后返回一个 cmdname:cmdobj的dict(_get_commands_dict)
2. 根据命令行传入的名字,查找dict,提取出cmdobj然后执行。
scrapy的这种实现意味着每次启动命令行时,都要递归导入并扫描 scrapy.commands 下的全部模块(如果配置了 COMMANDS_MODULE,还要再扫描一遍用户模块),所以命令的启动速度会比较慢。