andrewwf 2020-04-20
Scrapy 的命令分为全局命令和项目命令。顾名思义,全局命令在任何目录下都能使用(不依赖已存在的项目),而项目命令只能在已存在的 Scrapy 项目内使用。
以上为scrapy命令行工具的基本介绍,不明白的请对具体命令自行百度。本文的重点是介绍命令的源码。如有错误之处还请指正,scrapy源码解读系列文章会一直更新。
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


# Path components (relative to the new project directory) of the template
# files rendered by ``Command.run``. ``${project_name}`` is substituted with
# the actual project name through ``string.Template``.
TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

# Names skipped by ``Command._copytree`` while copying the template tree.
IGNORE = ignore_patterns('*.pyc', '.svn')


class Command(ScrapyCommand):
    """``startproject`` command: create a new project from the templates."""

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        """Return the usage string shown for this command."""
        return "<project_name> [project_dir]"

    def short_desc(self):
        """Return the one-line description of this command."""
        return "Create new project"

    def _is_valid_name(self, project_name):
        """Validate the project name.

        The name must match ``^[_a-zA-Z]\\w*$`` and must not be the name of
        a module that can already be imported. An error message is printed
        when the check fails.

        :param project_name: name requested for the new project
        :return: ``True`` when the name is usable, ``False`` otherwise
        """
        def _module_exists(module_name):
            # A module that imports successfully would clash with the
            # package about to be created.
            try:
                import_module(module_name)
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):
            # The project name is already taken by an importable module.
            print('Error: Module %r already exists' % project_name)
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """Copy every entry of the ``src`` directory into ``dst``.

        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.

        More info at:
        https://github.com/scrapy/scrapy/pull/2005

        :param src: directory to copy from
        :param dst: directory to copy into (created when missing)
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                # Recurse into sub-directories.
                self._copytree(srcname, dstname)
            else:
                # Copy the file contents from srcname to dstname.
                copy2(srcname, dstname)
        # Copy the directory's own metadata once its contents are in place.
        copystat(src, dst)

    def run(self, args, opts):
        """Create the project.

        Steps: 1) validate the arguments, 2) refuse to overwrite an existing
        project and validate the project name, 3) copy the template tree
        into the project directory, 4) render the template files listed in
        ``TEMPLATES_TO_RENDER``, 5) print a success message.

        :param args: ``[project_name]`` or ``[project_name, project_dir]``
        :param opts: parsed command-line options (not read here)
        :raises UsageError: when the number of arguments is not 1 or 2
        """
        # args holds the positional arguments: project_name and the
        # optional project_dir.
        if len(args) not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s'
                  % abspath(project_dir))
            return

        if not self._is_valid_name(project_name):
            # Invalid project name: stop and report failure via exit code 1.
            self.exitcode = 1
            return

        # Copy the template tree, then rename the placeholder 'module'
        # directory to the project name.
        self._copytree(self.templates_dir, abspath(project_dir))
        move(join(project_dir, 'module'), join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            # string.Template replaces ${project_name} in the relative path.
            tplfile = join(
                project_dir,
                string.Template(path).substitute(project_name=project_name))
            # Fill project_name / ProjectName into the template file.
            # NOTE(review): presumably render_templatefile also relies on
            # string.Template -- confirm in scrapy.utils.template.
            render_templatefile(tplfile, project_name=project_name,
                                ProjectName=string_camelcase(project_name))
        print("New Scrapy project '%s', using template directory '%s', "
              "created in:" % (project_name, self.templates_dir))
        print(" %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print(" cd %s" % project_dir)
        print(" scrapy genspider example example.com")

    @property
    def templates_dir(self):
        """Return the directory that holds the project templates.

        Uses the ``TEMPLATES_DIR`` setting when it is truthy, otherwise the
        ``templates`` directory inside the installed ``scrapy`` package.
        """
        _templates_base_dir = (self.settings['TEMPLATES_DIR']
                               or join(scrapy.__path__[0], 'templates'))
        return join(_templates_base_dir, 'project')
import os
import shutil
import string
from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):
    """``genspider`` command: generate a spider file from a template."""

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        """Return the usage string shown for this command."""
        return "[options] <name> <domain>"

    def short_desc(self):
        """Return the one-line description of this command."""
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        """Register the optional arguments accepted by ``genspider``.

        :param parser: option parser the options are added to
        """
        ScrapyCommand.add_options(self, parser)
        # -l: list the templates that can be used to generate a spider.
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="List available templates")
        # -e: open the spider for editing once it has been created.
        parser.add_option("-e", "--edit", dest="edit", action="store_true",
                          help="Edit spider after creating it")
        # -d: print the given template to standard output.
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                          help="Dump template to standard output")
        # -t: template to use; defaults to "basic" (file basic.tmpl).
        parser.add_option("-t", "--template", dest="template",
                          default="basic",
                          help="Uses a custom template.")
        # --force: overwrite a spider that already exists.
        parser.add_option("--force", dest="force", action="store_true",
                          help="If the spider already exists, overwrite it "
                               "with the template")

    def run(self, args, opts):
        """Entry point of the command.

        :param args: positional arguments, expected ``[name, domain]``
        :param opts: parsed options (``list``, ``dump``, ``template``,
            ``force``, ``edit``)
        :raises UsageError: when neither ``-l`` nor ``-d`` is given and the
            number of positional arguments is not 2
        """
        if opts.list:
            # -l given: list the templates and stop.
            self._list_templates()
            return
        if opts.dump:
            # -d given: print the template file (if found) and stop.
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:
            # Expected form: scrapy genspider spider_name domain_name
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:
            # The spider module must not share its name with the project.
            print("Cannot create a spider with the same name as your project")
            return

        try:
            # A successful load means a spider with this name is already
            # registered; a KeyError is treated as "name is free".
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            pass
        else:
            # if spider already exists and not --force then halt
            if not opts.force:
                print("Spider %r already exists in module:" % name)
                print(" %s" % spidercls.__module__)
                return
        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template,
                            template_file)
            if opts.edit:
                # NOTE(review): `name` is interpolated into a shell command
                # string; consider a subprocess call with an argument list.
                self.exitcode = os.system('scrapy edit "%s"' % name)

    def _genspider(self, module, name, domain, template_name, template_file):
        """Generate the spider module, based on the given template.

        The template is copied to ``<spiders_dir>/<module>.py`` and then
        rendered with the variables collected in ``tvars``.

        :param module: sanitized module name of the new spider
        :param name: spider name as typed by the user
        :param domain: domain passed on the command line
        :param template_name: template name, used only in the message
        :param template_file: path of the template file to copy
        """
        tvars = {
            'project_name': self.settings.get('BOT_NAME'),
            'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
            'module': module,
            'name': name,
            'domain': domain,
            'classname': '%sSpider' % ''.join(
                s.capitalize() for s in module.split('_')),
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            # No NEWSPIDER_MODULE configured: write to the current directory.
            spiders_module = None
            spiders_dir = "."
        spider_file = "%s.py" % join(spiders_dir, module)
        # Copy the template to the target file, then substitute tvars in it.
        # NOTE(review): presumably render_templatefile relies on
        # string.Template -- confirm in scrapy.utils.template.
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print("Created spider %r using template %r "
              % (name, template_name),
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print("in module:\n %s.%s" % (spiders_module.__name__, module))

    def _find_template(self, template):
        """Return the path of the ``<template>.tmpl`` spider template.

        When the file does not exist, a hint is printed and ``None`` is
        returned implicitly.
        """
        template_file = join(self.templates_dir, '%s.tmpl' % template)
        if exists(template_file):
            return template_file
        print("Unable to find template: %s\n" % template)
        print('Use "scrapy genspider --list" to see all available templates.')

    def _list_templates(self):
        """Print the names of the ``.tmpl`` files in ``templates_dir``."""
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print(" %s" % splitext(filename)[0])

    @property
    def templates_dir(self):
        """Return the directory that holds the spider templates.

        Uses the ``TEMPLATES_DIR`` setting when it is truthy, otherwise the
        ``templates`` directory inside the installed ``scrapy`` package.
        """
        _templates_base_dir = (self.settings['TEMPLATES_DIR']
                               or join(scrapy.__path__[0], 'templates'))
        return join(_templates_base_dir, 'spiders')
from scrapy.commands import ScrapyCommand
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    """``crawl`` command: run a single spider of the current project."""

    requires_project = True

    def syntax(self):
        """Return the usage string shown for this command."""
        return "[options] <spider>"

    def short_desc(self):
        """Return the one-line description of this command."""
        return "Run a spider"

    def add_options(self, parser):
        """Register the optional arguments accepted by ``crawl``.

        :param parser: option parser the options are added to
        """
        ScrapyCommand.add_options(self, parser)
        # -a: arguments handed to the spider, may be repeated.
        parser.add_option("-a", dest="spargs", action="append", default=[],
                          metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        # -o: write the scraped items to the given file (e.g. a JSON file).
        parser.add_option("-o", "--output", metavar="FILE", action="append",
                          help="dump scraped items into FILE "
                               "(use - for stdout)")
        # -t: format used for the items written with -o.
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        """Post-process the parsed options.

        Converts the ``-a NAME=VALUE`` list into a dict and, when ``-o`` was
        given, stores the resulting feeds in the ``FEEDS`` setting with
        ``cmdline`` priority.

        :raises UsageError: when a ``-a`` value cannot be converted
        """
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE",
                             print_help=False)
        if opts.output:
            feeds = feed_process_params_from_cli(
                self.settings, opts.output, opts.output_format)
            self.settings.set('FEEDS', feeds, priority='cmdline')

    def run(self, args, opts):
        """Start the crawl for the spider named in ``args[0]``.

        Sets ``self.exitcode`` to 1 when the crawl could not be started or
        when the crawler process reports a failure afterwards.

        :param args: positional arguments; exactly one spider name
        :param opts: parsed options; ``opts.spargs`` is passed to the spider
        :raises UsageError: when zero or more than one spider name is given
        """
        if len(args) < 1:
            # Exactly one argument is expected: scrapy crawl spider_name
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one "
                             "spider is no longer supported")
        spname = args[0]

        # NOTE(review): according to the article author, this ends up in
        # Crawler.crawl (scrapy/crawler.py), which instantiates the spider
        # and an ExecutionEngine and then calls the engine's open_spider()
        # to begin issuing requests; the returned object is presumably a
        # twisted Deferred carrying callback/errback chains -- confirm
        # against scrapy/crawler.py.
        crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

        # A result that is already set (not None) and whose ``type`` is an
        # Exception subclass means the crawl failed before it could start.
        if (getattr(crawl_defer, 'result', None) is not None
                and issubclass(crawl_defer.result.type, Exception)):
            self.exitcode = 1
        else:
            # NOTE(review): presumably starts the Twisted reactor event loop
            # and blocks until crawling has finished -- confirm.
            self.crawler_process.start()

            # Report failure through the exit code when bootstrapping failed
            # or the process recorded an exception.
            if (self.crawler_process.bootstrap_failed
                    or (hasattr(self.crawler_process, 'has_exception')
                        and self.crawler_process.has_exception)):
                self.exitcode = 1