Scrapy Source Code Reading (2): The Command Line Tool

andrewwf 2020-04-20

1. Introduction

Scrapy commands fall into two groups: global commands and project commands. As the names suggest, global commands can be run anywhere, while project commands only work inside an existing Scrapy project.
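
Scrapy decides which group to expose by checking whether it is running inside a project. Below is a minimal sketch of that check, using inside_project() from scrapy.utils.project (the same helper scrapy/cmdline.py relies on):

# inside_project() returns True when a scrapy.cfg can be found by
# walking up from the current directory (or when the
# SCRAPY_SETTINGS_MODULE environment variable is set).
from scrapy.utils.project import inside_project

if inside_project():
    print("inside a project: global and project commands are available")
else:
    print("outside a project: only global commands are available")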

2. Global Commands

  • startproject (create a crawler project; a project holds one or more spiders -- scrapy startproject project_name [project_dir])
  • genspider (enter the project directory and create a spider -- cd project_name/project_dir && scrapy genspider spider_name domain_name)
  • settings (read a project setting -- scrapy settings --get BOT_NAME)
  • runspider (run a self-contained spider file without creating a project -- scrapy runspider myspider.py)
  • shell (the interactive Scrapy console, for debugging scraping code without starting a spider -- scrapy shell http://www.baidu.com)
  • fetch (send a GET request to the given URL using the Scrapy downloader and print the response body to the console -- scrapy fetch http://www.baidu.com)
  • view (fetch the given URL and open it in a browser, showing the page as Scrapy sees it; rarely needed -- scrapy view http://www.baidu.com)
  • version (print the Scrapy version -- scrapy version)
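
All of these commands share a single entry point, scrapy.cmdline.execute(), which parses argv, picks the matching command class, and calls its run() method. As a quick illustration, a global command can be invoked from Python directly; note that execute() exits the process via sys.exit() when the command finishes:

# Invoking a global command programmatically. execute() is the same
# function behind the "scrapy" console script; it parses argv,
# instantiates the matching ScrapyCommand subclass and runs it.
from scrapy.cmdline import execute

execute(argv=['scrapy', 'version'])  # prints the version, then exits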

3. Project Commands (Local Commands)

  • crawl (start a spider -- scrapy crawl spider_name)
  • check (run contract checks against the project's spiders; rarely needed -- scrapy check)
  • list (list all spiders in the current project -- scrapy list; a rough Python equivalent is sketched after this list)
  • edit (open a spider module in the configured editor, rarely used -- scrapy edit spider_name)
  • parse (fetch the given URL and parse it with the spider that handles it; rarely needed -- scrapy parse url)
  • bench (run a quick local benchmark; rarely needed -- scrapy bench)
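
As an example of what these commands wrap, scrapy list essentially asks the project's SpiderLoader for all registered spider names. A rough Python equivalent, run from inside a project (get_project_settings and SpiderLoader are public Scrapy APIs):

# Rough equivalent of "scrapy list": load the project settings,
# build the spider loader from them, and print the sorted names,
# which is essentially what the list command's run() method does.
from scrapy.utils.project import get_project_settings
from scrapy.spiderloader import SpiderLoader

settings = get_project_settings()
spider_loader = SpiderLoader.from_settings(settings)
for name in sorted(spider_loader.list()):
    print(name)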

4. Command Source Code

The above is a basic overview of the Scrapy command line tool; for anything unclear, look up the specific command yourself. The focus of this article is the source code behind the commands. Corrections are welcome; this Scrapy source-reading series will keep being updated.

  • startproject
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

IGNORE = ignore_patterns('*.pyc', '.svn')


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "<project_name> [project_dir]"

    def short_desc(self):
        return "Create new project"

    def _is_valid_name(self, project_name):
        """
        校验项目名称,避免出现特殊符号
        :param project_name: 项目名称
        :return:
        """
        def _module_exists(module_name):
            try:
                import_module(module_name)
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):  # check whether a module with this name already exists
            print('Error: Module %r already exists' % project_name)
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """
        Recursively copy every file under src into dst.
        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.
        More info at:
        https://github.com/scrapy/scrapy/pull/2005
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                self._copytree(srcname, dstname)  # recurse into subdirectories
            else:
                copy2(srcname, dstname)  # copy the file srcname to dstname
        copystat(src, dst)  # copy the directory's metadata (permissions, timestamps)

    def run(self, args, opts):
        """
        The command's run method: 1) validate the arguments, 2) make sure the
        project does not already exist, 3) copy the template files into the
        project directory, 4) render those templates to create the project's
        initial files, 5) print the success message.
        :param args:
        :param opts:
        :return:
        """
        if len(args) not in (1, 2):  # args holds the positional arguments: project_name, [project_dir (optional)]
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1  # invalid project name: exit with error code 1
            return

        self._copytree(self.templates_dir, abspath(project_dir))  # copy the template tree into the project dir
        move(join(project_dir, 'module'), join(project_dir, project_name))  # rename the generic 'module' dir to the project name
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(project_dir,
                string.Template(path).substitute(project_name=project_name))  # string.Template substitutes ${project_name} in the path, much like the % operator or str.format
            render_templatefile(tplfile, project_name=project_name,
                ProjectName=string_camelcase(project_name))  # again via string.Template: substitutes project_name and ProjectName inside the template file
        print("New Scrapy project '%s', using template directory '%s', "
              "created in:" % (project_name, self.templates_dir))
        print("    %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print("    cd %s" % project_dir)
        print("    scrapy genspider example example.com")

    @property
    def templates_dir(self):
        _templates_base_dir = self.settings['TEMPLATES_DIR'] or \
            join(scrapy.__path__[0], 'templates')
        return join(_templates_base_dir, 'project')
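
The core of the rendering step above is just string.Template from the standard library. A small standalone demo of the ${project_name} substitution that startproject performs on template paths and file contents (the values here are made up for illustration):

# string.Template replaces ${name} placeholders, much like the %
# operator or str.format, but with a syntax that is safe to use in
# files that themselves contain % and {} characters.
import string

path_tpl = string.Template('${project_name}/settings.py.tmpl')
print(path_tpl.substitute(project_name='myproject'))
# -> myproject/settings.py.tmpl

content_tpl = string.Template("BOT_NAME = '${project_name}'")
print(content_tpl.substitute(project_name='myproject'))
# -> BOT_NAME = 'myproject'
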
  • genspider
import os
import shutil
import string

from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <name> <domain>"

    def short_desc(self):
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        """
        在genspider命令后添加可选参数
        :param parser: 
        :return: 
        """
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",  # 列出可选的用来生成初始文件的模板
            help="List available templates")
        parser.add_option("-e", "--edit", dest="edit", action="store_true",  # 创建spider.py文件后编辑它
            help="Edit spider after creating it")
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",  # 将模板文件输出到控制台
            help="Dump template to standard output")
        parser.add_option("-t", "--template", dest="template", default="basic",  # 使用指定模板,默认使用basic.tmpl模板文件
            help="Uses a custom template.")
        parser.add_option("--force", dest="force", action="store_true",  # 强制创建spider.py文件
            help="If the spider already exists, overwrite it with the template")

    def run(self, args, opts):
        """
        命令的主方法
        :param args: 
        :param opts: 
        :return: 
        """
        if opts.list:  # 如果有可选参数-l,则列出模板文件并退出运行
            self._list_templates()
            return
        if opts.dump:  # 有可选参数-d,则读取模板文件并打印
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:  # 如果参数不是2个,则报错(scrapy genspider spider_name domain_name)
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:  # a spider cannot share its name with the project
            print("Cannot create a spider with the same name as your project")
            return

        try:
            spidercls = self.crawler_process.spider_loader.load(name)  # try to load a spider with this name; if it loads, the spider already exists and a different name is needed
        except KeyError:
            pass
        else:
            # if spider already exists and not --force then halt
            if not opts.force:
                print("Spider %r already exists in module:" % name)
                print("  %s" % spidercls.__module__)
                return
        template_file = self._find_template(opts.template)  # locate the template file
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)  # create the spider's initial file
            if opts.edit:
                self.exitcode = os.system('scrapy edit "%s"' % name)

    def _genspider(self, module, name, domain, template_name, template_file):
        """利用模板文件和string.Template方法创建spider初始文件"""
        """Generate the spider module, based on the given template"""
        tvars = {
            ‘project_name‘: self.settings.get(‘BOT_NAME‘),
            ‘ProjectName‘: string_camelcase(self.settings.get(‘BOT_NAME‘)),
            ‘module‘: module,
            ‘name‘: name,
            ‘domain‘: domain,
            ‘classname‘: ‘%sSpider‘ % ‘‘.join(s.capitalize() 106                 for s in module.split(‘_‘))
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            spiders_module = None
            spiders_dir = "."
        spider_file = "%s.py" % join(spiders_dir, module)
        shutil.copyfile(template_file, spider_file)  # copy the template contents into spider_file
        render_templatefile(spider_file, **tvars)  # substitute the tvars placeholders via string.Template (similar to str.format)
        print("Created spider %r using template %r " % (name,
              template_name), end=('' if spiders_module else '\n'))
        if spiders_module:
            print("in module:\n  %s.%s" % (spiders_module.__name__, module))

    def _find_template(self, template):
        """获取spider模板的文件路径"""
        template_file = join(self.templates_dir, ‘%s.tmpl‘ % template)
        if exists(template_file):
            return template_file
        print("Unable to find template: %s\n" % template)
        print(‘Use "scrapy genspider --list" to see all available templates.‘)

    def _list_templates(self):
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print("  %s" % splitext(filename)[0])

    @property
    def templates_dir(self):
        """获取模板文件的目录路径"""
        _templates_base_dir = self.settings[‘TEMPLATES_DIR‘] or 140             join(scrapy.__path__[0], ‘templates‘)
        return join(_templates_base_dir, ‘spiders‘)
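
A quick demo of the two helpers involved above: sanitize_module_name turns an arbitrary spider name into a legal Python module name, and string_camelcase (from scrapy.utils.template) builds the class-name part. The input values are made up:

# Demo of the genspider helpers (example inputs are made up).
from scrapy.commands.genspider import sanitize_module_name
from scrapy.utils.template import string_camelcase

# Dashes and dots become underscores; a leading non-letter gets
# an "a" prefix so the result is importable.
print(sanitize_module_name('my-spider.v2'))  # -> my_spider_v2
print(sanitize_module_name('1spider'))       # -> a1spider

# string_camelcase is what fills in ${ProjectName} in templates:
print(string_camelcase('my_project'))        # -> MyProject
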
  • crawl
from scrapy.commands import ScrapyCommand
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run a spider"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",  # -a 爬虫运行时的参数
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE", action="append",  # -o 爬虫结果输出到指定文件中(如json文件)
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",  # -t 爬虫抓取结果输出的格式
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output:
            feeds = feed_process_params_from_cli(self.settings, opts.output, opts.output_format)
            self.settings.set('FEEDS', feeds, priority='cmdline')

    def run(self, args, opts):
        if len(args) < 1:  # crawl takes exactly one argument: the name of the spider to run (scrapy crawl spider_name)
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        # This calls the crawl method of the Crawler class in scrapy/crawler.py, which:
        # 1) instantiates the Spider class named by spname and creates an ExecutionEngine instance;
        # 2) calls ExecutionEngine.open_spider to start the spider and begin scheduling
        #    requests for data collection (open_spider() is well worth a careful read!).
        # The returned Deferred carries both a success and a failure chain: if the spider's
        # asynchronous network requests succeed, the callbacks fire; otherwise the errbacks
        # do (this is a twisted Deferred object).
        crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

        if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):  # the Deferred already holds a result and it is an Exception (a Failure)
            self.exitcode = 1  # exit code 1 signals the error
        else:
            self.crawler_process.start()  # start the reactor event loop (it dispatches I/O events to handlers as they arrive, and otherwise just waits, consuming almost no CPU)

            if self.crawler_process.bootstrap_failed or \
                    (hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception):
                self.exitcode = 1  # exit with an error code on exception
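
The crawl command is thus a thin wrapper around CrawlerProcess, which is also the documented way to run spiders from a script. A minimal sketch of what scrapy crawl effectively does (the spider here is made up for illustration):

# Minimal sketch of what "scrapy crawl" does under the hood, using
# the public CrawlerProcess API. The spider is made up.
import scrapy
from scrapy.crawler import CrawlerProcess

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://www.baidu.com']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

process = CrawlerProcess(settings={'LOG_ENABLED': False})
process.crawl(ExampleSpider)  # returns a twisted Deferred, like crawl_defer above
process.start()               # starts the reactor and blocks until the crawl finishes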
  • To be continued
