Analysis of the Cinder Volume Service Startup Flow and Its Periodic Tasks

1. Program entry point of the cinder-volume service

#!/usr/bin/python2
# PBR Generated from u'console_scripts'
import sys
from cinder.cmd.volume import main
if __name__ == "__main__":
    sys.exit(main())
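
This wrapper is generated by PBR from the console_scripts entry point declared in cinder's setup.cfg, roughly like the following excerpt:

[entry_points]
console_scripts =
    cinder-volume = cinder.cmd.volume:main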

2. The main() implementation in cinder/cmd/volume.py

def main():
    objects.register_all()   # Import all modules under cinder/objects
    gmr_opts.set_defaults(CONF)  # oslo_reports: used to generate error reports, e.g. for memory leaks
    CONF(sys.argv[1:], project='cinder',
         version=version.version_string())
    logging.setup(CONF, "cinder")
    python_logging.captureWarnings(True)
    priv_context.init(root_helper=shlex.split(utils.get_root_helper()))  # oslo_privsep: configure execution privileges for the service
    utils.monkey_patch()   # Monkey-patch libraries such as socket and thread; imports are unchanged, but the functions they resolve to are replaced
    gmr.TextGuruMeditation.setup_autorun(version, conf=CONF)
    launcher = service.get_launcher()  # Step 1: obtain a launcher
    LOG = logging.getLogger(__name__)
    service_started = False

    if CONF.enabled_backends:  # The storage backends enabled in cinder.conf
        for backend in filter(None, CONF.enabled_backends):  # Iterate over the enabled_backends values from the config file
            CONF.register_opt(host_opt, group=backend)
            backend_host = getattr(CONF, backend).backend_host
            # Build the cinder-volume host name; this is the name shown by
            # `cinder service-list`
            host = "%s@%s" % (backend_host or CONF.host, backend)
            # We also want to set cluster to None on empty strings, and we
            # ignore leading and trailing spaces.
            cluster = CONF.cluster and CONF.cluster.strip()
            cluster = (cluster or None) and '%s@%s' % (cluster, backend)
            try:
                # Step s1: create one service object for each backend
                server = service.Service.create(host=host,
                                                service_name=backend,
                                                binary='cinder-volume',
                                                coordination=True,
                                                cluster=cluster)
            except Exception:
                msg = _('Volume service %s failed to start.') % host
                LOG.exception(msg)
            else:  # Taken when no exception was raised
                # Dispose of the whole DB connection pool here before
                # starting another process.  Otherwise we run into cases where
                # child processes share DB connections which results in errors.
                session.dispose_engine()
                launcher.launch_service(server)  # Step 2: call launch_service on the launcher
                service_started = True
    else:
        LOG.error(_LE('Configuration for cinder-volume does not specify '
                      '"enabled_backends". Using DEFAULT section to configure '
                      'drivers is not supported since Ocata.'))

    if not service_started:
        msg = _('No volume service(s) started successfully, terminating.')
        LOG.error(msg)
        sys.exit(1)

    launcher.wait()
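
For illustration, with a cinder.conf like the following (backend names and driver values here are only examples), the loop above creates two services whose hosts show up in cinder service-list as node1@lvm-1 and node1@ceph-1:

[DEFAULT]
host = node1
enabled_backends = lvm-1,ceph-1

[lvm-1]
volume_driver = cinder.volume.drivers.lvm.LVMVolumeDriver
volume_backend_name = lvm-1

[ceph-1]
volume_driver = cinder.volume.drivers.rbd.RBDDriver
volume_backend_name = ceph-1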

3. A closer look at service.Service.create()

The class method in cinder/service.py: class Service(service.Service)
    @classmethod
    def create(cls, host=None, binary=None, topic=None, manager=None,
               report_interval=None, periodic_interval=None,
               periodic_fuzzy_delay=None, service_name=None,
               coordination=False, cluster=None):
        """Instantiates class and passes back application object.

        :param host: defaults to CONF.host
        :param binary: defaults to basename of executable
        :param topic: defaults to bin_name - 'cinder-' part
        :param manager: defaults to CONF.<topic>_manager
        :param report_interval: defaults to CONF.report_interval
        :param periodic_interval: defaults to CONF.periodic_interval
        :param periodic_fuzzy_delay: defaults to CONF.periodic_fuzzy_delay
        :param cluster: Defaults to None, as only some services will have it

        """
        if not host:
            host = CONF.host
        if not binary:
            binary = os.path.basename(inspect.stack()[-1][1])
        if not topic:
            topic = binary
        if not manager:
            subtopic = topic.rpartition('cinder-')[2]
            manager = CONF.get('%s_manager' % subtopic, None)
        if report_interval is None:
            report_interval = CONF.report_interval  # Read the reporting interval from the config file
        if periodic_interval is None:
            periodic_interval = CONF.periodic_interval  # Read the polling interval from the config file
        if periodic_fuzzy_delay is None:
            periodic_fuzzy_delay = CONF.periodic_fuzzy_delay
        service_obj = cls(host, binary, topic, manager,  # Instantiate a service object
                          report_interval=report_interval,
                          periodic_interval=periodic_interval,
                          periodic_fuzzy_delay=periodic_fuzzy_delay,
                          service_name=service_name,
                          coordination=coordination,
                          cluster=cluster)

        return service_obj
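
To make the defaulting concrete: for cinder-volume, binary and topic both resolve to 'cinder-volume', so the manager is looked up under the volume_manager option, which defaults to cinder.volume.manager.VolumeManager:

>>> topic = 'cinder-volume'
>>> topic.rpartition('cinder-')[2]
'volume'
>>> '%s_manager' % topic.rpartition('cinder-')[2]
'volume_manager'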

4. A closer look at launcher = service.get_launcher()

cinder/cmd/volume.py
from cinder import service
launcher = service.get_launcher()

cinder/service.py
from oslo_service import service
def get_launcher():
    # Note(lpetrut): ProcessLauncher uses green pipes which fail on Windows
    # due to missing support of non-blocking I/O pipes. For this reason, the
    # service must be spawned differently on Windows, using the ServiceLauncher
    # class instead.
    if os.name == 'nt':
        return Launcher()  # Windows
    else:
        return process_launcher()  # Linux and other POSIX systems

def process_launcher():
    return service.ProcessLauncher(CONF)	

oslo_service/service.py
class ProcessLauncher(object):
    """Launch a service with a given number of workers."""

    def __init__(self, conf, wait_interval=0.01, restart_method='reload'):
        """Constructor.
        :param conf: an instance of ConfigOpts
        :param wait_interval: The interval to sleep for between checks
                              of child process exit.
        :param restart_method: If 'reload', calls reload_config_files on
            SIGHUP. If 'mutate', calls mutate_config_files on SIGHUP. Other
            values produce a ValueError.
        """
        self.conf = conf
        conf.register_opts(_options.service_opts)
        self.children = {}
        self.sigcaught = None
        self.running = True
        self.wait_interval = wait_interval
        self.launcher = None
        rfd, self.writepipe = os.pipe()
        self.readpipe = eventlet.greenio.GreenPipe(rfd, 'r')
        self.signal_handler = SignalHandler()
        self.handle_signal()
        self.restart_method = restart_method
        if restart_method not in _LAUNCHER_RESTART_METHODS:
            raise ValueError(_("Invalid restart_method: %s") % restart_method)	
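
The rfd/self.writepipe pair created in the constructor implements parent-death detection: the parent keeps the write end open and never writes to it, so a child blocking on the read end sees EOF the moment the parent exits. A minimal standalone sketch of the same trick (this is not oslo's actual code):

import os
import sys
import time

rfd, wfd = os.pipe()
if os.fork() == 0:
    # Child: drop the write end, then block on read(); it returns ''
    # (EOF) only once every writer, i.e. the parent, has exited.
    os.close(wfd)
    with os.fdopen(rfd) as readpipe:
        readpipe.read()
    print('parent is gone, child shutting down')
    sys.exit(0)
# Parent: drop the unused read end, pretend to work, then exit,
# which closes wfd and unblocks the child.
os.close(rfd)
time.sleep(2)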

    def launch_service(self, service, workers=1):  # Defaults to one worker, i.e. one process
        """Launch a service with a given number of workers.

        :param service: a service to launch, must be an instance of
                        :class:`oslo_service.service.ServiceBase`
        :param workers: a number of processes in which a service
                        will be running
        """
        _check_service_base(service)
        wrap = ServiceWrapper(service, workers)

        # Hide existing objects from the garbage collector, so that most
        # existing pages will remain in shared memory rather than being
        # duplicated between subprocesses in the GC mark-and-sweep. (Requires
        # Python 3.7 or later.)
        if hasattr(gc, 'freeze'):
            gc.freeze()

        LOG.info('Starting %d workers', wrap.workers)
        while self.running and len(wrap.children) < wrap.workers:
            self._start_child(wrap)

    def _start_child(self, wrap):
        if len(wrap.forktimes) > wrap.workers:
            # Limit ourselves to one process a second (over the period of
            # number of workers * 1 second). This will allow workers to
            # start up quickly but ensure we don't fork off children that
            # die instantly too quickly.
            if time.time() - wrap.forktimes[0] < wrap.workers:
                LOG.info('Forking too fast, sleeping')
                time.sleep(1)

            wrap.forktimes.pop(0)

        wrap.forktimes.append(time.time())

        pid = os.fork()
        # After fork() the child receives a copy of the parent's state and
        # both processes continue from here: fork() returns 0 in the child
        # and the child's PID in the parent.
        if pid == 0:
            self.launcher = self._child_process(wrap.service)  # The forked child takes over the service
            while True:
                self._child_process_handle_signal()
                status, signo = self._child_wait_for_exit_or_signal(
                    self.launcher)
                if not _is_sighup_and_daemon(signo):
                    self.launcher.wait()
                    break
                self.launcher.restart()  # On SIGHUP, restart the service in this child

            os._exit(status)

        LOG.debug('Started child %d', pid)

        wrap.children.add(pid)
        self.children[pid] = wrap

        return pid

It follows that the cinder-volume service started for each backend runs as an independent child process; in other words, each launched service lives in its own fork.

oslo_service/service.py: class Launcher(object)
    def restart(self):
        """Reload config files and restart service.
        :returns: The return value from reload_config_files or
          mutate_config_files, according to the restart_method.
        """
        if self.restart_method == 'reload':
            self.conf.reload_config_files()
        elif self.restart_method == 'mutate':
            self.conf.mutate_config_files()
        self.services.restart()
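
The two restart_method values differ in how much of the configuration is re-read on SIGHUP: 'reload' re-parses every config file, while 'mutate' applies only options that were registered as mutable. A sketch of declaring such an option (the option name is hypothetical):

from oslo_config import cfg

# Only options registered with mutable=True are picked up by
# conf.mutate_config_files(); anything else requires the full
# conf.reload_config_files() path (restart_method='reload').
cfg.CONF.register_opts([
    cfg.IntOpt('poll_interval', default=60, mutable=True),
])
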
oslo_service/service.py:class Services(object)
    def restart(self):
        """Reset services and start them in new threads."""
        self.stop()
        self.done = event.Event()
        for restart_service in self.services:
            restart_service.reset()
            self.tg.add_thread(self.run_service, restart_service, self.done)

    @staticmethod
    def run_service(service, done):
        """Service start wrapper.
        :param service: service to run
        :param done: event to wait on until a shutdown is triggered
        :returns: None
        """
        try:
            # Calls the service's start() method; for Cinder this resolves to
            # cinder/service.py Service.start(), since that class inherits
            # from oslo_service.service.Service.
            service.start()
        except Exception:
            LOG.exception('Error starting thread.')
            raise SystemExit(1)
        else:
            done.wait()

cinder/service.py: the start() method of class Service
    def start(self):
        version_string = version.version_string()
        LOG.info(_LI('Starting %(topic)s node (version %(version_string)s)'),
                 {'topic': self.topic, 'version_string': version_string})
        self.model_disconnected = False

        if self.coordination:
            coordination.COORDINATOR.start()

        # Initialize the backend driver
        self.manager.init_host(added_to_cluster=self.added_to_cluster,
                               service_id=Service.service_id)

        LOG.debug("Creating RPC server for service %s", self.topic)

        ctxt = context.get_admin_context()
        endpoints = [self.manager]
        endpoints.extend(self.manager.additional_endpoints)
        obj_version_cap = objects.Service.get_minimum_obj_version(ctxt)
        LOG.debug("Pinning object versions for RPC server serializer to %s",
                  obj_version_cap)
        serializer = objects_base.CinderObjectSerializer(obj_version_cap)

        target = messaging.Target(topic=self.topic, server=self.host)
        self.rpcserver = rpc.get_server(target, endpoints, serializer)
        self.rpcserver.start()

        # NOTE(dulek): Kids, don't do that at home. We're relying here on
        # oslo.messaging implementation details to keep backward compatibility
        # with pre-Ocata services. This will not matter once we drop
        # compatibility with them.
        if self.topic == constants.VOLUME_TOPIC:
            target = messaging.Target(
                topic='%(topic)s.%(host)s' % {'topic': self.topic,
                                              'host': self.host},
                server=vol_utils.extract_host(self.host, 'host'))
            self.backend_rpcserver = rpc.get_server(target, endpoints,
                                                    serializer)
            self.backend_rpcserver.start()

        # TODO(geguileo): In O - Remove the is_svc_upgrading_to_n part
        if self.cluster and not self.is_svc_upgrading_to_n(self.binary):
            LOG.info(_LI('Starting %(topic)s cluster %(cluster)s (version '
                         '%(version)s)'),
                     {'topic': self.topic, 'version': version_string,
                      'cluster': self.cluster})
            target = messaging.Target(
                topic='%s.%s' % (self.topic, self.cluster),
                server=vol_utils.extract_host(self.cluster, 'host'))
            serializer = objects_base.CinderObjectSerializer(obj_version_cap)
            self.cluster_rpcserver = rpc.get_server(target, endpoints,
                                                    serializer)
            self.cluster_rpcserver.start()

        self.manager.init_host_with_rpc()

        if self.report_interval:
            # Start the periodic task that reports state for the scheduler
            pulse = loopingcall.FixedIntervalLoopingCall(
                self.report_state)
            pulse.start(interval=self.report_interval,
                        initial_delay=self.report_interval)
            self.timers.append(pulse)

        if self.periodic_interval:
            if self.periodic_fuzzy_delay:
                initial_delay = random.randint(0, self.periodic_fuzzy_delay)
            else:
                initial_delay = None

            # Start the periodic polling task that runs the manager's
            # periodic_tasks
            periodic = loopingcall.FixedIntervalLoopingCall(
                self.periodic_tasks)
            periodic.start(interval=self.periodic_interval,
                           initial_delay=initial_delay)
            self.timers.append(periodic)
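
A minimal standalone sketch of the FixedIntervalLoopingCall pattern behind both timers above (the task body is illustrative):

from oslo_service import loopingcall

def report_state():
    # Stand-in for Service.report_state / Service.periodic_tasks
    print('reporting state...')

pulse = loopingcall.FixedIntervalLoopingCall(report_state)
# Mirrors pulse.start(interval=self.report_interval, initial_delay=...):
# first run after 10 seconds, then every 10 seconds.
pulse.start(interval=10, initial_delay=10)
pulse.wait()  # blocks until the timer is stopped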

cinder/volume/manager.py: VolumeManager
Analysis of the init_host() flow
    def init_host(self, added_to_cluster=None, **kwargs):
        """Perform any required initialization."""
        ctxt = context.get_admin_context()
        if not self.driver.supported:
            utils.log_unsupported_driver_warning(self.driver)

            if not self.configuration.enable_unsupported_driver:
                LOG.error(_LE("Unsupported drivers are disabled."
                              " You can re-enable by adding "
                              "enable_unsupported_driver=True to the "
                              "driver section in cinder.conf"),
                          resource={'type': 'driver',
                                    'id': self.__class__.__name__})
                return

        # If we have just added this host to a cluster we have to include all
        # our resources in that cluster.
        if added_to_cluster:
            self._include_resources_in_cluster(ctxt)

        LOG.info(_LI("Starting volume driver %(driver_name)s (%(version)s)"),
                 {'driver_name': self.driver.__class__.__name__,
                  'version': self.driver.get_version()})
        try:
            # Storage initialization happens in do_setup(). If do_setup()
            # fails, the service itself still starts, and the other backends
            # of a multi-backend deployment are unaffected.
            self.driver.do_setup(ctxt)
            self.driver.check_for_setup_error()
        except Exception:
            LOG.exception(_LE("Failed to initialize driver."),
                          resource={'type': 'driver',
                                    'id': self.__class__.__name__})
            # we don't want to continue since we failed
            # to initialize the driver correctly.
            return

        # Initialize backend capabilities list
        self.driver.init_capabilities()

        volumes = self._get_my_volumes(ctxt)
        snapshots = self._get_my_snapshots(ctxt)
        self._sync_provider_info(ctxt, volumes, snapshots)
        # FIXME volume count for exporting is wrong

        self.stats['pools'] = {}
        self.stats.update({'allocated_capacity_gb': 0})

        try:
            for volume in volumes:
                # available volume should also be counted into allocated
                if volume['status'] in ['in-use', 'available']:
                    # calculate allocated capacity for driver
                    self._count_allocated_capacity(ctxt, volume)

                    try:
                        if volume['status'] in ['in-use']:
                            self.driver.ensure_export(ctxt, volume)
                    except Exception:
                        LOG.exception(_LE("Failed to re-export volume, "
                                          "setting to ERROR."),
                                      resource=volume)
                        volume.conditional_update({'status': 'error'},
                                                  {'status': 'in-use'})
            # All other cleanups are processed by parent class CleanableManager

        except Exception:
            LOG.exception(_LE("Error during re-export on driver init."),
                          resource=volume)
            return

        self.driver.set_throttle()

        # at this point the driver is considered initialized.
        # NOTE(jdg): Careful though because that doesn't mean
        # that an entry exists in the service table
        # set_initialized() sets the driver's initialized attribute to True,
        # marking the driver as successfully started and fully initialized.
        self.driver.set_initialized()

        # Keep the image tmp file clean when init host.
        backend_name = vol_utils.extract_host(self.service_topic_queue)
        image_utils.cleanup_temporary_file(backend_name)

        # collect and publish service capabilities
        self.publish_service_capabilities(ctxt)
        LOG.info(_LI("Driver initialization completed successfully."),
                 resource={'type': 'driver',
                           'id': self.driver.__class__.__name__})

        # Make sure to call CleanableManager to do the cleanup
        super(VolumeManager, self).init_host(added_to_cluster=added_to_cluster,
                                             **kwargs)
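
The periodic_tasks callback that the timer in Service.start() keeps invoking ultimately dispatches to manager methods decorated with oslo_service's periodic_task decorator. A minimal sketch of declaring such a task (the manager and task here are illustrative, not Cinder's real ones):

from oslo_config import cfg
from oslo_service import periodic_task

class MiniManager(periodic_task.PeriodicTasks):
    def __init__(self):
        super(MiniManager, self).__init__(cfg.CONF)

    @periodic_task.periodic_task(spacing=60)
    def _publish_stats(self, context):
        # Runs roughly every 60 seconds once the looping-call timer
        # starts driving run_periodic_tasks().
        print('publishing backend stats...')

manager = MiniManager()
manager.run_periodic_tasks(None)  # what the looping call triggers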

Original source: https://www.cnblogs.com/potato-chip/p/12299220.html