[SaltStack] salt-minion启动流程

SaltStack源码阅读

前面理了下salt-master的启动流程, 这次来看看salt-minion的启动流程.

启动salt-minion方法:

/etc/init.d/salt-minion start

看看/etc/init.d/salt-minion的逻辑:

$ cat /etc/init.d/salt-minion

SALTMINION=/usr/bin/salt-minion
PYTHON=/usr/bin/python
MINION_ARGS=""
	
# Start the salt-minion daemon.
# Picks one of three launch paths depending on which release/version marker
# file exists, then returns RETVAL.
start() {
	echo -n $"Starting salt-minion daemon: "
	# First path: the file named by $SUSE_RELEASE exists -> launch via startproc.
	if [ -f $SUSE_RELEASE ]; then
		startproc -f -p /var/run/$SERVICE.pid $SALTMINION -d $MINION_ARGS
		rc_status -v
	# Second path: the file named by $DEBIAN_VERSION exists.
	elif [ -e $DEBIAN_VERSION ]; then
		# An existing lock file is treated as "already started".
		if [ -f $LOCKFILE ]; then
			echo -n "already started, lock file found"
			RETVAL=1
		# Otherwise run the minion script with the Python interpreter in daemon mode (-d).
		elif $PYTHON $SALTMINION -d $MINION_ARGS >& /dev/null; then
			echo -n "OK"
			RETVAL=0
		fi
	# Fallback path: use pidofproc/daemon.
	else
		# A non-empty pidofproc result means a process is already running.
		if [[ ! -z "$(pidofproc -p /var/run/$SERVICE.pid $PROCESS)" ]]; then
			RETVAL=$?
			echo -n "already running"
		else
			daemon --check $SERVICE $SALTMINION -d $MINION_ARGS
		fi
	fi
	# NOTE(review): this unconditionally overwrites any RETVAL assigned in the
	# branches above with the exit status of the preceding if-statement.
	RETVAL=$?
	echo
	return $RETVAL
}

继续看看/usr/bin/salt-minion:

$ cat /usr/bin/salt-minion

#!/usr/bin/python
'''
This script is used to kick off a salt minion daemon
'''

from salt.scripts import salt_minion
from multiprocessing import freeze_support


if __name__ == '__main__':
	# This handles the bootstrapping code that is included with frozen
	# scripts. It is a no-op on unfrozen code.
	freeze_support()
	# Hand control to salt.scripts.salt_minion, which starts the minion.
	salt_minion()

调用salt_minion()方法, 在scripts.py里:

$ cat scripts.py

def salt_minion():
    '''
    Start the salt minion.

    On Windows, or when ``--disable-keepalive`` is present in ``sys.argv``,
    the minion is started directly in this process.  Otherwise this function
    acts as a supervisor: it keeps one minion subprocess running and, when
    that subprocess ends, restarts it after the delay the child reported on
    the queue (a reported delay of ``0`` ends the loop).
    '''
    import salt.cli.daemons
    import multiprocessing
    if '' in sys.path:
        sys.path.remove('')

    if salt.utils.is_windows():
        minion = salt.cli.daemons.Minion()
        minion.start()
        return

    if '--disable-keepalive' in sys.argv:
        sys.argv.remove('--disable-keepalive')
        minion = salt.cli.daemons.Minion()
        minion.start()
        return

    # keep one minion subprocess running
    while True:
        try:
            queue = multiprocessing.Queue()
        except Exception:
            # This breaks in containers
            minion = salt.cli.daemons.Minion()
            minion.start()
            return
        process = multiprocessing.Process(target=minion_process, args=(queue,))
        process.start()
        try:
            process.join()
            try:
                restart_delay = queue.get(block=False)
            except Exception:
                # Nothing was reported on the queue: fall back to the exit code.
                if process.exitcode == 0:
                    # Minion process ended naturally, Ctrl+C or --version
                    break
                restart_delay = 60
            if restart_delay == 0:
                # Minion process ended naturally, Ctrl+C, --version, etc.
                break
            # delay restart to reduce flooding and allow network resources to close
            time.sleep(restart_delay)
        except KeyboardInterrupt:
            break
        # need to reset logging because new minion objects
        # cause extra log handlers to accumulate
        rlogger = logging.getLogger()
        # Iterate over a copy: removeHandler() mutates rlogger.handlers, and
        # removing while iterating the live list skips every other handler.
        for handler in list(rlogger.handlers):
            rlogger.removeHandler(handler)
        logging.basicConfig()

这里启动minion使用了multiprocessing.Process方法, target是minion_process函数, 主要流程是:

def minion_process(queue):
    '''
    Start a minion process

    Spawned by ``salt_minion`` in a new process.  When the minion stops, one
    integer is put on ``queue``: a random restart delay in seconds if the
    minion failed, or ``0`` if it exited via ``SystemExit`` or returned
    normally.
    '''
    import salt.cli.daemons
    # salt_minion spawns this function in a new process

    def suicide_when_without_parent(parent_pid):
        '''
        Have the minion suicide if the parent process is gone

        NOTE: there is a small race issue where the parent PID could be replaced
        with another process with the same PID!
        '''
        while True:
            time.sleep(5)
            try:
                # check pid alive (Unix only trick!)
                os.kill(parent_pid, 0)
            except OSError:
                # forcibly exit, regular sys.exit raises an exception-- which
                # isn't sufficient in a thread
                os._exit(999)
    if not salt.utils.is_windows():
        thread = threading.Thread(target=suicide_when_without_parent, args=(os.getppid(),))
        thread.start()

    restart = False
    minion = None
    try:
        minion = salt.cli.daemons.Minion()
        minion.start()
    except (Exception, SaltClientError, SaltReqTimeoutError, SaltSystemExit) as exc:
        log.error('Minion failed to start: ', exc_info=True)
        restart = True
    except SystemExit as exc:
        restart = False

    if restart is True:
        log.warn('** Restarting minion **')
        delay = 60
        if minion is not None:
            if hasattr(minion, 'config'):
                delay = minion.config.get('random_reauth_delay', 60)
        # randint(1, delay) raises ValueError for delay < 1, so clamp the
        # upper bound; the reported delay is then always at least 1 second.
        random_delay = randint(1, max(1, delay))
        log.info('Sleeping random_reauth_delay of {0} seconds'.format(random_delay))
        # perform delay after minion resources have been cleaned
        queue.put(random_delay)
    else:
        queue.put(0)

这里调用了salt.cli.daemons.Minion的start方法, 目标类文件在: ~/salt/cli/daemons.py

class Minion(parsers.MinionOptionParser):
    '''
    Create a minion server
    '''

    def prepare(self):
        '''
        Run the preparation sequence required to start a salt minion.

        Parses the command line, verifies the directories and log file the
        minion needs, sets up file logging, daemonizes if required and
        finally creates ``self.minion`` according to the configured
        transport and master settings.

        If sub-classed, don't **ever** forget to run:

            super(YourSubClass, self).prepare()
        '''
        self.parse_args()

        try:
            if self.config['verify_env']:
                confd = self.config.get('default_include')
                if confd:
                    # If 'default_include' is specified in config, then use it
                    if '*' in confd:
                        # Value is of the form "minion.d/*.conf"
                        confd = os.path.dirname(confd)
                    if not os.path.isabs(confd):
                        # If configured 'default_include' is not an absolute
                        # path, consider it relative to folder of 'conf_file'
                        # (/etc/salt by default)
                        confd = os.path.join(
                            os.path.dirname(self.config['conf_file']), confd
                        )
                else:
                    confd = os.path.join(
                        os.path.dirname(self.config['conf_file']), 'minion.d'
                    )
                v_dirs = [
                    self.config['pki_dir'],
                    self.config['cachedir'],
                    self.config['sock_dir'],
                    self.config['extension_modules'],
                    confd,
                ]
                if self.config.get('transport') == 'raet':
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'accepted'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'pending'))
                    v_dirs.append(os.path.join(self.config['pki_dir'], 'rejected'))
                    v_dirs.append(os.path.join(self.config['cachedir'], 'raet'))
                verify_env(
                    v_dirs,
                    self.config['user'],
                    permissive=self.config['permissive_pki_access'],
                    pki_dir=self.config['pki_dir'],
                )
                logfile = self.config['log_file']
                if logfile is not None and not logfile.startswith(('tcp://',
                                                                   'udp://',
                                                                   'file://')):
                    # Logfile is not using Syslog, verify
                    current_umask = os.umask(0o027)
                    verify_files([logfile], self.config['user'])
                    os.umask(current_umask)
        except OSError as err:
            logger.exception('Failed to prepare salt environment')
            sys.exit(err.errno)

        self.setup_logfile_logger()
        logger.info(
            'Setting up the Salt Minion "{0}"'.format(
                self.config['id']
            )
        )
        migrations.migrate_paths(self.config)
        # TODO: AIO core is separate from transport
        if self.config['transport'].lower() in ('zeromq', 'tcp'):
            # Late import so logging works correctly
            import salt.minion
            # If the minion key has not been accepted, then Salt enters a loop
            # waiting for it, if we daemonize later then the minion could halt
            # the boot process waiting for a key to be accepted on the master.
            # This is the latest safe place to daemonize
            self.daemonize_if_required()
            self.set_pidfile()
            # A list of masters selects MultiMinion unless master_type is
            # 'failover'; everything else uses a plain Minion.
            if isinstance(self.config.get('master'), list):
                if self.config.get('master_type') == 'failover':
                    self.minion = salt.minion.Minion(self.config)
                else:
                    self.minion = salt.minion.MultiMinion(self.config)
            else:
                self.minion = salt.minion.Minion(self.config)
        else:
            import salt.daemons.flo
            self.daemonize_if_required()
            self.set_pidfile()
            self.minion = salt.daemons.flo.IofloMinion(self.config)

    def start(self):
        '''
        Start the actual minion.

        Calls ``prepare()`` and, if ``check_user`` accepts the configured
        user, hands over to ``self.minion.tune_in()``.  ``self.shutdown()``
        always runs afterwards.

        If sub-classed, don't **ever** forget to run:

            super(YourSubClass, self).start()

        NOTE: Run any required code before calling `super()`.
        '''
        try:
            self.prepare()
            if check_user(self.config['user']):
                logger.info('The salt minion is starting up')
                # hand over to the minion object, which connects to the master(s)
                self.minion.tune_in()
        except (KeyboardInterrupt, SaltSystemExit) as exc:
            logger.warn('Stopping the Salt Minion')
            if isinstance(exc, KeyboardInterrupt):
                logger.warn('Exiting on Ctrl-c')
            else:
                logger.error(str(exc))
        finally:
            self.shutdown()

在这里minion使用prepare方法准备salt minion参数和环境配置, 然后通过self.minion.tune_in()连接master的publisher, 和master建立通信机制, 目标文件在: ~/salt/minion.py

class MultiMinion(MinionBase):
    '''
    Create a multi minion interface, this creates as many minions as are
    defined in the master option and binds each minion object to a respective
    master.

    Article note: this class is used to start the minion when the minion
    config lists multiple masters (and master_type is not 'failover').
    '''

    # Multi Master Tune In
    def tune_in(self):
        '''
        Bind to the masters

        This loop will attempt to create connections to masters it hasn't connected
        to yet, but once the initial connection is made it is up to ZMQ to do the
        reconnect (don't know of an API to get the state here in salt)
        '''
        # Fire off all the minion coroutines
        self.minions = self._spawn_minions()

        # serve forever!
        self.io_loop.start()

这里tune_in()是关键的步骤, minion链接master, 维护minion Pub/Sub socket通信等等.

先看看tune_in()方法:

# Multi Master Tune In
def tune_in(self):
    '''
    Bind to the masters

    This loop will attempt to create connections to masters it hasn't connected
    to yet, but once the initial connection is made it is up to ZMQ to do the
    reconnect (don't know of an API to get the state here in salt)
    '''
    # Fire off all the minion coroutines
    self.minions = self._spawn_minions()

    # serve forever!
    self.io_loop.start()  # start the io_loop (asynchronous, event-driven)
    
def _spawn_minions(self):
    '''
    Spawn all the coroutines which will sign in to masters

    Exits the process with EX_GENERIC if ``self.opts['master']`` is not a
    list.  Otherwise schedules one ``_connect_minion`` callback per distinct
    master, each with its own deep copy of the options.

    NOTE(review): there is no return statement, so this returns None.
    '''
    if not isinstance(self.opts['master'], list):
        log.error(
            'Attempting to start a multimaster system with one master')
        sys.exit(salt.defaults.exitcodes.EX_GENERIC)
    # set() drops duplicate master entries
    for master in set(self.opts['master']):
        # each minion gets a private copy of opts pinned to a single master
        s_opts = copy.deepcopy(self.opts)
        s_opts['master'] = master
        s_opts['multimaster'] = True
        s_opts['auth_timeout'] = self.MINION_CONNECT_TIMEOUT
        self.io_loop.spawn_callback(self._connect_minion, s_opts)  # the callback invokes _connect_minion


def _connect_minion(self, opts):
    '''
    Create a minion, and asynchronously connect it to a master

    Retries in a loop until ``connect_master()`` succeeds, then calls the
    minion's ``tune_in(start=False)`` and stops.

    NOTE(review): the body uses ``yield``, so this is a generator; presumably
    it is wrapped by a tornado coroutine decorator that is not shown in this
    excerpt -- confirm against the full file.
    '''
    last = 0  # never have we signed in
    auth_wait = opts['acceptance_wait_time']
    while True:
        try:
            minion = Minion(opts,
                            self.MINION_CONNECT_TIMEOUT,
                            False,
                            io_loop=self.io_loop,
                            loaded_base_name='salt.loader.{0}'.format(opts['master']),
                            )
            yield minion.connect_master()
            minion.tune_in(start=False)
            break
        except SaltClientError as exc:
            log.error('Error while bringing up minion for multi-master. Is master at {0} responding?'.format(opts['master']))
            # NOTE(review): 'last' is assigned here but never read in this block.
            last = time.time()
            # grow the wait by self.auth_wait while it is below self.max_auth_wait
            if auth_wait < self.max_auth_wait:
                auth_wait += self.auth_wait
            yield tornado.gen.sleep(auth_wait)  # TODO: log?
        except Exception as e:
            # NOTE(review): no sleep on this path, so the loop retries immediately.
            log.critical('Unexpected error while connecting to {0}'.format(opts['master']), exc_info=True)

到这里大家会发现tune_in进入循环了... 没错! minion server进程会无限循环下去, 维护minion Pub/Sub socket, 保持和salt master的链接, 监听event事件完成Job工作等等.

接下来看看connect_master()方法:

def connect_master(self):
    '''
    Return a future which will complete when you are connected to a master
    '''
    # eval_master yields a (master, pub_channel) pair; keep the channel on self
    master, self.pub_channel = yield self.eval_master(self.opts, self.timeout, self.safe)

    # finish initialisation now that the master is known
    yield self._post_master_init(master)

eval_master方法计算并返回master地址以及生成pub_channel;

_post_master_init方法是在链接完成后进行初始化;

在eval_master里, 如果master_type是func, 则会load指定的module, 并调用其中的函数得到master地址; 如果master_type是failover, 则会获取master address list. 之后会调用salt.transport.client.AsyncPubChannel.factory()生成pub_channel, 调用connect()方法链接master主机, 具体逻辑是下面这样的:

def eval_master(self,
                opts,
                timeout=60,
                safe=True,
                failed=False):
    '''
    Evaluate which master to use and connect a publish channel to it.

    ``opts['master_type']`` selects how ``opts['master']`` is interpreted:
    ``'str'`` uses it as is, ``'func'`` calls ``module.function`` to obtain
    the address, and ``'failover'`` expects a list of masters.  Any other
    value logs an error and exits with EX_GENERIC.

    With a list of masters each one is tried in order until a connection
    succeeds.  On success ``(opts['master'], pub_channel)`` is returned via
    ``tornado.gen.Return``.  If no master in the list can be reached, only
    ``self.connected = False`` is set and nothing is returned.

    :param opts: minion options; updated in place
    :param timeout: passed to the pub channel factory
    :param safe: passed to the pub channel factory
    :param failed: True when called after a master connection loss, so the
        current master is dropped from the candidate list
    '''

    # check if master_type was altered from its default
    if opts['master_type'] != 'str':
        # check for a valid keyword
        if opts['master_type'] == 'func':  # master address comes from a function
            # split module and function and try loading the module
            mod, fun = opts['master'].split('.')
            try:
                master_mod = salt.loader.raw_mod(opts, mod, fun)  # load the module
                if not master_mod:
                    raise TypeError
                # we take whatever the module returns as master address
                opts['master'] = master_mod[mod + '.' + fun]()  # returns the master address
            except TypeError:
                msg = ('Failed to evaluate master address from '
                       'module \'{0}\''.format(opts['master']))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
            log.info('Evaluated master from module: {0}'.format(master_mod))

        # if failover is set, master has to be of type list
        elif opts['master_type'] == 'failover':  # failover type
            if isinstance(opts['master'], list):
                # master is already a list, as failover requires
                log.info('Got list of available master addresses:'
                         ' {0}'.format(opts['master']))
                if opts['master_shuffle']:
                    # If master is a list of addresses, shuffle them before
                    # trying to connect to distribute the minions over all
                    # available masters.
                    shuffle(opts['master'])
            # if opts['master'] is a str and we have never created opts['master_list']
            elif isinstance(opts['master'], str) and ('master_list' not in opts):
                # We have a string, but a list was what was intended. Convert.
                # See issue 23611 for details
                # NOTE(review): list() on a str yields a list of its single
                # characters, not a one-element list.
                opts['master'] = list(opts['master'])
            elif opts['__role'] == 'syndic':  # syndic (second-level master)
                log.info('Syndic setting master_syndic to \'{0}\''.format(opts['master']))

            # if failed=True, the minion was previously connected
            # we're probably called from the minions main-event-loop
            # because a master connection loss was detected. remove
            # the possibly failed master from the list of masters.
            elif failed:
                log.info('Removing possibly failed master {0} from list of'
                         ' masters'.format(opts['master']))
                # create new list of master with the possibly failed one removed
                opts['master'] = [x for x in opts['master_list'] if opts['master'] != x]

            else:
                msg = ('master_type set to \'failover\' but \'master\' '
                       'is not of type list but of type '
                       '{0}'.format(type(opts['master'])))
                log.error(msg)
                sys.exit(salt.defaults.exitcodes.EX_GENERIC)
        else:
            msg = ('Invalid keyword \'{0}\' for variable '
                   '\'master_type\''.format(opts['master_type']))
            log.error(msg)
            sys.exit(salt.defaults.exitcodes.EX_GENERIC)

    # if we have a list of masters, loop through them and be
    # happy with the first one that allows us to connect
    if isinstance(opts['master'], list):
        conn = False
        # shuffle the masters and then loop through them
        local_masters = copy.copy(opts['master'])

        for master in local_masters:
            opts['master'] = master
            opts.update(resolve_dns(opts))
            super(Minion, self).__init__(opts)  # TODO: only run init once?? This will run once per attempt

            # on first run, update self.opts with the whole master list
            # to enable a minion to re-use old masters if they get fixed
            if 'master_list' not in opts:
                opts['master_list'] = local_masters

            try:
                pub_channel = salt.transport.client.AsyncPubChannel.factory(opts,
                                                                            timeout=timeout,
                                                                            safe=safe,
                                                                            io_loop=self.io_loop,
                                                                            )
                yield pub_channel.connect()
                conn = True
                break
            except SaltClientError:
                # connecting to this master failed, try the next one
                msg = ('Master {0} could not be reached, trying '
                       'next master (if any)'.format(opts['master']))
                log.info(msg)
                continue

        if not conn:
            self.connected = False
            msg = ('No master could be reached or all masters denied '
                   'the minions connection attempt.')
            log.error(msg)
        else:
            self.connected = True
            raise tornado.gen.Return((opts['master'], pub_channel))

    # single master sign in
    else:
        opts.update(resolve_dns(opts))  # resolve the master's IP address via DNS
        pub_channel = salt.transport.client.AsyncPubChannel.factory(self.opts,
                                                                    timeout=timeout,
                                                                    safe=safe,
                                                                    io_loop=self.io_loop,
                                                                    )
        yield pub_channel.connect()
        self.tok = pub_channel.auth.gen_token('salt')
        self.connected = True
        raise tornado.gen.Return((opts['master'], pub_channel))

可以看出在上面的逻辑中, 无论是multi master还是single master, 最终都会调用salt.transport.client.AsyncPubChannel类的factory()和connect()方法来完成connect master, 目标文件在: ~/salt/transport/client.py里.

class AsyncPubChannel(AsyncChannel):
    '''
    Factory class to create subscription channels to the master's Publisher
    '''
    @classmethod
    def factory(cls, opts, **kwargs):
        '''
        Return the pub channel implementation for the configured transport.

        The transport type defaults to ``'zeromq'`` and is taken from
        ``opts['transport']`` if present, otherwise from
        ``opts['pillar']['master']['transport']`` if present.  The matching
        transport module is imported lazily.  An unknown transport type
        raises ``Exception``.
        '''
        # Default to ZeroMQ for now
        ttype = 'zeromq'

        # determine the ttype
        if 'transport' in opts:
            ttype = opts['transport']
        elif 'transport' in opts.get('pillar', {}).get('master', {}):
            ttype = opts['pillar']['master']['transport']

        # switch on available ttypes
        if ttype == 'zeromq':
            import salt.transport.zeromq
            return salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)
        elif ttype == 'raet':  # TODO:
            import salt.transport.raet
            return salt.transport.raet.AsyncRAETPubChannel(opts, **kwargs)
        elif ttype == 'tcp':
            if not cls._resolver:
                # TODO: add opt to specify number of resolver threads
                AsyncChannel._init_resolver()
            import salt.transport.tcp
            return salt.transport.tcp.AsyncTCPPubChannel(opts, **kwargs)
        elif ttype == 'local':  # TODO:
            import salt.transport.local
            return salt.transport.local.AsyncLocalPubChannel(opts, **kwargs)
        else:
            raise Exception('Channels are only defined for ZeroMQ and raet')
            # return NewKindOfChannel(opts, **kwargs)

现在基本都是基于zeromq通信的, salt.transport.zeromq.AsyncZeroMQPubChannel(opts, **kwargs)方法处理pub_channel, 目标类文件在: ~/salt/transport/zeromq.py

class AsyncZeroMQPubChannel(salt.transport.mixins.auth.AESPubClientMixin, salt.transport.client.AsyncPubChannel):
    '''
    A transport channel backed by ZeroMQ for a Salt Publisher to use to
    publish commands to connected minions
    '''
    def __init__(self,
                 opts,
                 **kwargs):
        '''
        Create the SUB socket and apply its socket options.

        :param opts: minion options
        :param kwargs: may contain ``io_loop``; when it does not, a new
            tornado IOLoop is created
        '''
        self.opts = opts
        self.ttype = 'zeromq'

        if 'io_loop' in kwargs:
            self.io_loop = kwargs['io_loop']
        else:
            self.io_loop = tornado.ioloop.IOLoop()  # IOLoop for network events

        self.hexid = hashlib.sha1(self.opts['id']).hexdigest()

        self.auth = salt.crypt.AsyncAuth(self.opts, io_loop=self.io_loop)  # asynchronous encrypted authentication

        self.serial = salt.payload.Serial(self.opts)

        self.context = zmq.Context()  # ZMQ publish/subscribe (PUB/SUB) model
        self._socket = self.context.socket(zmq.SUB)  # create the SUB (client-side) zmq socket

        if self.opts['zmq_filtering']:
            # TODO: constants file for "broadcast"
            self._socket.setsockopt(zmq.SUBSCRIBE, 'broadcast')
            self._socket.setsockopt(zmq.SUBSCRIBE, self.hexid)
        else:
            self._socket.setsockopt(zmq.SUBSCRIBE, '')

        # NOTE(review): this empty-prefix subscription runs even when
        # zmq_filtering is enabled above -- confirm it is intended.
        self._socket.setsockopt(zmq.SUBSCRIBE, '')
        self._socket.setsockopt(zmq.IDENTITY, self.opts['id'])  # the id from the minion's own config

        # TODO: cleanup all the socket opts stuff
        if hasattr(zmq, 'TCP_KEEPALIVE'):
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE, self.opts['tcp_keepalive']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_IDLE, self.opts['tcp_keepalive_idle']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_CNT, self.opts['tcp_keepalive_cnt']
            )
            self._socket.setsockopt(
                zmq.TCP_KEEPALIVE_INTVL, self.opts['tcp_keepalive_intvl']
            )

        recon_delay = self.opts['recon_default']

        if self.opts['recon_randomize']:
            recon_delay = randint(self.opts['recon_default'],
                                  self.opts['recon_default'] + self.opts['recon_max']
                          )

            log.debug("Generated random reconnect delay between '{0}ms' and '{1}ms' ({2})".format(
                self.opts['recon_default'],
                self.opts['recon_default'] + self.opts['recon_max'],
                recon_delay)
            )

        log.debug("Setting zmq_reconnect_ivl to '{0}ms'".format(recon_delay))
        self._socket.setsockopt(zmq.RECONNECT_IVL, recon_delay)

        if hasattr(zmq, 'RECONNECT_IVL_MAX'):
            # NOTE(review): the logged value is recon_default + recon_max, but
            # the value actually set below is recon_max.
            log.debug("Setting zmq_reconnect_ivl_max to '{0}ms'".format(
                self.opts['recon_default'] + self.opts['recon_max'])
            )

            self._socket.setsockopt(
                zmq.RECONNECT_IVL_MAX, self.opts['recon_max']
            )

        if self.opts['ipv6'] is True and hasattr(zmq, 'IPV4ONLY'):
            # IPv6 sockets work for both IPv6 and IPv4 addresses
            self._socket.setsockopt(zmq.IPV4ONLY, 0)

        if HAS_ZMQ_MONITOR and self.opts['zmq_monitor']:
            self._monitor = ZeroMQSocketMonitor(self._socket)
            self._monitor.start_io_loop(self.io_loop)

    # TODO: this is the time to see if we are connected, maybe use the req channel to guess?
    @tornado.gen.coroutine
    def connect(self):
        '''
        Authenticate if needed, then connect the SUB socket to the master.
        '''
        if not self.auth.authenticated:  # authenticate first if not already authenticated
            yield self.auth.authenticate()
        self.publish_port = self.auth.creds['publish_port']
        self._socket.connect(self.master_pub)  # connect to the master

看到这里也就明白了, minion在启动后最终使用了ZMQ库的Pub/Sub模型, connect方法链接master机器.

至此salt-minion算是启动起来了 -_-


先去喝杯水, 我想静静 -😃

接下来需要狠狠脑补下zeroMQ等网络通信机制.

From reno

2015-07-10 14:59

posted @ 2015-07-10 15:19  Renolei  阅读(10405)  评论(0)  编辑  收藏  举报