linux负载均衡(四)migration线程
在本系列第二篇中,我们讲到,如果在sched_balance_rq中没能成功迁移进程,进而需要active balance,那么会让migration线程去进行更激进的线程迁移。本篇来介绍下migration线程。
在linux系统中使用ps aux | grep migration,我们会看到一系列形如"migration/%u"的内核线程(%u是所在cpu的编号,例如migration/0)。它的定义在kernel/stop_machine.c中。
/*
 * Hotplug-thread descriptor for the per-CPU stopper threads. It is the
 * object handed to smpboot_register_percpu_thread() by cpu_stop_init().
 */
static struct smp_hotplug_thread cpu_stop_threads = {
	/* Per-CPU slot tied to cpu_stopper.thread (presumably receives the task pointer). */
	.store = &cpu_stopper.thread,
	/* Predicate consulted to decide whether thread_fn has to run. */
	.thread_should_run = cpu_stop_should_run,
	/* The work function of the thread. */
	.thread_fn = cpu_stopper_thread,
	/* Name format; %u is filled with the CPU number. */
	.thread_comm = "migration/%u",
	.create = cpu_stop_create,
	.park = cpu_stop_park,
	.selfparking = true,
};
smp_hotplug_thread是cpu hotplug相关的线程描述符,其中重要的成员有thread_should_run,用来判断是否要执行线程函数;thread_fn指向线程函数;thread_comm就是该线程的名字了,命名方式是migration/,并以当前的cpuid结尾。这是一个per cpu线程,每个cpu都会有一个。它会在cpu_stop_init中被初始化。
/*
 * Early-initcall setup of the CPU stopper: initialise the cpu_stopper of
 * every possible CPU, register the per-CPU stopper threads, unpark the
 * stopper of the CPU executing this function and flag the facility as
 * initialised. Always returns 0.
 */
static int __init cpu_stop_init(void)
{
	unsigned int cpu;

	/* Set up the lock and the (empty) work list of each possible CPU. */
	for_each_possible_cpu(cpu) {
		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

		raw_spin_lock_init(&stopper->lock);
		INIT_LIST_HEAD(&stopper->works);
	}

	/* A non-zero return from the registration trips BUG_ON(). */
	BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
	stop_machine_unpark(raw_smp_processor_id());
	stop_machine_initialized = true;
	return 0;
}
early_initcall(cpu_stop_init);
该函数为每个cpu的per cpu变量cpu_stopper初始化锁和works list,调用smpboot_register_percpu_thread注册cpu_stop_threads。early_initcall(cpu_stop_init)表明该函数会在内核初始化时被调用。
/*
 * Excerpt ("..." marks code omitted by the article).
 *
 * For every online CPU, create the thread described by @plug_thread and
 * unpark it; afterwards link the descriptor into hotplug_threads.
 * If a creation fails, the threads are destroyed and control jumps to the
 * `out` label (not shown). Returns ret.
 */
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
	...
	for_each_online_cpu(cpu) {
		ret = __smpboot_create_thread(plug_thread, cpu);
		if (ret) {
			/* Undo what was created so far and bail out. */
			smpboot_destroy_threads(plug_thread);
			goto out;
		}
		smpboot_unpark_thread(plug_thread, cpu);
	}
	list_add(&plug_thread->list, &hotplug_threads);
	...
	return ret;
}
可以看到它为每个online的cpu创建(__smpboot_create_thread)并unpark(smpboot_unpark_thread)一个线程,对cpu_stop_threads来说就是migration线程。
内核线程被创建后会贯穿整个系统的生命周期。它平时睡眠,有事做的时候会被唤醒。唤醒它的接口是stop_one_cpu和stop_one_cpu_nowait,区别仅在于调用者是否会等待所提交的任务(fn)执行完成:stop_one_cpu会等待,stop_one_cpu_nowait把任务放入队列后立即返回。
/*
 * Queue @fn(@arg) for the stopper of @cpu and return without waiting.
 *
 * @cpu:      CPU whose stopper should run the work
 * @fn:       callback to execute
 * @arg:      argument passed to @fn
 * @work_buf: caller-provided storage for the work item; it is overwritten
 *            here, and members not named in the initializer become zero
 *
 * Returns whatever cpu_stop_queue_work() returns.
 */
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			 struct cpu_stop_work *work_buf)
{
	/* _RET_IP_ presumably records the call site of this function. */
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
	return cpu_stop_queue_work(cpu, work_buf);
}
stop_one_cpu_nowait的入参解释。cpu表示唤醒哪个cpu上的migration线程,fn是要执行的函数,arg是函数参数,work_buf是任务结构暂存点。cpu_stop_queue_work将任务加入队列。
/*
 * Excerpt ("..." marks code omitted by the article, including where
 * `enabled`, the return value, is computed).
 *
 * Hands @work to the stopper of @cpu and then wakes the tasks collected
 * on the local wake queue.
 */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	DEFINE_WAKE_Q(wakeq);

	...
	/*
	 * Helper body is not shown; presumably it links @work into
	 * stopper->works and puts the stopper task on wakeq.
	 */
	__cpu_stop_queue_work(stopper, work, &wakeq);
	...
	wake_up_q(&wakeq);
	return enabled;
}
__cpu_stop_queue_work会将work添加到stopper->works,将stopper->thread添加到wakeq。stopper->thread是在__smpboot_create_thread中赋值的,值指向migration线程的task struct结构体。wake_up_q会将migration线程唤醒。
在migration线程被唤醒后,执行的第一个函数是smpboot_thread_fn,它在创建线程的时候设置。
/*
 * Abbreviated excerpt: the return type, the local declarations and the
 * rest of the body are omitted by the article.
 *
 * The point shown is that the per-CPU kthread is created with
 * smpboot_thread_fn as its entry function, td as its data, bound to @cpu,
 * and named from ht->thread_comm.
 */
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
	tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm);
}
smpboot_thread_fn会在一个while循环中执行任务。
/*
 * Excerpt ("..." marks code omitted by the article).
 *
 * Common main loop of a hotplug thread. @data is the thread's
 * smpboot_thread_data. Each iteration either sleeps, when
 * thread_should_run() reports nothing to do, or calls thread_fn().
 * The visible code never leaves the loop.
 */
static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td->ht;

	while (1) {
		/* Mark ourselves sleeping before testing for work. */
		set_current_state(TASK_INTERRUPTIBLE);
		preempt_disable();
		...
		if (!ht->thread_should_run(td->cpu)) {
			/* Nothing to do: give up the CPU. */
			preempt_enable_no_resched();
			schedule();
		} else {
			/* Work pending: go back to running state and do it. */
			__set_current_state(TASK_RUNNING);
			preempt_enable();
			ht->thread_fn(td->cpu);
		}
	}
}
migration线程会调用thread_should_run判断当前是否有任务要做,如果有就调用thread_fn,也就是cpu_stopper_thread。
/*
 * Work function of the per-CPU stopper thread for @cpu: repeatedly take
 * the first entry off stopper->works and run its callback, until the
 * list is found empty.
 */
static void cpu_stopper_thread(unsigned int cpu)
{
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
	struct cpu_stop_work *work;

repeat:
	work = NULL;
	/* Dequeue at most one work item while holding the stopper lock. */
	raw_spin_lock_irq(&stopper->lock);
	if (!list_empty(&stopper->works)) {
		work = list_first_entry(&stopper->works,
					struct cpu_stop_work, list);
		list_del_init(&work->list);
	}
	raw_spin_unlock_irq(&stopper->lock);

	if (work) {
		/* Copy the fields out of the work item before calling fn. */
		cpu_stop_fn_t fn = work->fn;
		void *arg = work->arg;
		struct cpu_stop_done *done = work->done;
		int ret;

		/* cpu stop callbacks must not sleep, make in_atomic() == T */
		stopper->caller = work->caller;
		stopper->fn = fn;
		preempt_count_inc();
		ret = fn(arg);
		/* Completion is reported only when a done structure was supplied. */
		if (done) {
			/* A zero result leaves done->ret untouched. */
			if (ret)
				done->ret = ret;
			cpu_stop_signal_done(done);
		}
		preempt_count_dec();
		stopper->fn = NULL;
		stopper->caller = 0;
		/* Warn if the callback returned with an unbalanced preempt count. */
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
		/* Check for further queued work. */
		goto repeat;
	}
}
cpu_stopper_thread会从stopper->works中逐个取出要做的任务并执行,直到队列为空才返回。任务中最重要的就是fn和arg,fn就是我们最终要执行的函数,arg是它的参数。
了解了migration线程的工作原理,下面我们来看load balance是怎么使用它的。
在解析sched_balance_rq的时候我们提到,如果需要做active balance会调用stop_one_cpu_nowait。
/*
 * Truncated excerpt of sched_balance_rq() ("..." marks code omitted by the
 * article; the rest of the function is not shown).
 *
 * Shown here: when active balancing is needed and none is already pending
 * on the busiest runqueue, mark it pending, record this CPU as the push
 * target, and queue active_load_balance_cpu_stop() on the busiest CPU's
 * stopper without waiting.
 */
static int sched_balance_rq(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	...
	if (need_active_balance(&env)) {
		...
		/* Only claim the request if no active balance is already flagged. */
		if (!busiest->active_balance) {
			busiest->active_balance = 1;
			/* Destination of the push is the CPU running this function. */
			busiest->push_cpu = this_cpu;
			active_balance = 1;
		}

		preempt_disable();
		raw_spin_rq_unlock_irqrestore(busiest, flags);
		if (active_balance) {
			/* The work item storage lives inside the busiest rq itself. */
			stop_one_cpu_nowait(cpu_of(busiest),
				active_load_balance_cpu_stop, busiest,
				&busiest->active_balance_work);
		}
		...
从上面的介绍可知,active_load_balance_cpu_stop是migration线程最终要执行的函数,busiest是函数参数。这里要区分两个cpu:busiest->push_cpu被赋值为当前的cpu(this_cpu),它是迁移进程的目的cpu;而要被stop的cpu,也就是被唤醒的migration线程所在的cpu,是busiest所在的cpu(cpu_of(busiest))。
/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data is the busiest runqueue; the destination CPU is read from its
 * push_cpu field. At most one task is detached and attached per call.
 * Always returns 0.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	struct task_struct *p = NULL;
	struct rq_flags rf;

	rq_lock_irq(busiest_rq, &rf);
	/*
	 * Between queueing the stop-work and running it is a hole in which
	 * CPUs can become inactive. We should not move tasks from or to
	 * inactive CPUs.
	 */
	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
		goto out_unlock;

	/* Make sure the requested CPU hasn't gone down in the meantime: */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-CPU setup.
	 */
	WARN_ON_ONCE(busiest_rq == target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		/* Describe a one-shot move from busiest_rq to target_rq. */
		struct lb_env env = {
			.sd = sd,
			.dst_cpu = target_cpu,
			.dst_rq = target_rq,
			.src_cpu = busiest_rq->cpu,
			.src_rq = busiest_rq,
			.idle = CPU_IDLE,
			.flags = LBF_ACTIVE_LB,
		};

		schedstat_inc(sd->alb_count);
		update_rq_clock(busiest_rq);

		p = detach_one_task(&env);
		if (p) {
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
		} else {
			schedstat_inc(sd->alb_failed);
		}
	}
	rcu_read_unlock();
out_unlock:
	/* The pending flag is cleared on every path, success or not. */
	busiest_rq->active_balance = 0;
	rq_unlock(busiest_rq, &rf);

	/* Attach after busiest_rq's lock is dropped; attach_one_task() locks target_rq. */
	if (p)
		attach_one_task(target_rq, p);

	/* Pairs with the IRQ disabling done by rq_lock_irq() above. */
	local_irq_enable();

	return 0;
}
active_load_balance_cpu_stop的目的是从busiest cpu上将正在运行的task取下来转移到idle cpu上。
检查busiest cpu和target cpu是不是都处在活跃状态,只有都在活跃状态才能继续。
检查busiest cpu是不是当前cpu,并且busiest rq是否仍然设置了active_balance标志;两个条件必须同时满足,任一不满足就直接退出。
检查busiest rq可运行任务数量是不是大于1。
找到包含busiest cpu和target cpu的最低调度域。
设置lb_env,调用detach_one_task迁移线程。
无论成功与否都会将busiest_rq上的active_balance标志置零。
如果成功摘下了task,最后调用attach_one_task将取下的task加入到target rq上。
detach_one_task
/*
 * Detach the first migratable task found on env->src_rq, scanning its
 * cfs_tasks list in reverse order.
 *
 * The caller must hold the lock of env->src_rq (asserted via lockdep).
 * Returns the detached task, or NULL when no task passes
 * can_migrate_task().
 */
static struct task_struct *detach_one_task(struct lb_env *env)
{
	struct task_struct *p;

	lockdep_assert_rq_held(env->src_rq);

	list_for_each_entry_reverse(p,
			&env->src_rq->cfs_tasks, se.group_node) {
		/* Skip tasks that are not allowed to move under this env. */
		if (!can_migrate_task(p, env))
			continue;

		detach_task(p, env);

		/*
		 * Right now, this is only the second place where
		 * lb_gained[env->idle] is updated (other is detach_tasks)
		 * so we can safely collect stats here rather than
		 * inside detach_tasks().
		 */
		schedstat_inc(env->sd->lb_gained[env->idle]);
		/* Stop at the first task that could be detached. */
		return p;
	}
	return NULL;
}
detach_one_task会反向循环busiest rq的cfs_tasks,在使用detach_task之前调用can_migrate_task判断该task是否适合迁移。看起来如果在can_migrate_task中加入hook点,强行让某些task不迁移是有效的。
attach_one_task
/*
 * Attach the previously detached task @p to runqueue @rq.
 * Takes and releases @rq's lock itself and refreshes the rq clock
 * before calling attach_task().
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	attach_task(rq, p);
	rq_unlock(rq, &rf);
}
attach_one_task调用attach_task将取下的task加入target rq。
总结:
migration线程的执行流程比较繁琐。active balance执行函数比较简单,只是遍历src rq,取出一个task迁移到target rq。总的流程是,当sched_balance_rq需要active balance时会调用stop_one_cpu_nowait唤醒在busiest cpu上的migration线程,然后在该cpu上的rq中找到一个可以迁移的线程,迁移到target cpu中。这个target cpu是执行sched_balance_rq的那个cpu。这么看好像迁移线程也没有比sched_balance_rq激进到哪里去,仅仅是因为抢占了之前运行的task,所以将它也纳入到可迁移进程列表中而已,差不多是重复之前的流程,这样下来也未必能真正迁移出进程。