linux负载均衡（四）migration线程

在本系列第二篇中，我们讲到，如果在sched_balance_rq中没能成功迁移进程，进而需要active balance，那么会让migration线程去进行更激进的线程迁移。本篇来介绍下migration线程。

在linux系统中使用ps aux | grep migration，我们会看到一系列形如“migration/%u”的内核线程（%u为cpu编号）。它的定义在kernel/stop_machine.c中。

/*
 * Descriptor for the per-CPU stopper threads; this is the object that
 * cpu_stop_init() passes to smpboot_register_percpu_thread().
 * cpu_stop_should_run gates the thread body cpu_stopper_thread, and the
 * thread name is built from the "migration/%u" format (presumably filled
 * in with the CPU number -- confirm against the thread-creation helper).
 */
static struct smp_hotplug_thread cpu_stop_threads = {
        .store                  = &cpu_stopper.thread,
        .thread_should_run      = cpu_stop_should_run,
        .thread_fn              = cpu_stopper_thread,
        .thread_comm            = "migration/%u",
        .create                 = cpu_stop_create,
        .park                   = cpu_stop_park,
        .selfparking            = true,
};

smp_hotplug_thread是cpu hotplug相关的线程描述符，其中重要的成员有thread_should_run，用来判断是否要执行线程函数；thread_fn指向线程函数；thread_comm就是该线程的名字了，命名方式是migration/，并以当前的cpuid结尾。这是一个per cpu线程，每个cpu都会有一个。它会在cpu_stop_init中被初始化。

/*
 * Early-boot setup for the CPU stopper. Initialises the lock and the work
 * list of every possible CPU's cpu_stopper, registers the stopper thread
 * descriptor, calls stop_machine_unpark() for the CPU running this code,
 * and finally sets stop_machine_initialized.
 *
 * Always returns 0; a failed registration trips BUG_ON() instead of being
 * reported to the caller.
 */
static int __init cpu_stop_init(void)
{
    unsigned int cpu;

    /* Per-CPU state is prepared for all possible CPUs, not only online ones. */
    for_each_possible_cpu(cpu) {
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

        raw_spin_lock_init(&stopper->lock);
        INIT_LIST_HEAD(&stopper->works);
    }

    BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
    stop_machine_unpark(raw_smp_processor_id());
    stop_machine_initialized = true;
    return 0;
}
early_initcall(cpu_stop_init);

该函数为每个cpu的per cpu变量cpu_stopper初始化锁和works list，调用smpboot_register_percpu_thread注册cpu_stop_threads。early_initcall(cpu_stop_init)表明该函数会在内核初始化时被调用。

/*
 * Create a thread described by @plug_thread for each online CPU and
 * unpark it. If creation fails for any CPU, smpboot_destroy_threads() is
 * called and control jumps to the "out" label (elided in this excerpt).
 * When the loop completes, the descriptor is added to hotplug_threads.
 *
 * Returns ret, i.e. the non-zero result of __smpboot_create_thread() on
 * failure. (Declarations and locking are elided here.)
 */
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
...
    for_each_online_cpu(cpu) {
        ret = __smpboot_create_thread(plug_thread, cpu);
        if (ret) {
            /* Undo via smpboot_destroy_threads() before bailing out. */
            smpboot_destroy_threads(plug_thread);
            goto out;
        }
        smpboot_unpark_thread(plug_thread, cpu);
    }
    list_add(&plug_thread->list, &hotplug_threads);
...
    return ret;
}

可以看到它为每个online的cpu创建一个migration线程，并通过smpboot_unpark_thread将其unpark。

内核线程被创建后会贯穿整个系统的生命周期。它平时睡眠，有事做的时候会被唤醒。唤醒它的接口是stop_one_cpu和stop_one_cpu_nowait，区别仅在于调用者是否会等待提交的任务执行完成。

/*
 * Queue @fn(@arg) as stopper work for @cpu and return immediately,
 * without waiting for the work to run.
 *
 * @work_buf is caller-provided storage; it is overwritten with a fresh
 * cpu_stop_work carrying @fn, @arg and _RET_IP_ as the caller.
 * NOTE(review): presumably @work_buf must stay valid until the work has
 * executed, since only a pointer to it is queued -- confirm.
 *
 * Returns whatever cpu_stop_queue_work() returns.
 */
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                        struct cpu_stop_work *work_buf)
{
        *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
        return cpu_stop_queue_work(cpu, work_buf);
}

stop_one_cpu_nowait的入参解释。cpu表示唤醒哪个cpu上的migration线程，fn是要执行的函数，arg是函数参数，work_buf是任务结构暂存点。cpu_stop_queue_work将任务加入队列。

/*
 * Hand @work to the stopper of @cpu: __cpu_stop_queue_work() is called
 * with that CPU's cpu_stopper and a local wake queue, and the wake queue
 * is then flushed with wake_up_q().
 *
 * Returns 'enabled', which is declared and set in the part of the
 * function elided from this excerpt.
 */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
        DEFINE_WAKE_Q(wakeq);

...
                __cpu_stop_queue_work(stopper, work, &wakeq);
...
        /* Wake-ups collected on wakeq are issued only here, after queueing. */
        wake_up_q(&wakeq);

        return enabled;
}

__cpu_stop_queue_work会将work添加到stopper->works，将stopper->thread添加到wakeq。stopper->thread是在__smpboot_create_thread中赋值的，值指向migration线程的task struct结构体。wake_up_q会将migration线程唤醒。

在migration线程被唤醒后，执行的第一个函数是smpboot_thread_fn，它在创建线程的时候设置。

/*
 * Excerpt only: shows that the per-CPU thread is created with
 * smpboot_thread_fn as its entry point, td as its argument, and
 * ht->thread_comm as the name format. The rest of the function
 * (return type, declarations, error handling) is not shown here.
 */
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
        tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                                    ht->thread_comm);
}

smpboot_thread_fn会在一个while循环中执行任务。

/*
 * Main loop shared by smp_hotplug_thread threads. @data is the
 * smpboot_thread_data for this thread, from which the descriptor (ht)
 * and the CPU number (td->cpu) are taken.
 *
 * Each iteration asks ht->thread_should_run(); if it says no, the thread
 * calls schedule(), otherwise it runs ht->thread_fn() for this CPU.
 * (Stop/park handling is elided from this excerpt.)
 */
static int smpboot_thread_fn(void *data)
{
        struct smpboot_thread_data *td = data;
        struct smp_hotplug_thread *ht = td->ht;

        while (1) {
                /*
                 * The task state is set to TASK_INTERRUPTIBLE before the
                 * should-run check -- presumably so a wake-up arriving
                 * between the check and schedule() is not lost; confirm.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                preempt_disable();
...
                if (!ht->thread_should_run(td->cpu)) {
                        /* Nothing to do: give up the CPU. */
                        preempt_enable_no_resched();
                        schedule();
                } else {
                        /* Work pending: go back to TASK_RUNNING and run the thread body. */
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        ht->thread_fn(td->cpu);
                }
        }
}

migration线程会调用thread_should_run判断当前是否有任务要做，如果有就调用thread_fn，也就是cpu_stopper_thread。

/*
 * Thread body of the stopper for @cpu. Repeatedly takes the first work
 * item off stopper->works (under stopper->lock) and runs its callback,
 * until the list is found empty.
 */
static void cpu_stopper_thread(unsigned int cpu)
{
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
        struct cpu_stop_work *work;

repeat:
        work = NULL;
        /* Only the dequeue happens under the lock; the callback runs unlocked. */
        raw_spin_lock_irq(&stopper->lock);
        if (!list_empty(&stopper->works)) {
                work = list_first_entry(&stopper->works,
                                        struct cpu_stop_work, list);
                list_del_init(&work->list);
        }
        raw_spin_unlock_irq(&stopper->lock);

        if (work) {
                /* Copy the fields out of *work before invoking the callback. */
                cpu_stop_fn_t fn = work->fn;
                void *arg = work->arg;
                struct cpu_stop_done *done = work->done;
                int ret;

                /* cpu stop callbacks must not sleep, make in_atomic() == T */
                stopper->caller = work->caller;
                stopper->fn = fn;
                preempt_count_inc();
                ret = fn(arg);
                if (done) {
                        /* A zero return leaves done->ret untouched. */
                        if (ret)
                                done->ret = ret;
                        cpu_stop_signal_done(done);
                }
                preempt_count_dec();
                stopper->fn = NULL;
                stopper->caller = 0;
                /* The callback must leave the preempt count as it found it. */
                WARN_ONCE(preempt_count(),
                          "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
                /* Look for more queued work before returning. */
                goto repeat;
        }
}

cpu_stopper_thread从stopper->works中取出要做的任务，其中最重要的就是fn和arg：fn就是我们最终要执行的函数，arg是它的参数。执行完一个任务后会goto repeat继续取下一个，直到works队列为空才返回。

了解了migration线程的工作原理，下面我们来看load balance是怎么使用它的。

在解析sched_balance_rq的时候我们提到，如果需要做active balance会调用stop_one_cpu_nowait。

/*
 * Excerpt only: the part of sched_balance_rq() that hands off to the
 * stopper when need_active_balance() says so. Everything else in the
 * function is elided.
 */
static int sched_balance_rq(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
...
        if (need_active_balance(&env)) {
...
            /*
             * Only request an active balance if none is already flagged
             * on busiest; this CPU is recorded as the push target.
             */
            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }

            preempt_disable();
            raw_spin_rq_unlock_irqrestore(busiest, flags);
            /* The stop work is queued after busiest's rq lock is dropped. */
            if (active_balance) {
                stop_one_cpu_nowait(cpu_of(busiest),
                    active_load_balance_cpu_stop, busiest,
                    &busiest->active_balance_work);
            }
...

从上面的介绍可知，active_load_balance_cpu_stop是migration线程最终要执行的函数，busiest是函数参数。记住，busiest->push_cpu被赋值为当前的cpu，也就是迁移进程的目的cpu；而要被stop的cpu，也就是被唤醒的migration线程所在的cpu，是busiest所在的cpu。

/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data is the busiest runqueue; the destination CPU is read from its
 * push_cpu field. At most one task is moved per invocation (a single
 * detach_one_task() call). Always returns 0.
 */
static int active_load_balance_cpu_stop(void *data)
{
    struct rq *busiest_rq = data;
    int busiest_cpu = cpu_of(busiest_rq);
    int target_cpu = busiest_rq->push_cpu;
    struct rq *target_rq = cpu_rq(target_cpu);
    struct sched_domain *sd;
    struct task_struct *p = NULL;
    struct rq_flags rf;

    rq_lock_irq(busiest_rq, &rf);
    /*
     * Between queueing the stop-work and running it is a hole in which
     * CPUs can become inactive. We should not move tasks from or to
     * inactive CPUs.
     */
    if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
        goto out_unlock;

    /* Make sure the requested CPU hasn't gone down in the meantime: */
    if (unlikely(busiest_cpu != smp_processor_id() ||
             !busiest_rq->active_balance))
        goto out_unlock;

    /* Is there any task to move? */
    if (busiest_rq->nr_running <= 1)
        goto out_unlock;

    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-CPU setup.
     */
    WARN_ON_ONCE(busiest_rq == target_rq);

    /* Search for an sd spanning us and the target CPU. */
    rcu_read_lock();
    for_each_domain(target_cpu, sd) {
        if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
            break;
    }

    if (likely(sd)) {
        /* Describe a one-shot migration from busiest_rq to target_rq. */
        struct lb_env env = {
            .sd        = sd,
            .dst_cpu    = target_cpu,
            .dst_rq        = target_rq,
            .src_cpu    = busiest_rq->cpu,
            .src_rq        = busiest_rq,
            .idle        = CPU_IDLE,
            .flags        = LBF_ACTIVE_LB,
        };

        schedstat_inc(sd->alb_count);
        update_rq_clock(busiest_rq);

        p = detach_one_task(&env);
        if (p) {
            schedstat_inc(sd->alb_pushed);
            /* Active balancing done, reset the failure counter. */
            sd->nr_balance_failed = 0;
        } else {
            schedstat_inc(sd->alb_failed);
        }
    }
    rcu_read_unlock();
out_unlock:
    /* The flag is cleared on every path, whether or not a task was moved. */
    busiest_rq->active_balance = 0;
    rq_unlock(busiest_rq, &rf);

    /*
     * The detached task is attached only after busiest_rq's lock has been
     * dropped; attach_one_task() takes the target rq's lock itself.
     */
    if (p)
        attach_one_task(target_rq, p);

    /*
     * Pairs with rq_lock_irq() above: the lock was released with plain
     * rq_unlock(), so interrupts are presumably still disabled until here.
     */
    local_irq_enable();

    return 0;
}

active_load_balance_cpu_stop的目的是从busiest cpu上将正在运行的task取下来转移到idle cpu上。

检查busiest cpu和target cpu是不是都处在活跃状态，只有都在活跃状态才能继续。

检查busiest cpu是不是当前cpu，以及busiest rq是不是设置了active_balance；只要有一个条件不满足就直接退出。

检查busiest rq可运行任务数量是不是大于1。

找到包含busiest cpu和target cpu的最低调度域。

设置lb_env，调用detach_one_task迁移线程。

无论成功与否都会将busiest_rq上的active_balance标志置零。

如果成功，最后调用attach_one_task将取下的task加入到target rq上。

detach_one_task

/*
 * Detach a single task from env->src_rq.
 *
 * Walks src_rq->cfs_tasks in reverse order and detaches the first task
 * that can_migrate_task() accepts. The caller must hold the src_rq lock
 * (asserted via lockdep).
 *
 * Returns the detached task, or NULL if no task on the list qualified.
 */
static struct task_struct *detach_one_task(struct lb_env *env)
{
    struct task_struct *p;

    lockdep_assert_rq_held(env->src_rq);

    list_for_each_entry_reverse(p,
            &env->src_rq->cfs_tasks, se.group_node) {
        /* Skip tasks that can_migrate_task() rejects for this env. */
        if (!can_migrate_task(p, env))
            continue;

        detach_task(p, env);

        /*
         * Right now, this is only the second place where
         * lb_gained[env->idle] is updated (other is detach_tasks)
         * so we can safely collect stats here rather than
         * inside detach_tasks().
         */
        schedstat_inc(env->sd->lb_gained[env->idle]);
        return p;
    }
    return NULL;
}

detach_one_task会反向循环busiest rq的cfs_tasks，在使用detach_task之前调用can_migrate_task判断该task是否适合迁移。看起来如果在can_migrate_task中加入hook点，强行让某些task不迁移是有效的。

attach_one_task

/*
 * Attach task @p to runqueue @rq. Takes @rq's lock for the duration,
 * updates the rq clock, and then calls attach_task().
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
    struct rq_flags rf;

    rq_lock(rq, &rf);
    update_rq_clock(rq);
    attach_task(rq, p);
    rq_unlock(rq, &rf);
}

attach_one_task调用attach_task将取下的task加入target rq。

总结：

migration线程的执行流程比较繁琐。active balance执行函数比较简单，只是遍历src rq，取出一个task迁移到target rq。总的流程是，当sched_balance_rq需要active balance时会调用stop_one_cpu_nowait唤醒在busiest cpu上的migration线程，然后在该cpu上的rq中找到一个可以迁移的线程，迁移到target cpu中。这个target cpu是执行sched_balance_rq的那个cpu。这么看好像迁移线程也没有比sched_balance_rq激进到哪里去，仅仅是因为抢占了之前运行的task，所以将它也纳入到可迁移进程列表中而已，差不多是重复之前的流程，这样下来也未必能真正迁移出进程。

posted on 2024-12-19 11:58 半山随笔阅读(17) 评论(0) 编辑收藏举报

刷新页面返回顶部

linux负载均衡（四）migration线程

导航

公告