linux kernel负载均衡分析（一）

linux的负载均衡是一个很负载的过程，本篇讲一下触发负载均衡的流程，下一篇具体讲load balance流程。

负载均衡是kernel调度一个重要的方面，下面是三篇博客，讲得很好。

CFS任务的负载均衡（概述） (wowotech.net)

CFS任务的负载均衡（任务放置） (wowotech.net)

CFS任务的负载均衡（load balance） (wowotech.net)

在6.10以后的内核中，原来的rebalance_domains改名为sched_balance_domains。load_balance改名为sched_balance_rq。

sched_balance_rq是一个负载均衡的必经之路，从它的调用路径可以看到负载均衡的几种触发方式。

上面这张图是sched_balance_rq的调用链。可以看到直接调用它的有两个函数，sched_balance_newidle和sched_balance_domains，分别来看。

sched_balance_newidle的情况

调用它的是pick_next_task_fair，这个函数是在cfs调度器中选择下一个task时调用的，而它嗲用sched_balance——newidle的前提时当前没有可供调用的task，在进入idle状态之前，调用sched_balance_newidle尝试从其他busy的cpu上拉取task。总结一下这种情形，在cpu进入idle之前先看看其他cpu有没有多余的task。

static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
    unsigned long next_balance = jiffies + HZ;
    int this_cpu = this_rq->cpu;
    int continue_balancing = 1;
    u64 t0, t1, curr_cost = 0;
    struct sched_domain *sd;
    int pulled_task = 0;

    update_misfit_status(NULL, this_rq);

    /*
     * There is a task waiting to run. No need to search for one.
     * Return 0; the task will be enqueued when switching to idle.
     */
    if (this_rq->ttwu_pending)          //如果这个rq有被选中作为ttwu的cpu rq，返回
        return 0;

    /*
     * We must set idle_stamp _before_ calling sched_balance_rq()
     * for CPU_NEWLY_IDLE, such that we measure the this duration
     * as idle time.
     */
    this_rq->idle_stamp = rq_clock(this_rq);

    /*
     * Do not pull tasks towards !active CPUs...
     */
    if (!cpu_active(this_cpu))
        return 0;

    /*
     * This is OK, because current is on_cpu, which avoids it being picked
     * for load-balance and preemption/IRQs are still disabled avoiding
     * further scheduler activity on it and we're being very careful to
     * re-start the picking loop.
     */
    rq_unpin_lock(this_rq, rf);

    rcu_read_lock();
    sd = rcu_dereference_check_sched_domain(this_rq->sd);

    if (!get_rd_overloaded(this_rq->rd) ||                 //root domain没有overload的cpu
        (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {   //rq的avg_idle小于max_newidle_lb_cost

        if (sd)
            update_next_balance(sd, &next_balance);
        rcu_read_unlock();

        goto out;
    }
    rcu_read_unlock();

    raw_spin_rq_unlock(this_rq);

    t0 = sched_clock_cpu(this_cpu);           //记录当前的时间用于测量负载均衡的时间作为domain cost
    sched_balance_update_blocked_averages(this_cpu);

    rcu_read_lock();
    for_each_domain(this_cpu, sd) {          开始沿着当前cpu最低domain向上进行负载均衡
        u64 domain_cost;

        update_next_balance(sd, &next_balance);

        if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)     //当前rq的avg_idle没有当前已经产生的cost+max_newidle_lb_cost大
            break;                                                       //说明代价过大，break

        if (sd->flags & SD_BALANCE_NEWIDLE) {                           //当前的domain允许newidle balance

            pulled_task = sched_balance_rq(this_cpu, this_rq,           //负载均衡核心函数
                           sd, CPU_NEWLY_IDLE,
                           &continue_balancing);

            t1 = sched_clock_cpu(this_cpu);                             //这次负载均衡结束，看一下时间
            domain_cost = t1 - t0;                                      //domain cost就是这次负载均衡消耗的时间
            update_newidle_cost(sd, domain_cost);

            curr_cost += domain_cost;                                   // curr cost记录本次newidle balance总的耗时
            t0 = t1;
        }

        /*
         * Stop searching for tasks to pull if there are
         * now runnable tasks on this rq.
         */
        if (pulled_task || !continue_balancing)                      //这次拉取到了task，或者不能再继续做balance，break
            break;
    }
    rcu_read_unlock();

    raw_spin_rq_lock(this_rq);

    if (curr_cost > this_rq->max_idle_balance_cost)
        this_rq->max_idle_balance_cost = curr_cost;                          //如果这次newidle balance耗时超过max，更新max cost

    /*
     * While browsing the domains, we released the rq lock, a task could
     * have been enqueued in the meantime. Since we're not going idle,
     * pretend we pulled a task.
     */
    if (this_rq->cfs.h_nr_running && !pulled_task)          //在我们做balance的时候，有人塞给我们一个task，假装已经拉取到了一个task
        pulled_task = 1;

    /* Is there a task of a high priority class? */
    if (this_rq->nr_running != this_rq->cfs.h_nr_running)
        pulled_task = -1;

out:
    /* Move the next balance forward */
    if (time_after(this_rq->next_balance, next_balance))
        this_rq->next_balance = next_balance;

    if (pulled_task)
        this_rq->idle_stamp = 0;                     //拉取到task设置idle时间戳为0， why？
    else
        nohz_newidle_balance(this_rq);

    rq_repin_lock(this_rq, rf);

    return pulled_task;
}

如果拉取到任务，pick_next_task_fair会重新pick task，将拉取到的task调度上去。

sched_balance_domains的情况

调用它的函数有两个：sched_balance_softirq和_nohz_idle_balance。但从上面那张图很难了解到这两个函数的调用栈。我们再看一下下面这张图。

这张图有点复杂。我们看到，调用sched_balance_domains和_nohz_idle_balance的都是sched_balance_softirq。

static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();
    enum cpu_idle_type idle = this_rq->idle_balance;
    /*
     * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
     * balancing on behalf of the other idle CPUs whose ticks are
     * stopped. Do nohz_idle_balance *before* sched_balance_domains to
     * give the idle CPUs a chance to load balance. Else we may
     * load balance only within the local sched_domain hierarchy
     * and abort nohz_idle_balance altogether if we pull some load.
     */
    if (nohz_idle_balance(this_rq, idle))
        return;

    /* normal load balance */
    sched_balance_update_blocked_averages(this_rq->cpu);
    sched_balance_domains(this_rq, idle);
}

这是sched软中断的处理函数。它会先尝试nohz_idle_balance，如果成功直接返回，反之会再调用sched_balance_domains。在linux中，如果开启了nohz，cpu是可以在空闲的时候关闭时钟中断，也叫tickless mode。处于nohz idle状态的cpu是不会有时钟中断的，当busy cpu需要空闲cpu的帮助时，可以使用nohz_idle_balance，发送ipi中断给idle cpu，让idle cpu将本地的task拉取过去。如果成功拉取，自然不需要本地再进行负载均衡。sched_balance_domains是正常的负载均衡。

再向上看的话可以发现开启sched软中断的途径有两种，都是再update_process_times内，也就是在时钟中断处理函数内，也就是说sched软中断是周期性调用的。

sched_balance_trigger负责开启软中断。有两种途径。

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void sched_balance_trigger(struct rq *rq)
{
    /*
     * Don't need to rebalance while attached to NULL domain or
     * runqueue CPU is not active
     */
    if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
        return;

    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);

    nohz_balancer_kick(rq);
}

该函数的入参是rq，这种周期性的load balance是针对当前cpu的。两种情况触发，

第一是当前时刻已经到了下一次该做负载均衡的时间，判断的方法是对比当前的jiffies和rq->next_balance的值；

第二种是调用nohz_balance_kick，它会选则一个idle cpu来求援，让其帮忙拉取本地任务过去。

nohz的情形比较复杂，重点看一下。

static void nohz_balancer_kick(struct rq *rq)
{
    unsigned long now = jiffies;
    struct sched_domain_shared *sds;
    struct sched_domain *sd;
    int nr_busy, i, cpu = rq->cpu;
    unsigned int flags = 0;

    if (unlikely(rq->idle_balance))
        return;

    /*
     * We may be recently in ticked or tickless idle mode. At the first
     * busy tick after returning from idle, we will update the busy stats.
     */
    nohz_balance_exit_idle(rq);

    /*
     * None are in tickless mode and hence no need for NOHZ idle load
     * balancing:
     */
    if (likely(!atomic_read(&nohz.nr_cpus)))     //nohz是一个全局变量，表示处于nohz状态的cpu，数量，cpumask等。这一行判断当前是不是没有处于
        return;                                  //nohz状态的cpu

    if (READ_ONCE(nohz.has_blocked) &&
        time_after(now, READ_ONCE(nohz.next_blocked)))        //到了next blocked时间，这啥意思？
        flags = NOHZ_STATS_KICK;

    if (time_before(now, nohz.next_balance))         //还没到balance的时间
        goto out;

    if (rq->nr_running >= 2) {                       //当前的rq任务数量大于1个
        flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
        goto out;
    }

    rcu_read_lock();

    sd = rcu_dereference(rq->sd);
    if (sd) {
        /*
         * If there's a runnable CFS task and the current CPU has reduced
         * capacity, kick the ILB to see if there's a better CPU to run on:
         */
        if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {          //当前cpu太弱，请求更强的cpu帮忙
            flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
            goto unlock;
        }
    }

    sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
    if (sd) {
        /*
         * When ASYM_PACKING; see if there's a more preferred CPU
         * currently idle; in which case, kick the ILB to move tasks
         * around.
         *
         * When balancing between cores, all the SMT siblings of the
         * preferred CPU must be idle.
         */
        for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
            if (sched_asym(sd, i, cpu)) {
                flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
                goto unlock;
            }
        }
    }

    sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
    if (sd) {
        /*
         * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
         * to run the misfit task on.
         */
        if (check_misfit_status(rq)) {
            flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
            goto unlock;
        }

        /*
         * For asymmetric systems, we do not want to nicely balance
         * cache use, instead we want to embrace asymmetry and only
         * ensure tasks have enough CPU capacity.
         *
         * Skip the LLC logic because it's not relevant in that case.
         */
        goto unlock;
    }

    sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
    if (sds) {
        /*
         * If there is an imbalance between LLC domains (IOW we could
         * increase the overall cache utilization), we need a less-loaded LLC
         * domain to pull some load from. Likewise, we may need to spread
         * load within the current LLC domain (e.g. packed SMT cores but
         * other CPUs are idle). We can't really know from here how busy
         * the others are - so just get a NOHZ balance going if it looks
         * like this LLC domain has tasks we could move.
         */
        nr_busy = atomic_read(&sds->nr_busy_cpus);                   //llc domain的busy cpu数量大于1
        if (nr_busy > 1) {
            flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
            goto unlock;
        }
    }
unlock:
    rcu_read_unlock();
out:
    if (READ_ONCE(nohz.needs_update))
        flags |= NOHZ_NEXT_KICK;

    if (flags)
        kick_ilb(flags);                         //去叫一个cpu过来帮忙
}

这个函数决定在什么情况下去喊idle core帮忙：到了next blocked时间 || 当前rq大于1个task || 当前cpu太弱 || 当前cpu所在的llc domain的busy cpu大于1 || nohz.needs_update不为0。如果当前还没到balance的时间且nohz.needs_update为0则不会触发。

static void kick_ilb(unsigned int flags)
{
    int ilb_cpu;

    /*
     * Increase nohz.next_balance only when if full ilb is triggered but
     * not if we only update stats.
     */
    if (flags & NOHZ_BALANCE_KICK)
        nohz.next_balance = jiffies+1;

    ilb_cpu = find_new_ilb();           //找一个idle的cpu。
    if (ilb_cpu < 0)
        return;

    /*
     * Don't bother if no new NOHZ balance work items for ilb_cpu,
     * i.e. all bits in flags are already set in ilb_cpu.
     */
    if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)   //如果传入的flag跟ilb cpu rq的flag相比没有新增则返回，rq的nohz_flag是啥意思？
        return;

    /*
     * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
     * the first flag owns it; cleared by nohz_csd_func().
     */
    flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
    if (flags & NOHZ_KICK_MASK)
        return;

    /*
     * This way we generate an IPI on the target CPU which
     * is idle, and the softirq performing NOHZ idle load balancing
     * will be run before returning from the IPI.
     */
    smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);    //向我们选中的cpu发ipi中断
}

关键是要找一个合适的idle cpu。

static inline int find_new_ilb(void)
{
    const struct cpumask *hk_mask;
    int ilb_cpu;

    hk_mask = housekeeping_cpumask(HK_TYPE_MISC);

    for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {

        if (ilb_cpu == smp_processor_id())
            continue;

        if (idle_cpu(ilb_cpu))
            return ilb_cpu;
    }

    return -1;
}

找idle cpu的范围是nohz的idle_cpus_mask成员。还要满足housekeeping的要求。在发送完ipi中断后，ipi中断处理函数会调用nohz_csd_func处理nohz相关的事务。

static void nohz_csd_func(void *info)
{
        struct rq *rq = info;
        int cpu = cpu_of(rq);
        unsigned int flags;

        /*
         * Release the rq::nohz_csd.
         */
        flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
        WARN_ON(!(flags & NOHZ_KICK_MASK));

        rq->idle_balance = idle_cpu(cpu);
        if (rq->idle_balance && !need_resched()) {
                rq->nohz_idle_balance = flags;
                raise_softirq_irqoff(SCHED_SOFTIRQ);
        }
}

检查完标志位后，给rq的idle_balance赋值，用于表示当前cpu是否位idle cpu，最后触发sched softirq。记住当前的cpu已经不是发送ipi中断的那个cpu了。

这样nohz idle balance trigger sched 软中断的情况就分析完了。我们来看一下具体处理nohz idle balance的函数。

static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
    unsigned int flags = this_rq->nohz_idle_balance;

    if (!flags)
        return false;

    this_rq->nohz_idle_balance = 0;

    if (idle != CPU_IDLE)
        return false;

    _nohz_idle_balance(this_rq, flags);

    return true;
}

一些必要的检查后调用_nohz_idle_balance。

static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
    /* Earliest time when we have to do rebalance again */
    unsigned long now = jiffies;
    unsigned long next_balance = now + 60*HZ;
    bool has_blocked_load = false;
    int update_next_balance = 0;
    int this_cpu = this_rq->cpu;
    int balance_cpu;
    struct rq *rq;

    SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

    /*
     * We assume there will be no idle load after this update and clear
     * the has_blocked flag. If a cpu enters idle in the mean time, it will
     * set the has_blocked flag and trigger another update of idle load.
     * Because a cpu that becomes idle, is added to idle_cpus_mask before
     * setting the flag, we are sure to not clear the state and not
     * check the load of an idle cpu.
     *
     * Same applies to idle_cpus_mask vs needs_update.
     */
    if (flags & NOHZ_STATS_KICK)
        WRITE_ONCE(nohz.has_blocked, 0);
    if (flags & NOHZ_NEXT_KICK)
        WRITE_ONCE(nohz.needs_update, 0);

    /*
     * Ensures that if we miss the CPU, we must see the has_blocked
     * store from nohz_balance_enter_idle().
     */
    smp_mb();

    /*
     * Start with the next CPU after this_cpu so we will end with this_cpu and let a
     * chance for other idle cpu to pull load.
     */
    for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {                 //_nohz_idle_balance不是只在被kick的cpu上做
        if (!idle_cpu(balance_cpu))                                                    //负载均衡，而是在所有idle core上，也不是拉取
            continue;                                                                  //kicker的任务，而是在每个idle core做负载均衡

        /*
         * If this CPU gets work to do, stop the load balancing
         * work being done for other CPUs. Next load
         * balancing owner will pick it up.
         */
        if (need_resched()) {                                            //如果当前的cpu需要调度，放弃本次load balance，设置blocked_load
            if (flags & NOHZ_STATS_KICK)                                 //设置nohz的needs update
                has_blocked_load = true;
            if (flags & NOHZ_NEXT_KICK)
                WRITE_ONCE(nohz.needs_update, 1);
            goto abort;
        }

        rq = cpu_rq(balance_cpu);

        if (flags & NOHZ_STATS_KICK)
            has_blocked_load |= update_nohz_stats(rq);

        /*
         * If time for next balance is due,
         * do the balance.
         */
        if (time_after_eq(jiffies, rq->next_balance)) {
            struct rq_flags rf;

            rq_lock_irqsave(rq, &rf);
            update_rq_clock(rq);
            rq_unlock_irqrestore(rq, &rf);

            if (flags & NOHZ_BALANCE_KICK)
                sched_balance_domains(rq, CPU_IDLE);                   //标志设置了nohz_balance_kick，进行负载均衡
        }

        if (time_after(next_balance, rq->next_balance)) {
            next_balance = rq->next_balance;
            update_next_balance = 1;
        }
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the CPU is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        nohz.next_balance = next_balance;

    if (flags & NOHZ_STATS_KICK)
        WRITE_ONCE(nohz.next_blocked,
               now + msecs_to_jiffies(LOAD_AVG_PERIOD));

abort:
    /* There is still blocked load, enable periodic update */
    if (has_blocked_load)
        WRITE_ONCE(nohz.has_blocked, 1);                     //设置has blocked，表示本次idle balance没完成
}

nohz并非在kickee cpu上做负载均衡，而是在所有idle cpu上尝试做负载均衡。不想newidle balance，它只在自己的cpu，沿着domain做balance。

nohz的情形分析完了。

posted on 2024-12-18 20:36 半山随笔阅读(7) 评论(0) 编辑收藏举报

刷新页面返回顶部

linux kernel负载均衡分析（一）

导航

公告