Process Scheduling in Linux (Part 6)
Starting here, we analyze the policies related to load balancing.
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;

    sched_clock_tick();

    spin_lock(&rq->lock);
    update_rq_clock(rq);
    update_cpu_load(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
    rq->idle_at_tick = idle_cpu(cpu); // is this cpu's runqueue empty (only the idle task on it)?
    trigger_load_balance(rq, cpu);
#endif
}

As you can see, the tail of every timer-tick handler checks whether a load balance is due. Let's step into trigger_load_balance; the name alone tells you roughly what it does.
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */
    if (rq->in_nohz_recently && !rq->idle_at_tick) {
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {
            /*
             * simple selection for now: Nominate the
             * first cpu in the nohz list to be the next
             * ilb owner.
             *
             * TBD: Traverse the sched domains and nominate
             * the nearest cpu in the nohz.cpu_mask.
             */
            int ilb = cpumask_first(nohz.cpu_mask);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
        cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
        resched_cpu(cpu);
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
        cpumask_test_cpu(cpu, nohz.cpu_mask))
        return;
#endif
    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);
}

Ignoring the CONFIG_NO_HZ parts, this function simply checks whether jiffies has caught up with rq->next_balance; if it has, it calls raise_softirq to raise a softirq. Raising one is cheap: the bit for SCHED_SOFTIRQ is set, and softirq processing later tests that bit and, if set, invokes the registered handler. A small sketch of this deadline pattern follows.
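The jiffies comparison above is a standard kernel idiom worth pausing on. Below is a minimal userspace-flavoured sketch of the same pattern, assuming illustrative names throughout (my_jiffies, next_balance, do_balance and the 10-tick interval are not kernel symbols):

#include <stdbool.h>

static unsigned long my_jiffies;   /* incremented once per tick */
static unsigned long next_balance; /* deadline, in ticks */

/* true if a is at or after b; the signed subtraction keeps it wrap-safe */
#define my_time_after_eq(a, b) ((long)((a) - (b)) >= 0)

static void do_balance(void) { /* the kernel would raise SCHED_SOFTIRQ here */ }

static void on_tick(void)
{
    my_jiffies++;
    if (my_time_after_eq(my_jiffies, next_balance)) {
        do_balance();
        next_balance = my_jiffies + 10; /* e.g. rebalance at most every 10 ticks */
    }
}

The subtract-then-cast comparison is what makes the kernel's time_after_eq() correct even when the counter wraps around. Searching the source with cscope for SCHED_SOFTIRQ turns up where its handler is registered: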
#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

So this softirq really is there just for SMP configurations. Following this lead, let's look at the implementation of run_rebalance_domains.
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ?
                        CPU_IDLE : CPU_NOT_IDLE;

    rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */
    if (this_rq->idle_at_tick &&
        atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */
            if (need_resched())
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}

Ignoring CONFIG_NO_HZ again, this function just derives the idle parameter from the current cpu's state (running the idle task or something else) and then calls rebalance_domains.
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;
    cpumask_var_t tmp;

    /* Fails alloc? Rebalancing probably not a priority right now. */
    if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
        return;

    for_each_domain(cpu, sd) { // for the domain this cpu belongs to, and each of its parent domains
        if (!(sd->flags & SD_LOAD_BALANCE)) // skip domains that explicitly opted out of load balancing
            continue;

        interval = sd->balance_interval; // this domain's balancing period
        if (idle != CPU_IDLE)
            interval *= sd->busy_factor; // stretch the period while the cpu is busy

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval); // convert milliseconds to jiffies
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10) // clamp the interval
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) { // a balance really is due now
            if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE; // the comment above says it all
            }
            sd->last_balance = jiffies; // record when we last balanced
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        if (time_after(next_balance, sd->last_balance + interval)) { // compute when the next balance is due
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;

    free_cpumask_var(tmp);
}

The heart of all this is load_balance.
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance, struct cpumask *cpus)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;

    cpumask_setall(cpus); // start with every cpu as a candidate

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) // on plain SMP, SD_SHARE_CPUPOWER never appears
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]); // update statistics

redo:
    update_shares(sd); // refresh the shares of every task group taking part in this domain
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance); // find the busiest sched group in this domain

    if (*balance == 0)
        goto out_balanced;

    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}

Of these, update_shares is worth a look first.
static void update_shares(struct sched_domain *sd)
{
    u64 now = cpu_clock(raw_smp_processor_id());
    s64 elapsed = now - sd->last_update;

    // confirm once more that the shares of every group in this domain are actually due for an update
    if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
        sd->last_update = now;
        walk_tg_tree(tg_nop, tg_shares_up, sd);
    }
}

If the per-group shares in this domain really do need refreshing, walk_tg_tree is called to do it. tg_nop and tg_shares_up are two function pointers: here tg_nop does nothing, while tg_shares_up performs the real update. (The elapsed check above is a plain rate limiter; a sketch of the idiom follows.)
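For reference, the elapsed-time gate is the usual "no more often than every N nanoseconds" idiom. A userspace sketch, with illustrative names only (should_update and min_interval_ns are not kernel symbols):

#include <stdint.h>
#include <stdbool.h>

static uint64_t last_update; /* timestamp (ns) of the last expensive update */

/* return true at most once per min_interval_ns, like the check in update_shares */
static bool should_update(uint64_t now_ns, uint64_t min_interval_ns)
{
    if ((int64_t)(now_ns - last_update) < (int64_t)min_interval_ns)
        return false;
    last_update = now_ns;
    return true;
}

walk_tg_tree itself is next: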
/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
    struct task_group *parent, *child;
    int ret;

    rcu_read_lock();
    parent = &root_task_group;
down:
    ret = (*down)(parent, data);
    if (ret)
        goto out_unlock;
    list_for_each_entry_rcu(child, &parent->children, siblings) {
        parent = child;
        goto down;

up:
        continue;
    }
    ret = (*up)(parent, data);
    if (ret)
        goto out_unlock;

    child = parent;
    parent = parent->parent;
    if (parent)
        goto up;
out_unlock:
    rcu_read_unlock();

    return ret;
}

The control flow here is hard to read; it helps to trace an example on paper. In effect the walk proceeds bottom-up and left to right, updating each group's shares in turn; the concrete update lives in tg_shares_up. If tracing the gotos by hand sounds tedious, the mock-up below reproduces the visiting order.
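Here is a self-contained userspace mock of walk_tg_tree, assuming a tiny tree (root with children A and B; B has child C). Only the traversal logic mirrors the kernel; every name in it is illustrative:

#include <stdio.h>

struct node {
    const char *name;
    struct node *parent;
    struct node *children[2];
    int nr_children;
};

static void walk(struct node *root,
                 void (*down)(struct node *), void (*up)(struct node *))
{
    struct node *n = root;
    int idx[16]; /* per-depth child cursor; stands in for the kernel's gotos */
    int depth = 0;

    idx[0] = 0;
    down(n);
    for (;;) {
        if (idx[depth] < n->nr_children) { /* first visit of the next child */
            n = n->children[idx[depth]++];
            depth++;
            idx[depth] = 0;
            down(n);
        } else {                           /* leaving n for the final time */
            up(n);
            if (!n->parent)
                break;
            n = n->parent;
            depth--;
        }
    }
}

static void pr_down(struct node *n) { printf("down(%s)\n", n->name); }
static void pr_up(struct node *n)   { printf("up(%s)\n", n->name); }

int main(void)
{
    struct node c = { "C", NULL, { NULL, NULL }, 0 };
    struct node a = { "A", NULL, { NULL, NULL }, 0 };
    struct node b = { "B", NULL, { &c, NULL }, 1 };
    struct node root = { "root", NULL, { &a, &b }, 2 };

    a.parent = &root; b.parent = &root; c.parent = &b;
    walk(&root, pr_down, pr_up);
    return 0;
}

It prints down(root) down(A) up(A) down(B) down(C) up(C) up(B) up(root): @up fires only when a node is left for the final time, which is exactly the bottom-up order the shares computation needs. Now for tg_shares_up, where the real update happens: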
The kernel's own comment restates the bottom-up order just described:

/*
 * Re-compute the task group their per cpu shares over the given domain.
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
    unsigned long weight, rq_weight = 0;
    unsigned long shares = 0;
    struct sched_domain *sd = data;
    int i;

    for_each_cpu(i, sched_domain_span(sd)) { // for every cpu in this domain
        /*
         * If there are currently no tasks on the cpu pretend there
         * is one of average load so that when a new task gets to
         * run here it will not get delayed by group starvation.
         */
        weight = tg->cfs_rq[i]->load.weight; // this group's runqueue load on cpu i
        if (!weight) // no load on this cpu: pretend there is some, see the comment above and the next statement
            weight = NICE_0_LOAD;

        tg->cfs_rq[i]->rq_weight = weight; // note this is the cfs_rq's rq_weight
        rq_weight += weight; // total rq_weight over the domain
        shares += tg->cfs_rq[i]->shares; // sum this group's per-cpu shares over the domain's cpus
    }

    // a couple of corrections
    if ((!shares && rq_weight) || shares > tg->shares)
        shares = tg->shares;

    if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) // for plain SMP this condition holds,
        shares = tg->shares; // so shares simply becomes the group's own shares value

    for_each_cpu(i, sched_domain_span(sd)) // with the sums collected, loop again, this time to update
        update_group_shares_cpu(tg, i, shares, rq_weight);

    return 0;
}

Next, update_group_shares_cpu.
/*
 * Calculate and set the cpu's group shares.
 */
// Note the parameters: tg is the group reached during the tree walk, cpu is cpu i
// of the domain, sd_shares is the group's shares value, and sd_rq_weight is the
// sum of the group's runqueue weights over the domain's cpus.
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
            unsigned long sd_shares, unsigned long sd_rq_weight)
{
    unsigned long shares;
    unsigned long rq_weight;

    if (!tg->se[cpu])
        return;

    rq_weight = tg->cfs_rq[cpu]->rq_weight;

    /*
     *           \Sum shares * rq_weight
     * shares =  -----------------------
     *               \Sum rq_weight
     */
    shares = (sd_shares * rq_weight) / sd_rq_weight; // exactly what the comment above says
    shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); // clamp the result

    if (abs(shares - tg->se[cpu]->load.weight) >
            sysctl_sched_shares_thresh) { // to avoid churn, only update when the change exceeds a tunable threshold
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        spin_lock_irqsave(&rq->lock, flags);
        tg->cfs_rq[cpu]->shares = shares;

        __set_se_shares(tg->se[cpu], shares); // the shares value ultimately lands in the "se" (sched entity)
        spin_unlock_irqrestore(&rq->lock, flags);
    }
}

Note how the split works: the cpus in the group divide the group's shares value among themselves proportionally. The cpu whose runqueue carries a larger fraction of the group's total load receives a larger slice of the shares, hence a higher weight and more bandwidth when it runs; it is worth revisiting the pick_next_task discussion from an earlier part with this in mind. A worked example of the split follows.
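To make the proportional split concrete, here is a worked example with made-up numbers (a group with tg->shares = 1024 spanning two cpus whose rq_weight values are 3072 and 1024; nothing here is kernel code):

#include <stdio.h>

int main(void)
{
    unsigned long sd_shares = 1024;            /* tg->shares */
    unsigned long rq_weight[2] = { 3072, 1024 };
    unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1]; /* 4096 */

    for (int i = 0; i < 2; i++) {
        /* shares_i = sd_shares * rq_weight_i / sum(rq_weight) */
        unsigned long s = sd_shares * rq_weight[i] / sd_rq_weight;
        printf("cpu%d gets %lu of %lu shares\n", i, s, sd_shares);
    }
    return 0;
}

cpu0 receives 768 of the 1024 shares and cpu1 the remaining 256: carrying three quarters of the group's load earns three quarters of its weight. __set_se_shares, where the value finally lands, reads: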
static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
    struct cfs_rq *cfs_rq = se->cfs_rq;
    int on_rq;

    on_rq = se->on_rq;
    if (on_rq)
        dequeue_entity(cfs_rq, se, 0);

    se->load.weight = shares;
    se->load.inv_weight = 0;

    if (on_rq)
        enqueue_entity(cfs_rq, se, 0);
}

Easy to follow: take the entity off the runqueue, update its weight, then put it back on. That completes update_shares. Note when it runs: load balancing has already been deemed due, but nothing has yet been decided about how to balance. Refreshing the per-group load picture of the domain first makes the upcoming choice of group and tasks more accurate. Back to load_balance, continuing downward.
redo:
    update_shares(sd); // refresh the shares of every task group taking part in this domain
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance); // find the busiest sched group in this domain

    if (*balance == 0)
        goto out_balanced;

    if (!group) { // nobody is busy enough, so no balancing is needed
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus); // find the busiest runqueue, i.e. the busiest cpu, in that group
    if (!busiest) { // no cpu qualifies, so again no balancing
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance); // update statistics

    ld_moved = 0; // flag: did we move any tasks?
    if (busiest->nr_running > 1) { // only consider moving tasks when more than one is runnable there; as the comment says, moving the only task away would leave zero, which is just as unbalanced, so don't bother
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest, // guided by imbalance, pick tasks off the busiest queue and move them to this_rq
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

First, the find_busiest_group function; it is a long one.
/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
    unsigned long max_load, avg_load, total_load, this_load, total_pwr;
    unsigned long max_pull;
    unsigned long busiest_load_per_task, busiest_nr_running;
    unsigned long this_load_per_task, this_nr_running;
    int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    int power_savings_balance = 1;
    unsigned long leader_nr_running = 0, min_load_per_task = 0;
    unsigned long min_nr_running = ULONG_MAX;
    struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

    max_load = this_load = total_load = total_pwr = 0;
    busiest_load_per_task = busiest_nr_running = 0;
    this_load_per_task = this_nr_running = 0;

    if (idle == CPU_NOT_IDLE) // derive load_idx from the idle parameter; it is a key index below when looking for the busiest sched group (not task group)
        load_idx = sd->busy_idx; // busy_idx defaults to 3
    else if (idle == CPU_NEWLY_IDLE)
        load_idx = sd->newidle_idx; // newidle_idx is 2
    else
        load_idx = sd->idle_idx; // idle_idx is 1

    do { // from here down to while (group != sd->groups) is one big loop that visits every group in the domain looking for the busiest one; this_cpu's own group does not compete against the others
        unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
        int local_group;
        int i;
        int __group_imb = 0;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long sum_nr_running, sum_weighted_load;
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;

        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group)); // set local_group if this_cpu belongs to the group at hand

        if (local_group)
            balance_cpu = cpumask_first(sched_group_cpus(group)); // while handling the "local group", provisionally make its first cpu the balance_cpu

        /* Tally up the load of all CPUs in the group */
        sum_weighted_load = sum_nr_running = avg_load = 0;
        sum_avg_load_per_task = avg_load_per_task = 0;

        max_cpu_load = 0;
        min_cpu_load = ~0UL;

        for_each_cpu_and(i, sched_group_cpus(group), cpus) { // for every cpu in the group
            struct rq *rq = cpu_rq(i);

            if (*sd_idle && rq->nr_running)
                *sd_idle = 0;

            /* Bias balancing toward cpus of our domain */
            if (local_group) { // local group, cpu i is idle, and balance_cpu has not been corrected yet during this loop
                if (idle_cpu(i) && !first_idle_cpu) {
                    first_idle_cpu = 1;
                    balance_cpu = i; // make i the balance_cpu; so the logic is: if the local group has an idle cpu, its first idle cpu becomes balance_cpu, otherwise the group's first cpu does
                }

                load = target_load(i, load_idx); // accumulate the group's load; how much gets added depends on the load_idx chosen above
            } else { // not the local group
                load = source_load(i, load_idx); // as above
                if (load > max_cpu_load) // track the largest and smallest per-cpu load seen in this group
                    max_cpu_load = load;
                if (min_cpu_load > load)
                    min_cpu_load = load;
            }

            avg_load += load; // sum of the loads computed via load_idx
            sum_nr_running += rq->nr_running; // total runnable tasks on the group's runqueues
            sum_weighted_load += weighted_cpuload(i); // the group's instantaneous load; unlike avg_load this has no history component, i.e. no load_idx involved

            sum_avg_load_per_task += cpu_avg_load_per_task(i); // average load per task on this cpu
        }

        /*
         * First idle cpu or the first cpu(busiest) in this sched group
         * is eligible for doing load balancing at this and above
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
        if (idle != CPU_NEWLY_IDLE && local_group &&
            balance_cpu != this_cpu && balance) {
            *balance = 0;
            goto ret;
        }

        total_load += avg_load; // the domain's total load
        total_pwr += group->__cpu_power; // (what exactly cpu_power means I have not yet worked out)

        /* Adjust by relative CPU power of the group */
        avg_load = sg_div_cpu_power(group,
                avg_load * SCHED_LOAD_SCALE); // scale avg_load by the group's power to get its final value

        /*
         * Consider the group unbalanced when the imbalance is larger
         * than the average weight of two tasks.
         *
         * APZ: with cgroup the avg task weight can vary wildly and
         *      might not be a suitable number - should we keep a
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
        avg_load_per_task = sg_div_cpu_power(group,
                sum_avg_load_per_task * SCHED_LOAD_SCALE); // likewise, scale the group's avg_load_per_task

        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) // if the gap between the group's most and least loaded cpu exceeds twice the average task load, set __group_imb(alance); its use shows up below
            __group_imb = 1;

        group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

        if (local_group) { // for the local group, only record the this_* values; never update the busiest pointer
            this_load = avg_load;
            this = group;
            this_nr_running = sum_nr_running;
            this_load_per_task = sum_weighted_load;
        } else if (avg_load > max_load &&
               (sum_nr_running > group_capacity || __group_imb)) { // the group is imbalanced, or holds more tasks than its capacity, and its average load tops every group seen so far
            max_load = avg_load; // record the new maximum
            busiest = group; // and repoint busiest at this group
            busiest_nr_running = sum_nr_running; // task count of the busiest group
            busiest_load_per_task = sum_weighted_load;
            group_imb = __group_imb;
        }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) // we are not analyzing SMT/MC, so skip this long stretch, up to the end of the do-while
        /*
         * Busy processors will not participate in power savings
         * balance.
         */
        if (idle == CPU_NOT_IDLE ||
                !(sd->flags & SD_POWERSAVINGS_BALANCE))
            goto group_next;

        /*
         * If the local group is idle or completely loaded
         * no need to do power savings balance at this domain
         */
        if (local_group && (this_nr_running >= group_capacity ||
                    !this_nr_running))
            power_savings_balance = 0;

        /*
         * If a group is already running at full capacity or idle,
         * don't include that group in power savings calculations
         */
        if (!power_savings_balance || sum_nr_running >= group_capacity
            || !sum_nr_running)
            goto group_next;

        /*
         * Calculate the group which has the least non-idle load.
         * This is the group from where we need to pick up the load
         * for saving power
         */
        if ((sum_nr_running < min_nr_running) ||
            (sum_nr_running == min_nr_running &&
             cpumask_first(sched_group_cpus(group)) <
             cpumask_first(sched_group_cpus(group_min)))) {
            group_min = group;
            min_nr_running = sum_nr_running;
            min_load_per_task = sum_weighted_load /
                        sum_nr_running;
        }

        /*
         * Calculate the group which is almost near its
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
        if (sum_nr_running <= group_capacity - 1) {
            if (sum_nr_running > leader_nr_running ||
                (sum_nr_running == leader_nr_running &&
                 cpumask_first(sched_group_cpus(group)) <
                 cpumask_first(sched_group_cpus(group_leader)))) {
                group_leader = group;
                leader_nr_running = sum_nr_running;
            }
        }
group_next:
#endif
        group = group->next;
    } while (group != sd->groups); // every group in the domain has now been visited; if one qualified as busiest, busiest points at it

    if (!busiest || this_load >= max_load || busiest_nr_running == 0) // nothing qualified, or the local group is even busier than the one found, or the busiest group has no tasks left: no balancing
        goto out_balanced;

    avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

    if (this_load >= avg_load ||
            100*max_load < sd->imbalance_pct*this_load) // more reasons not to balance: the local load is above the domain average, or the ratio of max_load to the local load is below a threshold
        goto out_balanced;

    busiest_load_per_task /= busiest_nr_running; // average load per task in the busiest group
    if (group_imb) // if that group's per-cpu max/min load gap exceeded twice its average task load, correct the estimate
        busiest_load_per_task = min(busiest_load_per_task, avg_load);

    /*
     * We're trying to get all the cpus to the average_load, so we don't
     * want to push ourselves above the average load, nor do we wish to
     * reduce the max loaded cpu below the average load, as either of these
     * actions would just result in more rebalancing later, and ping-pong
     * tasks around. Thus we look for the minimum possible imbalance.
     * Negative imbalances (*we* are more loaded than anyone else) will
     * be counted as no imbalance for these purposes -- we can't fix that
     * by pulling tasks to us. Be careful of negative numbers as they'll
     * appear as very large values with unsigned longs.
     */
    if (max_load <= busiest_load_per_task)
        goto out_balanced;

    /*
     * In the presence of smp nice balancing, certain scenarios can have
     * max load less than avg load(as we skip the groups at or below
     * its cpu_power, while calculating max_load..)
     */
    if (max_load < avg_load) {
        *imbalance = 0;
        goto small_imbalance;
    }

    /* Don't want to pull so many tasks that a group would go idle */
    max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

    /* How much load to actually move to equalise the imbalance */
    *imbalance = min(max_pull * busiest->__cpu_power,
                (avg_load - this_load) * this->__cpu_power)
            / SCHED_LOAD_SCALE; // compute the amount of load to move; what follows is fine-grained heuristics that are hard to justify from first principles, so we won't analyze them

    /*
     * if *imbalance is less than the average load per runnable task
     * there is no gaurantee that any tasks will be moved so we'll have
     * a think about bumping its value to force at least one task to be
     * moved
     */
    if (*imbalance < busiest_load_per_task) {
        unsigned long tmp, pwr_now, pwr_move;
        unsigned int imbn;

small_imbalance:
        pwr_move = pwr_now = 0;
        imbn = 2;
        if (this_nr_running) {
            this_load_per_task /= this_nr_running;
            if (busiest_load_per_task > this_load_per_task)
                imbn = 1;
        } else
            this_load_per_task = cpu_avg_load_per_task(this_cpu);

        if (max_load - this_load + busiest_load_per_task >=
                    busiest_load_per_task * imbn) {
            *imbalance = busiest_load_per_task;
            return busiest;
        }

        /*
         * OK, we don't have enough imbalance to justify moving tasks,
         * however we may be able to increase total CPU power used by
         * moving them.
         */

        pwr_now += busiest->__cpu_power *
                min(busiest_load_per_task, max_load);
        pwr_now += this->__cpu_power *
                min(this_load_per_task, this_load);
        pwr_now /= SCHED_LOAD_SCALE;

        /* Amount of load we'd subtract */
        tmp = sg_div_cpu_power(busiest,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        if (max_load > tmp)
            pwr_move += busiest->__cpu_power *
                min(busiest_load_per_task, max_load - tmp);

        /* Amount of load we'd add */
        if (max_load * busiest->__cpu_power <
                busiest_load_per_task * SCHED_LOAD_SCALE)
            tmp = sg_div_cpu_power(this,
                    max_load * busiest->__cpu_power);
        else
            tmp = sg_div_cpu_power(this,
                    busiest_load_per_task * SCHED_LOAD_SCALE);
        pwr_move += this->__cpu_power *
                min(this_load_per_task, this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;

        /* Move if we gain throughput */
        if (pwr_move > pwr_now)
            *imbalance = busiest_load_per_task;
    }

    return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
        goto ret;

    if (this == group_leader && group_leader != group_min) {
        *imbalance = min_load_per_task;
        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
            cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                cpumask_first(sched_group_cpus(group_leader));
        }
        return group_min;
    }
#endif
ret:
    *imbalance = 0;
    return NULL;
}

Following load_balance's call path, find_busiest_queue runs next; it is much easier to digest.
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
           unsigned long imbalance, const struct cpumask *cpus)
{
    struct rq *busiest = NULL, *rq;
    unsigned long max_load = 0;
    int i;

    for_each_cpu(i, sched_group_cpus(group)) {
        unsigned long wl;

        if (!cpumask_test_cpu(i, cpus)) // cpu i is not in the candidate mask passed down from load_balance
            continue;

        rq = cpu_rq(i);
        wl = weighted_cpuload(i);

        if (rq->nr_running == 1 && wl > imbalance) // a single task whose load exceeds the amount we want to move: skip this cpu
            continue;

        if (wl > max_load) { // track the maximum and the busiest-queue pointer
            max_load = wl;
            busiest = rq;
        }
    }

    return busiest;
}

Back on load_balance's call path once more; this time the actual migration can finally happen.
    busiest = find_busiest_queue(group, idle, imbalance, cpus); // find the busiest runqueue, i.e. the busiest cpu, in that group
    if (!busiest) { // no cpu qualifies, so no balancing is needed
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance); // update statistics

    ld_moved = 0; // flag: did we move any tasks?
    if (busiest->nr_running > 1) { // only consider moving tasks when more than one is runnable there; moving the only task away would leave zero, which is just as unbalanced
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest); // lock both runqueues at once; to avoid deadlock they are taken in order of ascending pointer address
        ld_moved = move_tasks(this_rq, this_cpu, busiest, // guided by imbalance, pick tasks off the busiest queue and move them to this_rq
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

With both locks held, we enter move_tasks.
/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
              unsigned long max_load_move,
              struct sched_domain *sd, enum cpu_idle_type idle,
              int *all_pinned)
{
    const struct sched_class *class = sched_class_highest;
    unsigned long total_load_moved = 0;
    int this_best_prio = this_rq->curr->prio;

    do {
        total_load_moved +=
            class->load_balance(this_rq, this_cpu, busiest,
                max_load_move - total_load_moved,
                sd, idle, all_pinned, &this_best_prio);
        class = class->next;

        if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
            break;
    } while (class && max_load_move > total_load_moved);

    return total_load_moved > 0;
}

The function is essentially one do-while loop. Initially class points at sched_class_highest, and in sched.c we find:

#define sched_class_highest (&rt_sched_class)

That is, on the loop's first pass the load_balance hook of the rt scheduling class is called. Searching sched_rt.c for it turns up:
.load_balance = load_balance_rt,
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move,
        struct sched_domain *sd, enum cpu_idle_type idle,
        int *all_pinned, int *this_best_prio)
{
    /* don't touch RT tasks */
    return 0;
}

It is a stub: load balancing never migrates RT-class tasks this way. The loop therefore falls through to the next scheduling class, the CFS class, and calls its load_balance hook; the toy mock below demonstrates the fall-through.
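The fall-through behaviour is easy to demonstrate with a toy mock of the class chain; none of the names below are kernel symbols, and the struct is heavily abbreviated:

#include <stdio.h>

struct klass {
    const struct klass *next;
    unsigned long (*load_balance)(unsigned long want);
};

static unsigned long rt_lb(unsigned long want)   { (void)want; return 0; } /* "don't touch RT tasks" */
static unsigned long fair_lb(unsigned long want) { return want; }          /* pretend cfs moves it all */

static const struct klass fair = { NULL,  fair_lb };
static const struct klass rt   = { &fair, rt_lb  };

int main(void)
{
    unsigned long want = 100, moved = 0;
    const struct klass *c = &rt; /* plays the role of sched_class_highest */

    do {
        moved += c->load_balance(want - moved);
        c = c->next;
    } while (c && want > moved);

    printf("moved %lu\n", moved); /* prints 100, all contributed by fair */
    return 0;
}

Because rt_lb() contributes nothing, all 100 units end up moved by the fair class, mirroring how move_tasks reaches the CFS hook. The CFS implementation is load_balance_fair: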
#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move,
          struct sched_domain *sd, enum cpu_idle_type idle,
          int *all_pinned, int *this_best_prio)
{
    long rem_load_move = max_load_move; // rem_load_move: the load remaining to be moved
    int busiest_cpu = cpu_of(busiest); // the cpu that owns the busiest queue
    struct task_group *tg;

    rcu_read_lock();
    update_h_load(busiest_cpu); // refresh the hierarchical load factors first

    list_for_each_entry_rcu(tg, &task_groups, list) {
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1);

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load;
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;
}

update_h_load closely resembles the shares-updating function we saw earlier.
static void update_h_load(long cpu)
{
    walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}

As noted above, tg_nop is a no-op, so let's look at tg_load_down.
/*
 * Compute the cpu's hierarchical load factor for each task group.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
static int tg_load_down(struct task_group *tg, void *data) // the comment above is reasonably clear
{
    unsigned long load;
    long cpu = (long)data;

    if (!tg->parent) {
        load = cpu_rq(cpu)->load.weight;
    } else {
        load = tg->parent->cfs_rq[cpu]->h_load; // the load the parent level wants moved
        load *= tg->cfs_rq[cpu]->shares; // effectively this level's weight within the parent
        load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
    }

    tg->cfs_rq[cpu]->h_load = load;

    return 0;
}

Worked out, each level's movable load is: this group's h_load = parent's h_load × this group's shares (its weight within the parent) ÷ the parent cfs_rq's load weight. In plain terms, the movable amount is apportioned down the hierarchy in proportion to load (a numeric example follows).
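A worked example with assumed numbers may help. Suppose on one cpu the root runqueue weighs 4096, a child group's cfs_rq there has shares = 1024, and the parent cfs_rq's load.weight is 4096 (all values made up):

#include <stdio.h>

int main(void)
{
    unsigned long parent_h_load = 4096;  /* root: cpu_rq(cpu)->load.weight   */
    unsigned long child_shares  = 1024;  /* tg->cfs_rq[cpu]->shares          */
    unsigned long parent_weight = 4096;  /* parent cfs_rq's load.weight      */

    /* h_load = parent_h_load * shares / (parent_weight + 1) */
    unsigned long h_load = parent_h_load * child_shares / (parent_weight + 1);
    printf("child h_load = %lu\n", h_load); /* ~1023, about a quarter of 4096 */
    return 0;
}

The child is credited with roughly a quarter of the parent's movable load, exactly its proportional slice. Once each group's share of the work for this queue is known, actual tasks can be picked from each group. Back in load_balance_fair: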
    update_h_load(busiest_cpu); // refresh the hierarchical load factors first
    list_for_each_entry_rcu(tg, &task_groups, list) { // for each task group's runqueue on this cpu
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load; // the amount update_h_load just computed for this group
        unsigned long busiest_weight = busiest_cfs_rq->load.weight; // this group's load
        u64 rem_load, moved_load;

        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;

        rem_load = (u64)rem_load_move * busiest_weight;
        rem_load = div_u64(rem_load, busiest_h_load + 1); // rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1)

        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]); // this is where the real work happens

        if (!moved_load)
            continue;

        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);

        rem_load_move -= moved_load; // after finishing one group, feed the result back to decide whether the next group's tasks still need moving
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();

    return max_load_move - rem_load_move;

The two div_u64 calls convert between global load units and each group's local weight units, as illustrated below.
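A toy calculation with assumed numbers (512 units still to move globally; a group whose local queue weight is 2048 and whose h_load is 1024; nothing here is kernel code):

#include <stdio.h>

int main(void)
{
    unsigned long long rem_load_move = 512;
    unsigned long long busiest_weight = 2048, busiest_h_load = 1024;

    /* scale the global target into the group's local units */
    unsigned long long rem_load =
        rem_load_move * busiest_weight / (busiest_h_load + 1);

    /* pretend __load_balance_fair managed to move half of that */
    unsigned long long moved_local = rem_load / 2;

    /* scale the result back into global units */
    unsigned long long moved_load =
        moved_local * busiest_h_load / (busiest_weight + 1);

    printf("local target %llu, moved %llu local -> %llu global\n",
           rem_load, moved_local, moved_load);
    return 0;
}

Moving about 511 local units is thus reported upward as roughly 255 global units, keeping rem_load_move in one consistent unit across all groups. __load_balance_fair is as follows: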
static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move, struct sched_domain *sd,
        enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
        struct cfs_rq *cfs_rq)
{
    struct rq_iterator cfs_rq_iterator;

    cfs_rq_iterator.start = load_balance_start_fair;
    cfs_rq_iterator.next = load_balance_next_fair;
    cfs_rq_iterator.arg = cfs_rq;

    return balance_tasks(this_rq, this_cpu, busiest,
            max_load_move, sd, idle, all_pinned,
            this_best_prio, &cfs_rq_iterator);
}

From here we descend into balance_tasks.
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move, struct sched_domain *sd,
          enum cpu_idle_type idle, int *all_pinned,
          int *this_best_prio, struct rq_iterator *iterator)
{
    int loops = 0, pulled = 0, pinned = 0;
    struct task_struct *p;
    long rem_load_move = max_load_move;

    if (max_load_move == 0)
        goto out;

    pinned = 1;

    /*
     * Start the load-balancing iterator:
     */
    p = iterator->start(iterator->arg);
next:
    if (!p || loops++ > sysctl_sched_nr_migrate)
        goto out;

    if ((p->se.load.weight >> 1) > rem_load_move ||
        !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { // skip this task if its load is more than twice what remains to be moved, and likewise if it may not be migrated at all
        p = iterator->next(iterator->arg);
        goto next;
    }

    pull_task(busiest, p, this_rq, this_cpu); // it may move: this pulls the task over to this_cpu's this_rq
    pulled++; // one more task moved
    rem_load_move -= p->se.load.weight; // that much less load left to move

    /*
     * We only want to steal up to the prescribed amount of weighted load.
     */
    if (rem_load_move > 0) {
        if (p->prio < *this_best_prio)
            *this_best_prio = p->prio;
        p = iterator->next(iterator->arg);
        goto next;
    }
out:
    /*
     * Right now, this is one of only two places pull_task() is called,
     * so we can safely collect pull_task() stats here rather than
     * inside pull_task().
     */
    schedstat_add(sd, lb_gained[idle], pulled); // statistics

    if (all_pinned)
        *all_pinned = pinned;

    return max_load_move - rem_load_move;
}

can_migrate_task reads:
/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
             struct sched_domain *sd, enum cpu_idle_type idle,
             int *all_pinned)
{
    /*
     * We do not migrate tasks that are:
     * 1) running (obviously), or
     * 2) cannot be migrated to this CPU due to cpus_allowed, or
     * 3) are cache-hot on their current CPU.
     */
    if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
        schedstat_inc(p, se.nr_failed_migrations_affine);
        return 0;
    }
    *all_pinned = 0;

    if (task_running(rq, p)) {
        schedstat_inc(p, se.nr_failed_migrations_running);
        return 0;
    }

    /*
     * Aggressive migration if:
     * 1) task is cache cold, or
     * 2) too many balance attempts have failed.
     */
    if (!task_hot(p, rq->clock, sd) ||
            sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
        if (task_hot(p, rq->clock, sd)) {
            schedstat_inc(sd, lb_hot_gained[idle]);
            schedstat_inc(p, se.nr_forced_migrations);
        }
#endif
        return 1;
    }

    if (task_hot(p, rq->clock, sd)) {
        schedstat_inc(p, se.nr_failed_migrations_hot);
        return 0;
    }
    return 1;
}

The comments are thorough enough that little extra explanation is needed. Moving on, let's look at pull_task.
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
              struct rq *this_rq, int this_cpu) // the comment says it all
{
    deactivate_task(src_rq, p, 0); // take p off the source queue
    set_task_cpu(p, this_cpu); // repoint p's cpu pointers at this_cpu; it is not on the new runqueue yet
    activate_task(this_rq, p, 0); // final step: put p on this_rq
    /*
     * Note that idle threads have a prio of MAX_PRIO, for this test
     * to be always true for them.
     */
    check_preempt_curr(this_rq, p, 0);
}

The set_task_cpu function:
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
    int old_cpu = task_cpu(p);
    struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
    struct cfs_rq *old_cfsrq = task_cfs_rq(p),
              *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
    u64 clock_offset;

    clock_offset = old_rq->clock - new_rq->clock;

    trace_sched_migrate_task(p, task_cpu(p), new_cpu);

#ifdef CONFIG_SCHEDSTATS
    if (p->se.wait_start)
        p->se.wait_start -= clock_offset;
    if (p->se.sleep_start)
        p->se.sleep_start -= clock_offset;
    if (p->se.block_start)
        p->se.block_start -= clock_offset;
    if (old_cpu != new_cpu) {
        schedstat_inc(p, se.nr_migrations);
        if (task_hot(p, old_rq->clock, NULL))
            schedstat_inc(p, se.nr_forced2_migrations);
    }
#endif
    p->se.vruntime -= old_cfsrq->min_vruntime -
                     new_cfsrq->min_vruntime;

    __set_task_cpu(p, new_cpu);
}

__set_task_cpu:
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
    set_task_rq(p, cpu);
#ifdef CONFIG_SMP
    /*
     * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
     * successfuly executed on another CPU. We must ensure that updates of
     * per-task data have been completed by this moment.
     */
    smp_wmb();
    task_thread_info(p)->cpu = cpu;
#endif
}

set_task_rq:
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
    p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
    p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
    p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
    p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

So after the move, p still belongs to the same task group as before; it has merely been placed on that group's runqueue on a different cpu. That completes everything move_tasks triggers: by the per-group amounts computed earlier, it picks tasks off each group in turn and moves them away.

Back in load_balance, the situation now is: we found the busiest sched group in the domain, then the busiest cpu within that group, and move_tasks migrated a suitable batch of tasks from that queue over to this_cpu. The last step is to check how well all of that went.
    if (!ld_moved) { // no tasks were moved
        schedstat_inc(sd, lb_failed[idle]);
        sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { // too many consecutive failures (judging by the name, cache_nice_tries exists to preserve cache hotness)

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) { // check why: is the task simply forbidden from moving to this_cpu?
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance) // as a last resort, wake migration_thread to move tasks on our behalf
                wake_up_process(busiest->migration_thread);

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval; // adjust the balancing period
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

What is migration_thread? It turns out every cpu has a migration_thread kernel thread bound to it, specifically for this situation. The binding simply consists of setting the cpu affinity mask in the thread's task_struct, which also explains why the code above has to handle tasks that "may not be moved to this_cpu". A sketch of that kind of pinning follows.
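For reference, this is roughly how a per-cpu kernel thread gets pinned; the kernel itself sets up the migration threads along these lines in migration_call(), and my_thread_fn / start_pinned_thread below are illustrative names only:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        schedule();                         /* a real thread would handle its work here */
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}

static struct task_struct *start_pinned_thread(unsigned int cpu)
{
    struct task_struct *t;

    t = kthread_create(my_thread_fn, NULL, "my_thread/%u", cpu);
    if (!IS_ERR(t))
        kthread_bind(t, cpu);               /* restrict the thread's cpu mask to this one cpu */
    return t;
}

So what does migration_thread actually do? In sched.c there is the following function, which the thread executes once it is forked: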
/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 */
static int migration_thread(void *data)
{
    int cpu = (long)data;
    struct rq *rq;

    rq = cpu_rq(cpu);
    BUG_ON(rq->migration_thread != current);

    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        struct migration_req *req;
        struct list_head *head;

        spin_lock_irq(&rq->lock);

        if (cpu_is_offline(cpu)) {
            spin_unlock_irq(&rq->lock);
            goto wait_to_die;
        }

        if (rq->active_balance) {
            active_load_balance(rq, cpu);
            rq->active_balance = 0;
        }

        head = &rq->migration_queue;

        if (list_empty(head)) {
            spin_unlock_irq(&rq->lock);
            schedule();
            set_current_state(TASK_INTERRUPTIBLE);
            continue;
        }
        req = list_entry(head->next, struct migration_req, list);
        list_del_init(head->next);

        spin_unlock(&rq->lock);
        __migrate_task(req->task, cpu, req->dest_cpu);
        local_irq_enable();

        complete(&req->done);
    }
    __set_current_state(TASK_RUNNING);
    return 0;

wait_to_die:
    /* Wait for kthread_stop */
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}

In our scenario, execution reaches active_load_balance.
/*
 * active_load_balance is run by migration threads. It pushes running tasks
 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
 * running on each physical CPU where possible, and avoids physical /
 * logical imbalances.
 *
 * Called with busiest_rq locked.
 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
    int target_cpu = busiest_rq->push_cpu;
    struct sched_domain *sd;
    struct rq *target_rq;

    /* Is there any task to move? */
    if (busiest_rq->nr_running <= 1)
        return;

    target_rq = cpu_rq(target_cpu);

    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-cpu setup.
     */
    BUG_ON(busiest_rq == target_rq);

    /* move a task from busiest_rq to target_rq */
    double_lock_balance(busiest_rq, target_rq);
    update_rq_clock(busiest_rq);
    update_rq_clock(target_rq);

    /* Search for an sd spanning us and the target CPU. */
    for_each_domain(target_cpu, sd) {
        if ((sd->flags & SD_LOAD_BALANCE) &&
            cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                break;
    }

    if (likely(sd)) {
        schedstat_inc(sd, alb_count);

        if (move_one_task(target_rq, target_cpu, busiest_rq,
                  sd, CPU_IDLE)) // note: move_one_task, i.e. only a single task; the effort is scaled back, since we only got here because normal balancing was blocked
            schedstat_inc(sd, alb_pushed);
        else
            schedstat_inc(sd, alb_failed);
    }
    double_unlock_balance(busiest_rq, target_rq);
}

Before the thread was woken, push_cpu had already been set to the this_cpu from load_balance. In other words, what could not be moved at the time gets moved later, but the target cpu stays the same.

Beyond this, migration_thread also checks the runqueue's migration_queue for migration requests submitted to it and carries those out as well. How do requests end up on that queue in the first place? Tracing with cscope all the way down leads to exec, that is, the execution path of the sys_execve system call.