linux调度源代码

内核引进了struct task_group结构，如下：

/*
 * Task group: the unit used to implement group scheduling.
 * Each group owns per-CPU scheduling entities and per-CPU runqueues
 * (see the `se` / `cfs_rq` and `rt_se` / `rt_rq` arrays below).
 */

struct task_group {

/* cgroup subsystem state; used by a task to find the task group it belongs to. */

struct cgroup_subsys_state css;



#ifdef CONFIG_FAIR_GROUP_SCHED

/*
 * CFS state of the group. According to the original notes these members are
 * allocated and initialised in alloc_fair_sched_group() (not visible here).
 */

/*
 * One scheduling entity per CPU for this group: the group may be running on
 * several CPUs at once (e.g. its task A on CPU0 and its task B on CPU1).
 */

struct sched_entity **se;

/* One CFS runqueue per CPU for this group. */

struct cfs_rq **cfs_rq;

/*
 * Weight ("shares") of the group. The original note says it holds the
 * default priority weight corresponding to NICE 0.
 */

unsigned long shares;



#ifdef CONFIG_SMP

atomic_long_t load_avg;

atomic_t runnable_avg;

#endif

#endif



#ifdef CONFIG_RT_GROUP_SCHED

/* Real-time scheduler state of the group; same per-CPU layout as for CFS. */

struct sched_rt_entity **rt_se;

struct rt_rq **rt_rq;



struct rt_bandwidth rt_bandwidth;

#endif



struct rcu_head rcu;

/*
 * List linkage. The original note describes it as the list of tasks that
 * belong to this group.
 * NOTE(review): a single list_head embedded in the group looks more like the
 * group's own linkage on a list of groups -- confirm against the kernel source.
 */

struct list_head list;



/*
 * Parent task group. Every group is a scheduling entity on the runqueue of
 * the group one level above it; within one level, groups and tasks are
 * treated alike.
 */

struct task_group *parent;

/* Linkage among sibling groups. */

struct list_head siblings;

/* List of child groups. */

struct list_head children;



#ifdef CONFIG_SCHED_AUTOGROUP

struct autogroup *autogroup;

#endif



struct cfs_bandwidth cfs_bandwidth;

};

在struct task_group结构中，最重要的成员为 struct sched_entity ** se 和 struct cfs_rq ** cfs_rq。在图1 中，root_task_group与task_group1都只有一个，它们在初始化时会根据CPU个数为se和cfs_rq分配空间，即在task_group1和root_task_group中会为每个CPU分配一个se和cfs_rq，同理用于实时进程的 struct sched_rt_entity ** rt_se 和 struct rt_rq ** rt_rq也是一样。为什么这样呢，原因就是在多核多CPU的情况下，同一进程组的进程有可能在不同CPU上同时运行，所以每个进程组都必须对每个CPU分配它的调度实体(struct sched_entity 和 struct sched_rt_entity)和运行队列(struct cfs_rq 和 struct rt_rq)。

调度实体(struct sched_entity)

　　在组调度中，也涉及到调度实体这个概念，它的结构为struct sched_entity(简称se)，就是图1 红黑树中的se。其实际上就代表了一个调度对象，可以为一个进程，也可以为一个进程组。对于根的红黑树而言，一个进程组就相当于一个调度实体，一个进程也相当于一个调度实体。我们可以先看看其结构，如下：

/*
 * A scheduling entity (one node of the red-black tree). It stands for either
 * a single task or a group of tasks. It carries a parent pointer, a pointer
 * to the runqueue it is queued on and, for a group, a runqueue of its own.
 */

struct sched_entity {

/* Weight; the prio_to_weight[] array holds the priority-to-weight values. */

struct load_weight load; /* for load-balancing */

/* This entity's node in the red-black tree. */

struct rb_node run_node;

/*
 * List node. The original note reads "the task group the entity is in".
 * NOTE(review): a list_head only links the entity into some list; which
 * list that is cannot be told from this excerpt -- confirm.
 */

struct list_head group_node;

/* Whether the entity is currently on the (red-black tree) runqueue. */

unsigned int on_rq;



/* Time at which the entity started running. */

u64 exec_start;

/* Total time the entity has run. */

u64 sum_exec_runtime;

/*
 * Virtual runtime; updated on timer interrupts and when the task state
 * changes. It only grows, at a rate inversely proportional to the load
 * weight: the higher the weight, the slower it grows and the more likely the
 * entity sits at the leftmost position of the red-black tree and is picked.
 * Modified on every clock tick; see calc_delta_fair() (not visible here).
 */

u64 vruntime;

/* Value of sum_exec_runtime at the moment the task was switched onto the CPU. */

u64 prev_sum_exec_runtime;



/* Number of times tasks of this entity were moved to another CPU. */

u64 nr_migrations;



#ifdef CONFIG_SCHEDSTATS

/* Assorted statistics. */

struct sched_statistics statistics;

#endif



#ifdef CONFIG_FAIR_GROUP_SCHED

/* Depth of this entity's group; each group is one deeper than its parent group. */

int depth;

/*
 * Parent scheduling entity. For a task it points to the scheduling entity of
 * the runqueue it is on; for a group it points to the scheduling entity of
 * the group one level up. Per the original notes it is set in set_task_rq()
 * (not visible here).
 */

struct sched_entity *parent;

/* CFS runqueue (red-black tree) this entity is queued on. */

struct cfs_rq *cfs_rq;

/*
 * CFS runqueue owned by this entity: NULL means the entity is a task,
 * non-NULL means it is a scheduling group.
 */

struct cfs_rq *my_q;

#endif



#ifdef CONFIG_SMP

/* Per-entity load-tracking */

struct sched_avg avg;

#endif

};

实际上，红黑树是根据 struct rb_node 建立起关系的，不过 struct rb_node 与 struct sched_entity 是一一对应关系，也可以简单看为一个红黑树结点就是一个调度实体。可以看出，在 struct sched_entity 结构中，包含了一个进程(或进程组)调度的全部数据，其被包含在 struct task_struct 结构中的se中，如下：

/*
 * Excerpt of the task descriptor showing only the scheduling-related members;
 * the dotted lines mark fields that the article omits.
 */
struct task_struct {

........

/* Whether the task is on a runqueue. */

int on_rq;



/*
 * Task priorities.
 * prio:        dynamic priority; the original note gives the range 100~139 and
 *              says it depends on the static priority and a bonus.
 *              NOTE(review): this range only covers normal tasks -- confirm
 *              how real-time priorities are represented here.
 * static_prio: static priority, static_prio = 100 + nice + 20 (nice is
 *              -20~19, hence static_prio is 100~139).
 * normal_prio: regular priority not affected by priority inheritance; it
 *              depends on the kind of task, see normal_prio() (not visible
 *              here).
 */

int prio, static_prio, normal_prio;

/* Real-time priority. */

unsigned int rt_priority;

/* Scheduling class: the set of scheduler callbacks used for this task. */

const struct sched_class *sched_class;

/* Scheduling entity (a node of the red-black tree). */

struct sched_entity se;

/* Scheduling entity used by real-time scheduling. */

struct sched_rt_entity rt;

#ifdef CONFIG_CGROUP_SCHED

/* Task group this task belongs to. */

struct task_group *sched_task_group;

#endif

........

}

在 struct sched_entity 结构中，值得我们注意的成员是：

load：权重，通过优先级转换而成，是vruntime计算的关键。
on_rq：表明是否处于CFS红黑树运行队列中，需要明确一个观点就是，CFS运行队列里面包含有一个红黑树，但这个红黑树并不是CFS运行队列的全部，因为红黑树仅仅是用于选择出下一个调度程序的算法。很简单的一个例子，普通程序运行时，其并不在红黑树中，但是还是处于CFS运行队列中，其on_rq为真。只有准备退出、即将睡眠等待和转为实时进程的进程其CFS运行队列的on_rq为假。
vruntime：虚拟运行时间，调度的关键，其计算公式：一次调度间隔的虚拟运行时间 = 实际运行时间 * (NICE_0_LOAD / 权重)。可以看出它跟实际运行时间和权重有关，红黑树就是以此作为排序的标准，优先级越高的进程在运行时其vruntime增长得越慢，其可运行时间相对就越长，而且也越有可能处于红黑树的最左结点，调度器每次都选择最左边的结点为下一个调度进程。注意其值为单调递增，在每个调度器的时钟中断时当前进程的虚拟运行时间都会累加。简单地说就是进程们都在比谁的vruntime最小，最小的将被调度。
cfs_rq：此调度实体所处于的CFS运行队列。
my_q：如果此调度实体代表的是一个进程组，那么此调度实体就包含有一个自己的CFS运行队列，其CFS运行队列中存放的是此进程组中的进程，这些进程就不会在其他CFS运行队列的红黑树中被包含(包括顶层红黑树也不会包含他们，他们只属于这个进程组的红黑树)。
　　对于怎么理解一个进程组有它自己的CFS运行队列，其实很好理解，比如在根CFS运行队列的红黑树上有一个进程A一个进程组B，各占50%的CPU，对于根的红黑树而言，他们就是两个调度实体。调度器调度的不是进程A就是进程组B，而如果调度到进程组B，进程组B自己选择一个程序交给CPU运行就可以了，而进程组B怎么选择一个程序给CPU，就是通过自己的CFS运行队列的红黑树选择，如果进程组B还有个子进程组C，原理都一样，就是一个层次结构。

　　而在 struct task_struct 结构中，我们注意到有个调度类，里面包含的是调度处理函数，它具体如下：

/*
 * Scheduling class: the table of callbacks one scheduling policy provides.
 * Each task points at the class that schedules it (task_struct->sched_class).
 */
struct sched_class {

/*
 * Next scheduling class in priority order:
 * stop_sched_class -> dl_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class
 */

const struct sched_class *next;



/*
 * Add a task to the runqueue, i.e. put its scheduling entity into the
 * red-black tree, and increment nr_running.
 */

void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);

/* Remove a task from the runqueue and decrement nr_running. */

void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);

/*
 * Give up the CPU. Per the original notes: with the compat_yield sysctl
 * turned off this is effectively a dequeue followed by an enqueue, which
 * places the scheduling entity at the rightmost end of the red-black tree.
 */

void (*yield_task) (struct rq *rq);

bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);



/* Check whether the current task may be preempted by a new task. */

void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);



/*
 * It is the responsibility of the pick_next_task() method that will
 * return the next task to call put_prev_task() on the @prev task or
 * something equivalent.
 *
 * May return RETRY_TASK when it finds a higher prio class has runnable
 * tasks.
 */

/* Choose the task that should run next. */

struct task_struct * (*pick_next_task) (struct rq *rq,

struct task_struct *prev);

/* Put the task back onto the runqueue. */

void (*put_prev_task) (struct rq *rq, struct task_struct *p);



#ifdef CONFIG_SMP

/* Select a suitable CPU for the task. */

int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);

/* Migrate the task to another CPU. */

void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

/* Used after a context switch. */

void (*post_schedule) (struct rq *this_rq);

/* Used when a task is woken up. */

void (*task_waking) (struct task_struct *task);

void (*task_woken) (struct rq *this_rq, struct task_struct *task);

/* Change the CPU affinity of the task. */

void (*set_cpus_allowed)(struct task_struct *p,

const struct cpumask *newmask);

/* Bring the runqueue online. */

void (*rq_online)(struct rq *rq);

/* Take the runqueue offline. */

void (*rq_offline)(struct rq *rq);

#endif

/* Called when a task changes its scheduling class or its task group. */

void (*set_curr_task) (struct rq *rq);

/*
 * Normally called from the timer tick; it may cause a task switch and is
 * what drives preemption of the running task.
 */

void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);

/* Called at task creation; initialisation differs per scheduling policy. */

void (*task_fork) (struct task_struct *p);

/* Used when a task exits. */

void (*task_dead) (struct task_struct *p);



/* Used when a task switches (from / to this class, judging by the names -- TODO confirm). */

void (*switched_from) (struct rq *this_rq, struct task_struct *task);

void (*switched_to) (struct rq *this_rq, struct task_struct *task);

/* Priority change notification. */

void (*prio_changed) (struct rq *this_rq, struct task_struct *task,

int oldprio);



unsigned int (*get_rr_interval) (struct rq *rq,

struct task_struct *task);



void (*update_curr) (struct rq *rq);



#ifdef CONFIG_FAIR_GROUP_SCHED

void (*task_move_group) (struct task_struct *p, int on_rq);

#endif

};

这个调度类具体有什么用呢？实际上在内核中不同的调度算法它们的操作都不相同，为了方便修改、替换调度算法，使用了调度类，每个调度算法只需要实现自己的调度类就可以了，CFS算法有它的调度类，SCHED_FIFO也有它自己的调度类，当一个进程创建时，用什么调度算法就将其 task_struct->sched_class 指向其相应的调度类，调度器每次调度处理时，就通过当前进程的调度类函数进行操作，大大提高了可移植性和易修改性。

CFS运行队列(struct cfs_rq)
　　我们现在知道，在系统中至少有一个CFS运行队列，其就是根CFS运行队列，而其他的进程组和进程都包含在此运行队列中，不同的是进程组又有它自己的CFS运行队列，其运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时，进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能为进程，也可能又是一个子进程组)，就这样一直深入，直到最后选出一个进程进行运行为止。
　　对于 struct cfs_rq 结构没有什么好说明的，只要确定其代表着一个CFS运行队列，并且包含有一个红黑树进行选择调度进程即可。

/*
 * CFS runqueue. Every per-CPU rq embeds one cfs_rq, and every group
 * scheduling entity also has a cfs_rq of its own.
 */

struct cfs_rq {

/* Total load of everything queued on this CFS runqueue. */

struct load_weight load;

/*
 * nr_running:   number of scheduling entities on this cfs_rq.
 * h_nr_running: only meaningful for groups; sum of nr_running over the
 *               cfs_rqs of all groups below this one.
 */

unsigned int nr_running, h_nr_running;



u64 exec_clock;

/*
 * Minimum virtual runtime on this CFS queue; monotonically increasing.
 * It is updated in two situations:
 * 1. when the accumulated runtime of the currently running task is updated;
 * 2. when a task is removed from the queue (e.g. it sleeps or exits): the
 *    vruntime of the remaining tasks is compared with min_vruntime and, if
 *    larger, min_vruntime is advanced.
 */

u64 min_vruntime;

#ifndef CONFIG_64BIT

u64 min_vruntime_copy;

#endif

/* Root of this runqueue's red-black tree. */

struct rb_root tasks_timeline;

/*
 * Next node to schedule: the leftmost node of the red-black tree, which is
 * the next scheduling entity to run.
 */

struct rb_node *rb_leftmost;



/*
 * 'curr' points to currently running entity on this cfs_rq.
 * It is set to NULL otherwise (i.e when none are currently running).
 */

/*
 * curr: the sched_entity currently running. A group never runs on a CPU
 *       itself, but when one of the tasks below it is running, the cfs_rq the
 *       group is queued on treats the group as its currently running entity.
 * next: an entity that urgently needs to run even if that bypasses normal
 *       CFS ordering; the pick path checks `next` and schedules it if set.
 * last: not described in the original notes.
 * skip: an entity to pass over (it will not be chosen).
 */

struct sched_entity *curr, *next, *last, *skip;



#ifdef CONFIG_SCHED_DEBUG

unsigned int nr_spread_over;

#endif



#ifdef CONFIG_SMP

/*
 * CFS Load tracking
 * Under CFS, load is tracked on a per-entity basis and aggregated up.
 * This allows for the description of both thread and group usage (in
 * the FAIR_GROUP_SCHED case).
 */

unsigned long runnable_load_avg, blocked_load_avg;

atomic64_t decay_counter;

u64 last_decay;

atomic_long_t removed_load;



#ifdef CONFIG_FAIR_GROUP_SCHED

/* Required to track per-cpu representation of a task_group */

u32 tg_runnable_contrib;

unsigned long tg_load_contrib;



/*
 * h_load = weight * f(tg)
 *
 * Where f(tg) is the recursive weight fraction assigned to
 * this group.
 */

unsigned long h_load;

u64 last_h_load_update;

struct sched_entity *h_load_next;

#endif /* CONFIG_FAIR_GROUP_SCHED */

#endif /* CONFIG_SMP */



#ifdef CONFIG_FAIR_GROUP_SCHED

/* The per-CPU rq this cfs_rq belongs to. */

struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */



/*
 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 * (like users, containers etc.)
 *
 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 * list is used during load balance.
 */

int on_list;

struct list_head leaf_cfs_rq_list;

/* Task group that owns this CFS runqueue. */

struct task_group *tg; /* group that "owns" this runqueue */



#ifdef CONFIG_CFS_BANDWIDTH

int runtime_enabled;

u64 runtime_expires;

s64 runtime_remaining;



u64 throttled_clock, throttled_clock_task;

u64 throttled_clock_task_time;

int throttled, throttle_count;

struct list_head throttled_list;

#endif /* CONFIG_CFS_BANDWIDTH */

#endif /* CONFIG_FAIR_GROUP_SCHED */

};

load：其保存的是进程组中所有进程的权值总和，需要注意子进程计算vruntime时需要用到进程组的load。 
CPU运行队列(struct rq)
　　每个CPU都有自己的 struct rq 结构，其用于描述在此CPU上所运行的所有进程，其包括一个实时进程队列和一个根CFS运行队列，在调度时，调度器首先会先去实时进程队列找是否有实时进程需要运行，如果没有才会去CFS运行队列找是否有进程需要运行，这就是为什么常说的实时进程优先级比普通进程高，不仅仅体现在prio优先级上，还体现在调度器的设计上，至于dl运行队列，我暂时还不知道有什么用处，其优先级比实时进程还高，但是创建进程时如果创建的是dl进程，创建会失败(具体见sys_fork)。

/*
 * Per-CPU runqueue: every CPU has exactly one struct rq. It embeds the root
 * CFS runqueue, the real-time runqueue and the deadline runqueue.
 */

struct rq {

/*
 * Spinlock of this runqueue.
 * NOTE(review): the original note at this position read "sum of the load of
 * all runnable tasks on the runqueue", which appears to describe the `load`
 * member further down rather than the lock.
 */

raw_spinlock_t lock;



/*
 * nr_running and cpu_load should be in the same cacheline because
 * remote CPUs use both these fields when doing load calculation.
 */

/* Total number of runnable tasks on this CPU: CFS, RT and the running one. */

unsigned int nr_running;

#ifdef CONFIG_NUMA_BALANCING

unsigned int nr_numa_running;

unsigned int nr_preferred_running;

#endif

#define CPU_LOAD_IDX_MAX 5

/*
 * Load derived from the CPU's history. Per the original notes cpu_load[0]
 * always equals load.weight, and once load is balanced cpu_load[1] and
 * cpu_load[2] should equal load.weight as well.
 */

unsigned long cpu_load[CPU_LOAD_IDX_MAX];

/* Time of the last cpu_load update. */

unsigned long last_load_update_tick;

#ifdef CONFIG_NO_HZ_COMMON

u64 nohz_stamp;

unsigned long nohz_flags;

#endif

#ifdef CONFIG_NO_HZ_FULL

unsigned long last_sched_tick;

#endif

/* Whether the rq clock update needs to be performed / may be skipped. */

int skip_clock_update;



/* capture load from *all* tasks on this cpu: */

/*
 * CPU load: sum of the load of all runnable tasks on this CPU. It must be
 * updated whenever nr_running is updated.
 */

struct load_weight load;

unsigned long nr_load_updates;

/* Number of context switches performed; only used by proc. */

u64 nr_switches;



/* CFS runqueue; contains the root of the red-black tree. */

struct cfs_rq cfs;

/* Real-time runqueue. */

struct rt_rq rt;

struct dl_rq dl;



#ifdef CONFIG_FAIR_GROUP_SCHED

/* list of leaf cfs_rq on this cpu: */

struct list_head leaf_cfs_rq_list;



struct sched_avg avg;

#endif /* CONFIG_FAIR_GROUP_SCHED */



/*
 * This is part of a global counter where only the total sum
 * over all CPUs matters. A task can increase this counter on
 * one CPU and if it got migrated afterwards it may decrease
 * it on another CPU. Always updated under the runqueue lock:
 */

/* Number of tasks that were on the queue and are now in TASK_UNINTERRUPTIBLE state. */

unsigned long nr_uninterruptible;



/*
 * curr: task currently running on this CPU.
 * idle: this CPU's idle task, run when the CPU has nothing else to do; it
 *       performs no work.
 * stop: not described in the original notes.
 */

struct task_struct *curr, *idle, *stop;

/* Time at which load balancing is due to run next. */

unsigned long next_balance;

/* During a task switch, holds the memory descriptor of the task being switched out. */

struct mm_struct *prev_mm;



/* Runqueue clock. */

u64 clock;

u64 clock_task;



atomic_t nr_iowait;



#ifdef CONFIG_SMP

struct root_domain *rd;

/*
 * Base scheduling domain of this CPU. Each domain contains one or more CPU
 * groups, and each group holds a subset of the domain's CPUs. Load balancing
 * happens between the groups of a domain and does not cross domains.
 */

struct sched_domain *sd;



unsigned long cpu_capacity;



unsigned char idle_balance;

/* For active balancing */

int post_schedule;

/* Set when a task needs to be migrated to another runqueue. */

int active_balance;

int push_cpu;

struct cpu_stop_work active_balance_work;



/* CPU this runqueue belongs to. */

int cpu;

int online;



struct list_head cfs_tasks;



u64 rt_avg;

/* The original note calls this the "time the runqueue has been alive". */

u64 age_stamp;

u64 idle_stamp;

u64 avg_idle;



/* This is used to determine avg_idle's max value */

u64 max_idle_balance_cost;

#endif



#ifdef CONFIG_IRQ_TIME_ACCOUNTING

u64 prev_irq_time;

#endif

#ifdef CONFIG_PARAVIRT

u64 prev_steal_time;

#endif

#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING

u64 prev_steal_time_rq;

#endif



/* calc_load related fields */

/*
 * The original note says "used for load balancing".
 * NOTE(review): the heading above ties these fields to calc_load, i.e. the
 * load calculation, rather than to balancing -- confirm.
 */

unsigned long calc_load_update;

long calc_load_active;



#ifdef CONFIG_SCHED_HRTICK

#ifdef CONFIG_SMP

int hrtick_csd_pending;

struct call_single_data hrtick_csd;

#endif

/* High-resolution timer used by the scheduler. */

struct hrtimer hrtick_timer;

#endif



#ifdef CONFIG_SCHEDSTATS

/* latency stats */

struct sched_info rq_sched_info;

unsigned long long rq_cpu_time;

/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */



/* sys_sched_yield() stats */

unsigned int yld_count;



/* schedule() stats */

unsigned int sched_count;

unsigned int sched_goidle;



/* try_to_wake_up() stats */

unsigned int ttwu_count;

unsigned int ttwu_local;

#endif



#ifdef CONFIG_SMP

struct llist_head wake_list;

#endif



#ifdef CONFIG_CPU_IDLE

/* Must be inspected within a rcu lock section */

struct cpuidle_state *idle_state;

#endif

};
posted @ 2022-05-06 22:31 GUO_dx 阅读(313) 评论(0) 收藏举报
刷新页面返回顶部
GUOdx

快乐滴程序媛！

linux调度源代码