Loading [Contrib]/a11y/accessibility-menu.js

Linux内核源码—CFS调度(4.20.17)

 


 cfs_rq

每个 cpu 都有一个对应的运行队列 rq,在 rq 中维护着不同调度策略的调度队列。

1
2
3
4
5
6
7
/*
 * Per-CPU runqueue (abridged excerpt: each "..." stands for members that
 * were left out of this listing). It embeds one sub-runqueue per
 * scheduling policy.
 */
struct rq {
        ...
    struct cfs_rq       cfs;  /* runqueue used by the CFS (fair) policy */
    struct rt_rq        rt;   /* NOTE(review): presumably the real-time policy runqueue - confirm */
    struct dl_rq        dl;   /* NOTE(review): presumably the deadline policy runqueue - confirm */
        ...  
};

cfs 的调度队列通过红黑树维护。在 cfs_rq 的数据结构中,struct rb_root_cached tasks_timeline 包含了红黑树的根 struct rb_root rb_root,以及对最左节点(即 vruntime 最小的节点,它不一定是叶子节点)的缓存指针 struct rb_node *rb_leftmost。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/*
 * CFS runqueue: the per-runqueue state of the fair scheduling class.
 * Scheduling entities are kept in the tasks_timeline red-black tree;
 * most of the remaining members are only present under the
 * configuration options guarding them below.
 */
struct cfs_rq {
    struct load_weight  load;  // load weight of this CFS runqueue
    unsigned long       runnable_weight;
    unsigned int        nr_running;
    unsigned int        h_nr_running;  // compared with rq->nr_running in pick_next_task()
 
    u64         exec_clock;    // update_curr() adds delta_exec here via schedstat_add()
    u64         min_vruntime;  // NOTE(review): presumably maintained by update_min_vruntime() - confirm
#ifndef CONFIG_64BIT
    u64         min_vruntime_copy;
#endif
 
    struct rb_root_cached   tasks_timeline;  // red-black tree holding the scheduling entities
 
    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;  // scheduling entity that is currently running
    struct sched_entity *next;  // NOTE(review): presumably a hint for the entity to run next - confirm in fair.c
    struct sched_entity *last;  // NOTE(review): presumably a hint for the entity that ran last - confirm in fair.c
    struct sched_entity *skip;  // NOTE(review): presumably an entity to be skipped when picking - confirm in fair.c
 
#ifdef  CONFIG_SCHED_DEBUG
    unsigned int        nr_spread_over;
#endif
 
#ifdef CONFIG_SMP
    /*
     * CFS load tracking
     */
    struct sched_avg    avg;
#ifndef CONFIG_64BIT
    u64         load_last_update_time_copy;
#endif
    /* Sums grouped under their own lock, which is cacheline aligned. */
    struct {
        raw_spinlock_t  lock ____cacheline_aligned;
        int     nr;
        unsigned long   load_avg;
        unsigned long   util_avg;
        unsigned long   runnable_sum;
    } removed;
 
#ifdef CONFIG_FAIR_GROUP_SCHED
    unsigned long       tg_load_avg_contrib;
    long            propagate;
    long            prop_runnable_sum;
 
    /*
     *   h_load = weight * f(tg)
     *
     * Where f(tg) is the recursive weight fraction assigned to
     * this group.
     */
    unsigned long       h_load;
    u64         last_h_load_update;
    struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
 
#ifdef CONFIG_FAIR_GROUP_SCHED
    struct rq       *rq;    /* CPU runqueue to which this cfs_rq is attached */
 
    /*
     * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
     * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
     * (like users, containers etc.)
     *
     * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
     * This list is used during load balance.
     */
    int         on_list;
    struct list_head    leaf_cfs_rq_list;
    struct task_group   *tg;    /* group that "owns" this runqueue */
 
#ifdef CONFIG_CFS_BANDWIDTH
    /* Bandwidth-control state; only present with CONFIG_CFS_BANDWIDTH. */
    int         runtime_enabled;
    int         expires_seq;
    u64         runtime_expires;
    s64         runtime_remaining;
 
    u64         throttled_clock;
    u64         throttled_clock_task;
    u64         throttled_clock_task_time;
    int         throttled;
    int         throttle_count;
    struct list_head    throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

 

vruntime

那么 CFS 是根据什么来对任务进行排序的呢?答案是:虚拟运行时间 vruntime。

update_curr 函数(kernel/sched/fair.c)实现了 vruntime 的更新,其步骤是:先计算出当前调度实体自上次更新以来的实际运行时间 delta_exec,再通过 calc_delta_fair() 按该调度实体自身的负载权重(而不是可运行进程的数量)对 delta_exec 进行加权,最后把结果累加到 vruntime 上。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Account the CPU time consumed by the entity currently running on
 * @cfs_rq since its exec_start timestamp.
 *
 * The elapsed time is added to the entity's sum_exec_runtime and, after
 * weighting by calc_delta_fair(), to its vruntime. The function then calls
 * update_min_vruntime() and account_cfs_rq_runtime(); when the entity is
 * a task, the per-task accounting hooks are invoked as well.
 *
 * Returns early without changing anything when there is no current
 * entity or when no time has elapsed.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;  // entity currently running on this cfs_rq
    u64 now = rq_clock_task(rq_of(cfs_rq));  // current time, read from the owning rq's task clock
    u64 delta_exec;
 
    if (unlikely(!curr))
        return;
 
    delta_exec = now - curr->exec_start;  // time run since exec_start, the entity's last start/update timestamp
    /* Compared as signed so a clock that did not advance (or went backwards) is ignored. */
    if (unlikely((s64)delta_exec <= 0))
        return;
 
    /* Restart the measurement window so the next call only sees new time. */
    curr->exec_start = now;
 
    /* Keep the largest single slice seen so far in statistics.exec_max. */
    schedstat_set(curr->statistics.exec_max,
              max(delta_exec, curr->statistics.exec_max));
 
    curr->sum_exec_runtime += delta_exec;  // total real execution time of the entity
    schedstat_add(cfs_rq->exec_clock, delta_exec);
 
    curr->vruntime += calc_delta_fair(delta_exec, curr);  // advance the weighted (virtual) runtime
    update_min_vruntime(cfs_rq);
 
    if (entity_is_task(curr)) {  // the entity is a task: also run the per-task accounting hooks
        struct task_struct *curtask = task_of(curr);
 
        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }
 
    account_cfs_rq_runtime(cfs_rq, delta_exec);
}

calc_delta_fair(delta_exec, curr) 实现了虚拟运行时间的计算:

虚拟运行时间的增量 = delta_exec × NICE_0_LOAD / 当前调度实体的权重(当权重恰好等于 NICE_0_LOAD 时,增量就是 delta_exec 本身)

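而具体在 __calc_delta 中,是通过 (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 实现的:其中 lw->inv_weight 是预先计算好的权重倒数(约为 2^32 / lw->weight),因此“先乘以 inv_weight、再右移 WMULT_SHIFT 位”就等价于除以权重,从而用乘法和移位代替了开销较大的除法运算(也无需浮点运算)。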

从公式可以得出,如果一个进程的虚拟运行时间越小,说明它实际运行的时间越少,或者是它的权重越大,那么它就应该具有更高的优先度。红黑树正是以进程的 vruntime 值作为键来维护的,每次选择 vruntime 最小的进程执行;该进程对应红黑树的最左节点,其指针缓存在 struct rb_node *rb_leftmost 中。

1
2
3
4
5
6
7
/*
 * Convert a span of real execution time @delta into the amount by which
 * the virtual runtime of @se advances.
 *
 * An entity whose load weight equals NICE_0_LOAD gets @delta back
 * unchanged; any other weight is scaled through
 * __calc_delta(delta, NICE_0_LOAD, &se->load).
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    /* The hint marks a weight other than NICE_0_LOAD as the rare case. */
    return unlikely(se->load.weight != NICE_0_LOAD)
        ? __calc_delta(delta, NICE_0_LOAD, &se->load)
        : delta;
}

  

进程选择

在进程变为可运行状态(被唤醒)或者是通过 fork() 调用第一次创建进程时,需要将进程插入红黑树,调用 __enqueue_entity 实现这一过程。删除节点也是同样的道理。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;  //红黑树根节点
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;
 
    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);  //rb_entry 只是 container_of 的封装而已,找到首地址
        /*
         * We dont care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = false;
        }
    }
 
    rb_link_node(&se->run_node, parent, link);  //在红黑树中插入节点
    rb_insert_color_cached(&se->run_node,  //设置节点的颜色
                   &cfs_rq->tasks_timeline, leftmost);
}

  

进程调度

进程调度的主要入口点是函数 schedule()(kernel/sched/core.c),其核心逻辑位于下面列出的 __schedule() 中:它通过 pick_next_task() 函数选择下一个进程,如果选出来的进程与当前运行进程不一致,则调用 context_switch() 函数进行上下文切换。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Core of the scheduler (abridged excerpt: each "..." stands for code
 * left out of this listing, including the declarations of the locals
 * cpu, rq, prev, next and rf).
 *
 * Looks up the runqueue of the current CPU, asks pick_next_task() for
 * the task to run next and, if it differs from the task that was
 * running, switches to it with context_switch(). Otherwise it clears
 * the RQCF_ACT_SKIP / RQCF_REQ_SKIP clock-update flags and unlocks the
 * runqueue.
 */
static void __sched notrace __schedule(bool preempt)
{
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;  // task currently running on this runqueue
 
        ...
 
    next = pick_next_task(rq, prev, &rf);
    /* A decision has been made, so drop the pending reschedule requests. */
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();
 
    if (likely(prev != next)) {
                ...
        rq = context_switch(rq, prev, next, &rf);
    } else {
        /* Same task keeps running: no switch, just clear the flags and unlock. */
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }
        ...
}

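pick_next_task() 函数的实现并不复杂,这里用到了一点优化:如果 prev 属于 idle 或 fair 调度类,并且该 rq 上所有的可运行进程都在 cfs 中(rq->nr_running == rq->cfs.h_nr_running),那么就可以直接调用 cfs 的 pick_next_task()(选不出任务时再退回到 idle 调度类);否则就需要通过 for_each_class 按照调度类的优先级依次尝试各个调度类。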

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * Pick the task that should run next on @rq.
 *
 * Fast path: when @prev belongs to the idle or fair class and every
 * runnable task on @rq is accounted in rq->cfs
 * (rq->nr_running == rq->cfs.h_nr_running), the fair class is asked
 * directly, with the idle class as fallback when it returns NULL.
 *
 * Slow path: the scheduling classes are walked with for_each_class()
 * and the first non-NULL pick is returned.
 * NOTE(review): presumably the walk goes from the highest-priority
 * class to the lowest - confirm against for_each_class().
 *
 * A class returning RETRY_TASK restarts the class walk from the top.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;
 
    /*
     * Optimization: we know that if all tasks are in the fair class we can
     * call that function directly, but only if the @prev task wasn't of a
     * higher scheduling class, because otherwise those lose the
     * opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
            prev->sched_class == &fair_sched_class) &&
           rq->nr_running == rq->cfs.h_nr_running)) {
 
        p = fair_sched_class.pick_next_task(rq, prev, rf);
        /* The fair class asked for a retry: fall back to the full class walk. */
        if (unlikely(p == RETRY_TASK))
            goto again;
 
        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);
 
        return p;
    }
 
again:
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            /* RETRY_TASK is a sentinel, not a task: restart the walk. */
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }
 
    /* The idle class should always have a runnable task: */
    BUG();
}

  

 

 

References

  1. 【原创】(五)Linux进程调度-CFS调度器
  2. CFS调度主要代码分析一


如果您觉得阅读本文对您有帮助,请点一下“推荐”按钮,您的“推荐”将是我最大的写作动力!欢迎各位转载,但是未经作者本人同意,转载文章之后必须在文章页面明显位置给出作者和原文连接,否则保留追究法律责任的权利。
posted @   Kayden_Cheung  阅读(490)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
历史上的今天:
2021-03-08 生成one-hot的方法
2017-03-08 LA 4254 处理器(二分+贪心)
2017-03-08 Java课程设计—拿火柴小游戏
//目录
点击右上角即可分享
微信分享提示