Linux内核源码—CFS调度(4.20.17)
cfs_rq
每个 cpu 都有一个对应的运行队列 rq,在 rq 中维护着不同调度策略的调度队列。
1 2 3 4 5 6 7 | struct rq { ... struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; ... }; |
cfs 的调度队列通过红黑树维护,在 cfs_rq 的数据结构中,struct rb_root_cached tasks_timeline 包含了红黑树 struct rb_root rb_root 和最左节点(即 vruntime 最小的节点,不一定是叶子节点)的缓存 struct rb_node *rb_leftmost。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | struct cfs_rq { struct load_weight load; //CFS运行队列的负载权重值 unsigned long runnable_weight; unsigned int nr_running; unsigned int h_nr_running; u64 exec_clock; u64 min_vruntime; #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif struct rb_root_cached tasks_timeline; //红黑树,维护调度实体 /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; //当前运行的调度实体 struct sched_entity *next; //下一个调度实体 struct sched_entity *last; //队列中最后的调度实体 struct sched_entity *skip; //跳过的调度实体 #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; #endif #ifdef CONFIG_SMP /* * CFS load tracking */ struct sched_avg avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; int nr; unsigned long load_avg; unsigned long util_avg; unsigned long runnable_sum; } removed; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; /* * h_load = weight * f(tg) * * Where f(tg) is the recursive weight fraction assigned to * this group. */ unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. * This list is used during load balance. 
*/ int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; int expires_seq; u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock; u64 throttled_clock_task; u64 throttled_clock_task_time; int throttled; int throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; |
vruntime
那么 CFS 是根据什么来对任务进行排序呢?答案是虚拟运行时间 vruntime。
update_curr 函数(kernel/sched/fair.c)实现了 vruntime 的更新,其步骤是先计算出当前调度实体自上次更新以来的实际运行时间 delta_exec,再根据该调度实体的负载权重(se->load.weight)对 delta_exec 进行加权运算,并把结果累加到 vruntime 上。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | static void update_curr( struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; //获取当前调度实体 u64 now = rq_clock_task(rq_of(cfs_rq)); //获取当前时间 u64 delta_exec; if (unlikely(!curr)) return ; delta_exec = now - curr->exec_start; //计算当前进程已执行的时间,exec_start是调度实体的开始执行时间 if (unlikely((s64)delta_exec <= 0)) return ; curr->exec_start = now; schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; //修改调度实体已执行总时间 schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); //修改调度实体虚拟运行时间 update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { //如果调度实体是task,也要给它的调度组记录执行时间 struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); } |
calc_delta_fair(delta_exec, curr) 实现了虚拟运行时间的计算:
虚拟运行时间的增量 = delta_exec * NICE_0_LOAD / 当前进程的权重
而具体在 __calc_delta 中,是通过 (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT 实现的:其中 lw->inv_weight 是预先计算好的权重倒数(约为 2^32 / lw->weight),WMULT_SHIFT 为 32,这样就用乘法加右移代替了除法,同时也避免了浮点运算。
从公式可以得出,如果一个进程的虚拟运行时间越小,说明它实际运行的时间越少,或者它的权重越大,那么它就应该具有更高的优先级。红黑树以进程的 vruntime 作为键值进行排序,每次选择 vruntime 最小的进程执行,该进程对应树中最左侧的节点(不一定是叶子节点),缓存在 struct rb_node *rb_leftmost 中。
1 2 3 4 5 6 7 | static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) delta = __calc_delta(delta, NICE_0_LOAD, &se->load); return delta; } |
进程选择
在进程变为可运行状态(被唤醒)或者是通过 fork() 调用第一次创建进程时,需要将进程插入红黑树,调用 __enqueue_entity 实现这一过程。从红黑树中删除节点也是同样的道理,由 __dequeue_entity 实现。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | static void __enqueue_entity( struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; //红黑树根节点 struct rb_node *parent = NULL; struct sched_entity *entry; bool leftmost = true ; /* * Find the right place in the rbtree: */ while (*link) { parent = *link; entry = rb_entry(parent, struct sched_entity, run_node); //rb_entry 只是 container_of 的封装而已,找到首地址 /* * We dont care about collisions. Nodes with * the same key stay together. */ if (entity_before(se, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false ; } } rb_link_node(&se->run_node, parent, link); //在红黑树中插入节点 rb_insert_color_cached(&se->run_node, //设置节点的颜色 &cfs_rq->tasks_timeline, leftmost); } |
进程调度
进程调度的主要入口点是函数 schedule()(kernel/sched/core.c),其核心逻辑在 __schedule() 中:它通过 pick_next_task() 函数选择下一个进程,如果选出来的进程与当前运行进程不一致,则调用 context_switch() 函数进行上下文切换。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | static void __sched notrace __schedule( bool preempt) { cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; //获取当前运行进程 ... next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); if (likely(prev != next)) { ... rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } ... } |
pick_next_task() 函数的实现并不复杂,这里用到了一点优化:如果 prev 属于 idle 或 fair 调度类,并且运行队列上所有的可运行进程都在 cfs 中(rq->nr_running == rq->cfs.h_nr_running),那么就可以直接调用 cfs 的 pick_next_task();否则就需要按照调度类的优先级从高到低依次遍历各个调度类来选择。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | static inline struct task_struct * pick_next_task( struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class * class ; struct task_struct *p; /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ if (likely((prev->sched_class == &idle_sched_class || prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class( class ) { p = class ->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; return p; } } /* The idle class should always have a runnable task: */ BUG(); } |
References:
如果您觉得阅读本文对您有帮助,请点一下“推荐”按钮,您的“推荐”将是我最大的写作动力!欢迎各位转载,但是未经作者本人同意,转载文章之后必须在文章页面明显位置给出作者和原文链接,否则保留追究法律责任的权利。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
2021-03-08 生成one-hot的方法
2017-03-08 LA 4254 处理器(二分+贪心)
2017-03-08 Java课程设计—拿火柴小游戏