Linux中的进程调度(四)
先把我们最初开始的函数写上,前面写得太长,不再看一下原始代码可能就回不去了。
/* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks * get a chance to run but frequent forkers are not allowed to * monopolize the CPU. Note: the parent runqueue is locked, * the child is not running yet. */ static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); sched_info_queued(p); update_curr(cfs_rq); /* bring curr's accounting up to date before placing the child */ place_entity(cfs_rq, se, 1); /* initial=1: fork-time placement for the new child */ /* 'curr' will be NULL if the child belongs to a different group */ if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && curr && curr->vruntime < se->vruntime) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); } enqueue_task_fair(rq, p, 0); resched_task(rq->curr); }上次已经分析过update_curr的代码,现在接着往下走,去看place_entity()
/* Pick a starting vruntime for @se around cfs_rq->min_vruntime; @initial!=0 means a freshly forked task (START_DEBIT path), @initial==0 is the wakeup path. */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; vruntime = cfs_rq->min_vruntime; /* baseline: the queue's minimum vruntime */ if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); if (last) { vruntime += last->vruntime; vruntime >>= 1; } } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) vruntime += sched_vslice(cfs_rq)/2; /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a * little, place the new task so that it fits in the slot that * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se)) vruntime -= sysctl_sched_latency; /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); } se->vruntime = vruntime; }其中的if else if语句块在默认情况下是都不成立的,不成立的原因很简单,初始化时就这么写的,这里就不啰嗦了。 其中
if (initial && sched_feat(START_DEBIT))在默认情况下还是成立的,所以进去看一下
/* vslice computed as if @se were already on the queue: include its weight and count one extra runnable task. */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, cfs_rq->nr_running + 1); }注释会对我们有帮助~
/* * We calculate the vruntime slice. * * vs = s/w = p/rw */ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { u64 vslice = __sched_period(nr_running); /* wall-clock period for nr_running tasks */ vslice *= NICE_0_LOAD; /* scale by NICE_0_LOAD / rq_weight to convert into virtual time */ do_div(vslice, rq_weight); return vslice; }再进入到__sched_period中去
/* * The idea is to set a period in which each task runs once. * * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch * this period because otherwise the slices get too small. * * p = (nr <= nl) ? l : l*nr/nl */ static u64 __sched_period(unsigned long nr_running) { u64 period = sysctl_sched_latency; unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { period *= nr_running; /* stretch: period = latency * nr_running / nr_latency */ do_div(period, nr_latency); } return period; }注释里写的已经很清楚了,如果进程太多的话,一个调度周期就被分的太细,这样纯粹调度花费的时间所占的比重就已经很大,所以要适当把调度周期放大些,放大是根据进程数量按比例来的。 然后根据新调整的调度周期,将新建进程的虚拟运行时间增加一点(个人感觉这也是考虑到如果不增加的话,如果一个进程不断的fork子进程,会造成CPU垄断) 由于在调用place_entity时最后一个参数为1,所以if(!initial)不成立,直接将计算出来的虚拟时间赋给p的虚拟运行时间。 这便是place_entity完成的功能。 主要就是给当前进程分配一个初始的虚拟运行时间,虽然他还没有运行,就像它的名字,place_entity,给新进程找到一个合适的地方,以便接下来将其放入红黑树。 接下来再回到task_new_fair中去,place_entity执行完毕后进入一个if条件语句,当满足以下条件时该语句块会被执行: 1)系统定义了创建进程后子进程先执行 2)当前进程所在的CPU就是子进程所被分配到的CPU 3)curr不为空(注释中说明是如果父子进程不属于同一个调度组时curr可能为空,这个在后面会研究) 4)curr的虚拟运行时间小于新进程的虚拟运行时间 如果满足这几个条件,会将父子进程的虚拟运行时间交换,为什么要交换呢? 很明显,条件中说,子进程要先运行,而此时子进程的虚拟运行时间又比父进程的运行时间大,调度器每次都会选一个虚拟运行时间最小的进程来运行,所以当然要将他们两个交换了。 这一切准备好之后(主要是记录更新了当前进程的虚拟运行时间,给新创建的子进程增加了适当的虚拟运行时间),便可以将新创建的进程插入红黑树以待调度了。于是在task_new_fair里就进入了enqueue_task_fair函数。
/* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { if (se->on_rq) break; /* stop once an already-queued entity is reached */ cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } }如果在有组调度的情况下,这个函数不仅会将当前子进程加入红黑树,具体我们后面再分析。总之,它的主要动作就是调用enqueue_entity:
/* Update stats and insert @se into this cfs_rq's rb-tree; the currently running entity is kept out of the tree. */ static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); if (wakeup) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); /* curr is not kept in the tree, so skip it */ if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); account_entity_enqueue(cfs_rq, se); }在做了一些必要的更新信息后,主要动作是调用__enqueue_entity
/* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* stays 1 only if we never descend to the right */ /* * Find the right place in the rbtree: */ while (*link) { parent = *link; entry = rb_entry(parent, struct sched_entity, run_node); /* * We dont care about collisions. Nodes with * the same key stay together. */ if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = 0; } } /* * Maintain a cache of leftmost tree entries (it is frequently * used): */ if (leftmost) cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); }这个函数很容易读懂,先是找到要插入的位置,然后将其插入,(如果插入位置是最左边的叶子结点,那么需要更新缓冲),然后链入结点,并修改结点颜色以保证红黑树的性质,以防子树高度过于不平衡。具体算法可参见算法导论,很容易懂。不过在红黑树的结点表示上,这里用到了一个技巧: 把所有结点的地址按4字节对齐,这样,每个节点的首地址最后两个比特位肯定为零,由于红黑树的颜色状态只有两个,要么红要么黑,所以编码者就利用了最低位来表示当前节点的颜色。这种用内存的方式~~ 将进程插入红黑树后,就可以通过内核在系统调用返回用户空间时进行一次调度了。
enqueue_task_fair(rq, p, 0); resched_task(rq->curr);进入resched_task中去:
/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ #ifdef CONFIG_SMP #ifndef tsk_is_polling #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif static void resched_task(struct task_struct *p) { int cpu; assert_spin_locked(&task_rq(p)->lock); if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return; /* already marked, nothing to do */ set_tsk_thread_flag(p, TIF_NEED_RESCHED); cpu = task_cpu(p); if (cpu == smp_processor_id()) return; /* local CPU: no cross-CPU kick needed */ /* NEED_RESCHED must be visible before we test polling */ smp_mb(); if (!tsk_is_polling(p)) smp_send_reschedule(cpu); }注意函数并不加锁,而是用assert_spin_locked断言调用者已经持有了该任务所在运行队列的锁。
if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) return;这一句很有意思,如果当前进程的TIF_NEED_RESCHED标志已经置位,那么便可以直接返回了,test_tsk_thread_flag是怎么工作的呢?一步步往下深入:
/* Test a thread flag of @tsk by delegating to its thread_info. */ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); }
/* Test bit @flag in the thread_info flags word. */ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag,&ti->flags); }
#define test_bit(nr,addr) (__builtin_constant_p(nr) ? constant_test_bit((nr),(addr)) : variable_test_bit((nr),(addr)))当前,flag标志应该是一个变量,(谁说的C语言里面没有多态?),接着往里面走:
/* Test bit @nr at @addr: btl copies the bit into CF, then sbbl computes oldbit = oldbit - oldbit - CF, i.e. 0 or -1. */ static inline int variable_test_bit(int nr, const volatile unsigned long * addr) { int oldbit; __asm__ __volatile__( "btl %2,%1\n\tsbbl %0,%0" :"=r" (oldbit) :"m" (ADDR),"Ir" (nr)); return oldbit; }汇编语句。 btl的功能是测试某个数的特定位是零还是1,测试结果放到CF标志位里。(具体可见intel的用户手册).然后,sbbl %0, %0,也就是说,让oldbit与oldbit的值带位相减,即oldbit-oldbit-CF,如果CF标志位是零,也就是刚才位测试的结果是零的话,当然最后返回的oldbit也是零了,如果CF标志位是1,那么返回的就不一样了。返回的值为-1,多么巧妙。 如果经测试,TIF_NEED_RESCHED还没有置位,那么就将其置位。
set_tsk_thread_flag(p, TIF_NEED_RESCHED);
/* Set a thread flag of @tsk by delegating to its thread_info. */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); }
/* Set bit @flag in the thread_info flags word. */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag,&ti->flags); }这里所有的涉及平台相关的汇编指令全是以Intel x86为例
/* Atomically set bit @nr at @addr: a lock-prefixed btsl on the containing word. */ static inline void set_bit(int nr, volatile unsigned long * addr) { __asm__ __volatile__( LOCK_PREFIX "btsl %1,%0" :"+m" (ADDR) :"Ir" (nr)); }btsl就不说了,这个指令的功能就是置位,而且是原子的。 置好位后,resched_task的任务只剩下对多处理器存在的情况下任务的处理了,先留着,把整个调度过程走一遍再回过头来分析。