linux rcu
RCU(Read-Copy Update)是一种同步机制,通过保存对象的多个副本来保障读操作的连续性,并保证在预定的读方临界区没有完成之前不会释放这个对象。传统的同步机制如spin lock,semaphore,rwlock等,并发线程不区分读写线程,或者并发线程允许同时读,但是读的时候不允许更新。RCU与这些机制最大的区别是允许在更新的同时读数据。RCU允许同时有一个更新线程和多个读线程并发;
RCU是如何做到这一点的呢?RCU把更新操作分解为removal和reclamation两个阶段。在removal阶段,更新线程删除(或替换)对该数据结构的引用;因为CPU对单个对齐指针的写入操作是原子的,所以这一步可以与读线程并发执行。reclamation阶段必须等到removal完成之前就已经进入临界区的所有读线程结束后,才可以回收该数据结构;removal完成之后才开始的读线程看到的是更新后的数据结构,不可能再引用旧数据,因此只需要等待removal完成时已经存在的那些读线程;
RCU实现过程主要解决以下2个问题:
1. 在一个读线程遍历链表的过程中,另外一个更新线程对链表进行插入操作,RCU需要保证读线程要么能看见已经完整初始化的新节点,要么完全看不见新节点;
2. 读线程读取了某个链表节点,更新线程可以从链表中删除这个节点,但是不能直接回收这个节点,必须等到所有的读线程完成后才进行回收操作;
经典RCU由三个基本机制组成:Publish-Subscribe Mechanism,Waiting for All Pre-existing RCU Readers to Complete,Maintain Multiple Versions of Recently Updated Objects;
Publish-Subscribe Mechanism
订阅发布机制就是能够并发插入链表的能力,允许即使链表正被修改,读线程也可以安全的遍历链表;考虑以下例子:
/* Example payload that is handed to readers through the global pointer gp. */
struct foo {
	int a;
	int b;
	int c;
};

/* Global pointer that readers load; NULL until an element is published. */
struct foo *gp = NULL;

/*
 * Updater: allocate an element, fill in its fields, then make it reachable.
 * NOTE(review): the kmalloc() result is used without a NULL check.
 */
p = kmalloc(sizeof(*p), GFP_KERNEL);
p->a = 1;
p->b = 2;
p->c = 3;
/* Plain store: nothing in this sequence orders it after the field writes. */
gp = p;
上面的代码并不能保证最后4条语句按书写顺序生效:编译器和CPU都可能对这些赋值重新排序,读线程因此可能先看到gp已经指向新节点,而节点的成员却还没有完成初始化。
RCU提供了rcu_assign_pointer用于发布新的数据结构;上面的代码就可以修改为
/* Updater: initialise every field first ... */
p->a = 1;
p->b = 2;
p->c = 3;
/* ... then publish the element through gp with the RCU publish primitive. */
rcu_assign_pointer(gp, p);
rcu_assign_pointer封装了内存屏障,用于保证操作的顺序;
读线程考虑以下代码:
/* Reader: take a snapshot of gp with a plain load and use it if non-NULL. */
p = gp;
if (p != NULL) {
	do_something_with(p->a, p->b, p->c);
}
看上去好像不会有执行顺序问题,但是某些架构的CPU及其编译器可能会在取p的值之前直接取p的成员。编译器会先猜测p的值,然后取p的成员内容,然后才去取p的真实值来检查之前的猜测是否正确;
RCU提供了rcu_dereference用于订阅其他线程发布的值;
/**
 * rcu_dereference - fetch an RCU-protected pointer in an
 * RCU read-side critical section.  This pointer may later
 * be safely dereferenced.
 * @p: expression yielding the RCU-protected pointer; evaluated exactly once.
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), and, more importantly, documents
 * exactly which pointers are protected by RCU.
 *
 * The pointer is copied into a local before smp_read_barrier_depends()
 * is issued, and that local copy is the value of the whole expression.
 * Every continuation backslash must be the last character on its line.
 */
#define rcu_dereference(p)	({ \
				typeof(p) _________p1 = (p); \
				smp_read_barrier_depends(); \
				(_________p1); \
				})
读线程的代码就可以修改为
/* Reader: the whole access sits between rcu_read_lock() and rcu_read_unlock(). */
rcu_read_lock();
/* Fetch gp through rcu_dereference() before touching any field. */
p = rcu_dereference(gp);
if (p != NULL) {
	do_something_with(p->a, p->b, p->c);
}
rcu_read_unlock();
Waiting for All Pre-existing RCU Readers to Complete:RCU把所有已存在的读线程完成的这段时间称为grace period,如下图所示:
从图上可以看出,grace period从removal阶段发布新的指针开始,一直持续到所有已存在的读线程都结束对旧版本节点的引用为止,之后才能开始reclamation;图中有4个读线程引用了旧版本的数据,因此reclamation阶段必须等到这4个读线程完成后才可以开始;另外grace period开始后才进入临界区的读线程看到的是更新后的节点,因此grace period可以忽略这些读线程;
Linux中使用了一个小技巧来判断读线程是否已经完成对旧版本数据结构的引用:经典RCU的读方临界区内不允许阻塞或睡眠,因此只要某个CPU完成了一次上下文切换,就说明此前在该CPU上运行的读方临界区已经退出。也就是说,当所有CPU都至少完成了一次上下文切换之后,所有已存在的读线程都已经安全地从临界区退出,此时可以安全地释放旧版本的数据。CPU完成一次上下文切换也称为经历了一个quiescent state。
Maintain Multiple Versions of Recently Updated Objects:对于RCU保护的数据,同时对数据结构进行读和更新时,RCU的此项能力保证读线程可以看到不同版本的数据结构,而不是部分更新的数据;
以下分析Linux kernel中RCU的实现。
1. 初始化:
/* CPU notifier block whose callback is rcu_cpu_notify(). */
static struct notifier_block __cpuinitdata rcu_nb = {
	.notifier_call = rcu_cpu_notify,
};
/*
 * Set up the RCU machinery.  Expected to run early in boot, i.e. before
 * the local timer (SMP) or the jiffies timer (uniprocessor) is set up.
 * rcu_qsctr and friends need no explicit initialisation because
 * RCU_CTR_INVALID was chosen to be ``0''.
 */
void __init rcu_init(void)
{
	void *this_cpu = (void *)(long)smp_processor_id();

	/* Deliver CPU_UP_PREPARE by hand for the CPU we are running on ... */
	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, this_cpu);

	/* ... and register the notifier for the non-boot CPUs. */
	register_cpu_notifier(&rcu_nb);
}
为了支持CPU热插拔,注册了一个CPU事件的回调;对于当前已经在运行的启动CPU,则直接以CPU_UP_PREPARE事件调用rcu_cpu_notify
/*
 * CPU notifier callback.
 * @self:   the notifier block (unused here)
 * @action: CPU event code
 * @hcpu:   CPU number carried in a pointer-sized value
 *
 * CPU_UP_PREPARE initialises the CPU's RCU state, CPU_DEAD tears it down;
 * every other event is ignored.  Always returns NOTIFY_OK.
 */
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	if (action == CPU_UP_PREPARE)
		rcu_online_cpu(cpu);
	else if (action == CPU_DEAD)
		rcu_offline_cpu(cpu);

	return NOTIFY_OK;
}
rcu_online_cpu中对每个CPU的rcu_data进行了初始化
/*
 * Prepare the RCU state of @cpu: initialise its rcu_data against
 * rcu_ctrlblk, its rcu_bh_data against rcu_bh_ctrlblk, and set up its
 * rcu_tasklet with rcu_process_callbacks() as the handler.
 */
static void __devinit rcu_online_cpu(int cpu)
{
	rcu_init_percpu_data(cpu, &rcu_ctrlblk, &per_cpu(rcu_data, cpu));
	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
这里顺便插入per_cpu的实现分析:
/*
 * Per-CPU offset table: entry i is the distance from the original per-CPU
 * section (__per_cpu_start) to the private copy made for CPU i.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;

EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Give every possible CPU its own copy of the per-CPU section.
 *
 * One bootmem allocation holds all the copies back to back.  Each copy is
 * `size` bytes: the section length rounded up to SMP_CACHE_BYTES, and at
 * least PERCPU_ENOUGH_ROOM when CONFIG_MODULES is set.  The preprocessor
 * conditionals must start their own lines to be recognised as directives.
 */
static void __init setup_per_cpu_areas(void)
{
	unsigned long size, i;
	char *ptr;
	unsigned long nr_possible_cpus = num_possible_cpus();

	/* Copy section for each CPU (we discard the original) */
	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
#ifdef CONFIG_MODULES
	if (size < PERCPU_ENOUGH_ROOM)
		size = PERCPU_ENOUGH_ROOM;
#endif
	ptr = alloc_bootmem(size * nr_possible_cpus);

	for_each_possible_cpu(i) {
		/* Record where this CPU's copy lives relative to the original. */
		__per_cpu_offset[i] = ptr - __per_cpu_start;
		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
		ptr += size;
	}
}
上述代码在kernel初始化过程中调用,首先分配一段内存,然后把.data..percpu段中的数据为每个CPU都拷贝一份数据,并把每个CPU引用自己的那一段副本的地址偏移记录下来;
因此后面每个CPU就可以通过该偏移地址来找到对应自己的那份副本
/*
 * RELOC_HIDE - yield @ptr displaced by @off bytes, keeping @ptr's type.
 * The addition is done on an unsigned long copy of the pointer instead of
 * with pointer arithmetic.  Each continuation backslash must be the last
 * character on its line.
 */
#define RELOC_HIDE(ptr, off)					\
  ({ unsigned long __ptr;					\
     __ptr = (unsigned long) (ptr);				\
    (typeof(ptr)) (__ptr + (off)); })

/*
 * per_cpu - lvalue for CPU @cpu's copy of the per-CPU variable @var.
 * Takes the address of the original per_cpu__<var> object, displaces it by
 * __per_cpu_offset[cpu] and dereferences the result.  The extern
 * declaration of simple_identifier_<var>() generates no code; presumably
 * it exists so that a @var that is not a plain identifier fails to compile.
 */
#define per_cpu(var, cpu) (*({				\
	extern int simple_identifier_##var(void);	\
	RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
回到rcu_init_percpu_data中,
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); rdp->curtail = &rdp->curlist; rdp->nxttail = &rdp->nxtlist; rdp->donetail = &rdp->donelist; /* 相等表示当前CPU无需等待quiescent state */ rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; }
其中第二个参数是全局rcu控制块,结构如下:
/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
	long	cur;		/* Current batch number.                      */
	long	completed;	/* Number of the last completed batch         */
	int	next_pending;	/* Is the next batch already waiting?         */

	int	signaled;	/* Reset to 0 by rcu_start_batch() whenever   */
				/* a new batch is started.                    */

	spinlock_t	lock	____cacheline_internodealigned_in_smp;
	cpumask_t	cpumask; /* CPUs that need to switch in order    */
				 /* for current batch to proceed.        */
} ____cacheline_internodealigned_in_smp;
rcu_data结构如下:
/*
 * Per-CPU data for Read-Copy UPdate.
 * nxtlist - new callbacks are added here
 * curlist - current batch for which quiescent cycle started if any
 * donelist - callbacks moved off curlist once their batch has completed;
 *            rcu_do_batch() invokes them from here
 */
struct rcu_data {
	/* 1) quiescent state handling : */
	long		quiescbatch;	 /* Batch # for grace period: the     */
					 /* grace period this CPU is waiting  */
					 /* on                                */
	int		passed_quiesc;	 /* User-mode/idle loop etc.: set once */
					 /* at least one quiescent state has   */
					 /* been passed                        */
	int		qs_pending;	 /* core waits for quiesc state */

	/* 2) batch handling */
	long		batch;		 /* Batch # for current RCU batch:    */
					 /* the grace period the callbacks on */
					 /* curlist are waiting for           */
	struct rcu_head *nxtlist;
	struct rcu_head **nxttail;
	long		qlen;		 /* # of queued callbacks */
	struct rcu_head *curlist;
	struct rcu_head **curtail;
	struct rcu_head *donelist;
	struct rcu_head **donetail;
	long		blimit;		 /* Upper limit on a processed batch */
	int cpu;
	struct rcu_head barrier;
};
/*
 * Two per-CPU rcu_data instances: rcu_online_cpu() initialises rcu_data
 * against rcu_ctrlblk and rcu_bh_data against rcu_bh_ctrlblk.
 */
DECLARE_PER_CPU(struct rcu_data, rcu_data);
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
一次线程调度表明该CPU已经经历了一次quiescent state, 在进程调度schedule中会调用rcu_qsctr_inc把rdp->passed_quiesc置为1。
/*
 * Note a quiescent state for @cpu.
 * Despite the name this is not a real counter: nobody needs to know how
 * many quiescent states went by, only whether at least one occurred since
 * the grace period began, so a single flag is enough.
 */
static inline void rcu_qsctr_inc(int cpu)
{
	per_cpu(rcu_data, cpu).passed_quiesc = 1;
}
另外在每次时钟中断都会检查是否有RCU相关工作需要处理
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 *
 * After the accounting, each tick runs the local timers, polls RCU for
 * work on this CPU, drives the scheduler tick and runs the POSIX CPU
 * timers, in that order.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	if (user_tick)
		account_user_time(p, jiffies_to_cputime(1));
	else
		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
	run_local_timers();
	/* Enter the RCU code only when rcu_pending() reports work for this CPU. */
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);
	scheduler_tick();
	run_posix_cpu_timers(p);
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* This cpu has pending rcu entries and the grace period * for them has completed. */ /* 已经完成的grace period号大于等于当前CPU等待的grace period号 */ if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) return 1; /* This cpu has no pending entries, but there are new entries */ /* 上一个等待已完成,有新的call_rcu调用 */ if (!rdp->curlist && rdp->nxtlist) return 1; /* This cpu has finished callbacks to invoke */ /* reclaimation阶段,等待已完成,调用其回调函数 */ if (rdp->donelist) return 1; /* The rcu core waits for a quiescent state from the cpu */ /* 当前CPU已经进入grace period在等待quiescent state */ if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) return 1; /* nothing to do */ return 0; } /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, returning 1 if so. This function is part of the * RCU implementation; it is -not- an exported member of the RCU API. */ int rcu_pending(int cpu) { return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); }
如果rcu_pending返回1,则进入rcu_check_callbacks,检查当前CPU是否已经通过一次quiescent state,并调度rcu_tasklet,由tasklet的处理函数rcu_process_callbacks完成后续处理
/*
 * Per-tick RCU hook, called by update_process_times() when rcu_pending()
 * reports work.
 * @cpu:  CPU the tick arrived on
 * @user: non-zero if the tick is user time
 *
 * Records a quiescent state where the interrupted context allows it, then
 * schedules this CPU's rcu_tasklet to do the actual processing.
 */
void rcu_check_callbacks(int cpu, int user)
{
	/*
	 * Quiescent state for both flavours if the tick is user time, or if
	 * the CPU is idle, no softirq is in progress and hardirq_count() is
	 * no more than one level (presumably just this timer interrupt,
	 * i.e. it did not nest inside another hardirq).
	 */
	if (user ||
	    (idle_cpu(cpu) && !in_softirq() &&
				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
		rcu_qsctr_inc(cpu);
		rcu_bh_qsctr_inc(cpu);
	} else if (!in_softirq())
		/* Otherwise, outside softirq only the bh flavour is credited. */
		rcu_bh_qsctr_inc(cpu);
	tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
}
/*
 * This does the RCU processing work from tasklet context.
 *
 * Moves callbacks along nxtlist -> curlist -> donelist as batches start
 * and complete, updates this CPU's quiescent-state bookkeeping, and
 * finally invokes whatever is on donelist.
 */
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
					struct rcu_data *rdp)
{
	/* curlist is non-empty and the batch it was waiting for has completed. */
	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
		/* Append curlist to donelist: these callbacks may now be invoked. */
		*rdp->donetail = rdp->curlist;
		rdp->donetail = rdp->curtail;
		rdp->curlist = NULL;
		rdp->curtail = &rdp->curlist;
	}

	/* Nothing is waiting any more and new callbacks have been queued. */
	if (rdp->nxtlist && !rdp->curlist) {
		/*
		 * Turn the newly queued callbacks into the waiting batch.
		 * The list hand-over is done with local interrupts disabled.
		 */
		local_irq_disable();
		rdp->curlist = rdp->nxtlist;
		rdp->curtail = rdp->nxttail;
		rdp->nxtlist = NULL;
		rdp->nxttail = &rdp->nxtlist;
		local_irq_enable();

		/*
		 * start the next batch of callbacks
		 */

		/*
		 * These callbacks wait for the batch after the one that is
		 * current right now.
		 */
		/* determine batch number */
		rdp->batch = rcp->cur + 1;
		/* see the comment and corresponding wmb() in
		 * the rcu_start_batch()
		 */
		smp_rmb();

		/*
		 * If next_pending is already set, a next batch has been
		 * requested already; this CPU then simply waits and later
		 * compares batch numbers to learn when its batch is done.
		 */
		if (!rcp->next_pending) {
			/* and start it/schedule start if it's a new batch */
			spin_lock(&rcp->lock);
			/* Set under the lock to request the next batch. */
			rcp->next_pending = 1;
			rcu_start_batch(rcp);
			spin_unlock(&rcp->lock);
		}
	}

	rcu_check_quiescent_state(rcp, rdp);
	if (rdp->donelist)
		rcu_do_batch(rdp);
}
/*
 * Grace period handling:
 * The grace period handling consists out of two steps:
 * - A new grace period is started.
 *   This is done by rcu_start_batch. The start is not broadcasted to
 *   all cpus, they must pick this up by comparing rcp->cur with
 *   rdp->quiescbatch. All cpus are recorded in the
 *   rcu_ctrlblk.cpumask bitmap.
 * - All cpus must go through a quiescent state.
 *   Since the start of the grace period is not broadcasted, at least two
 *   calls to rcu_check_quiescent_state are required:
 *   The first call just notices that a new grace period is running. The
 *   following calls check if there was a quiescent state since the beginning
 *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
 *   the bitmap is empty, then the grace period is completed.
 *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
 *   period (if necessary).
 */
/*
 * Register a new batch of callbacks, and start it up if there is currently no
 * active batch and the batch to be registered has not already occurred.
 * Caller must hold rcu_ctrlblk.lock.
 */
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
	/*
	 * Start only if a next batch has been requested and completed has
	 * caught up with cur, i.e. no batch is currently being waited for.
	 */
	if (rcp->next_pending &&
			rcp->completed == rcp->cur) {
		rcp->next_pending = 0;
		/*
		 * next_pending == 0 must be visible in
		 * __rcu_process_callbacks() before it can see new value of cur.
		 */
		smp_wmb();
		/* Start the new batch: advance the current batch number. */
		rcp->cur++;

		/*
		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
		 * Barrier  Otherwise it can cause tickless idle CPUs to be
		 * included in rcp->cpumask, which will extend graceperiods
		 * unnecessarily.
		 */
		smp_mb();
		/*
		 * cpumask = online CPUs that are not in nohz_cpu_mask: the
		 * CPUs that still have to pass through a quiescent state.
		 */
		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);

		rcp->signaled = 0;
	}
}
/*
 * Check if the cpu has gone through a quiescent state (say context
 * switch). If so and if it already hasn't done so in this RCU
 * quiescent cycle, then indicate that it has done so.
 */
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
					struct rcu_data *rdp)
{
	/*
	 * A batch has started that this CPU has not seen yet: adopt it,
	 * mark a quiescent state as pending and forget any earlier one.
	 */
	if (rdp->quiescbatch != rcp->cur) {
		/* start new grace period: */
		rdp->qs_pending = 1;
		rdp->passed_quiesc = 0;
		rdp->quiescbatch = rcp->cur;
		return;
	}

	/* Grace period already completed for this cpu?
	 * qs_pending is checked instead of the actual bitmap to avoid
	 * cacheline trashing.
	 */
	/* qs_pending clear: this CPU owes nothing for the current batch. */
	if (!rdp->qs_pending)
		return;

	/*
	 * Was there a quiescent state since the beginning of the grace
	 * period? If no, then exit and wait for the next call.
	 */
	/* No quiescent state recorded yet; try again on a later call. */
	if (!rdp->passed_quiesc)
		return;
	/* A quiescent state was recorded: this CPU's obligation is met. */
	rdp->qs_pending = 0;

	spin_lock(&rcp->lock);
	/*
	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
	 * during cpu startup. Ignore the quiescent state.
	 */
	/* Report it only if it belongs to the batch that is still current. */
	if (likely(rdp->quiescbatch == rcp->cur))
		cpu_quiet(rdp->cpu, rcp);

	spin_unlock(&rcp->lock);
}
最后执行donelist中的reclamation
/*
 * Invoke the completed RCU callbacks. They are expected to be in
 * a per-cpu list.
 *
 * At most rdp->blimit callbacks are invoked per call; if any remain on
 * donelist afterwards, the CPU's rcu_tasklet is scheduled again.
 */
static void rcu_do_batch(struct rcu_data *rdp)
{
	struct rcu_head *next, *list;
	int count = 0;

	list = rdp->donelist;
	while (list) {
		/* Read the successor before invoking the callback. */
		next = list->next;
		prefetch(next);
		list->func(list);
		list = next;
		/* Cap the work done in one go so a long list cannot hog the CPU. */
		if (++count >= rdp->blimit)
			break;
	}
	rdp->donelist = list;

	/* qlen is adjusted with local interrupts disabled. */
	local_irq_disable();
	rdp->qlen -= count;
	local_irq_enable();
	/* Restore the default limit once an unlimited run has drained the queue to qlowmark. */
	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
		rdp->blimit = blimit;

	if (!rdp->donelist)
		rdp->donetail = &rdp->donelist;
	else
		tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
}