futex系统调用分析

futex - fast user-space locking

futex是一个内核态和用户态共同参与实现的锁。它基于一个观察,大多数情况下可能并没有锁的争抢,所以没有必要每次都陷入内核态,可以首先在用户态查询一下锁是否被其他进程/线程占用,如果没有占用可直接返回,无需调用futex syscall。用户态这一部分实现比较简单,可以简单理解为使用原子操作读写比较一个共享变量,然后决定下一步的操作。内核态较为复杂,我们下面分析一下内核态的实现。在深入到内核代码之前,我们先了解一下如何使用它。

下面是futex的接口签名。

       /* Userspace futex() prototype; the 4th argument is a timeout or, alternatively, a uint32_t val2. */
       long futex(uint32_t *uaddr, int futex_op, uint32_t val,
                 const struct timespec *timeout,   /* or: uint32_t val2 */
                 uint32_t *uaddr2, uint32_t val3);

uaddr表示futex会在该地址等待或唤醒等待在该地址的进程/线程,对于多进程必须使用共享内存,对于多线程就简单一些。futex_op表示各种操作,比如等待,唤醒,val是跟op相关的值,对于op为FUTEX_WAIT,只有uaddr指向的地址的值等于val才会等待,否则立即返回。对于FUTEX_WAKE,val表示唤醒进程/线程的个数。timeout表示超时时间,也就是唤醒有两种情形,一种是被其他进程/线程主动唤醒,另一种是超时后自动醒来。这里有一篇文章提供了一个例子可以参考:https://eli.thegreenplace.net/2018/basics-of-futexes/

下面我们进入主题,看一下kernel的实现。

futex syscall。

/*
 * futex() system call entry point (abridged: the "..." line stands for
 * code omitted from this excerpt). The visible part forwards the user
 * arguments to do_futex().
 * NOTE(review): `tp` is not defined in the visible lines; presumably it is
 * the converted timeout prepared in the omitted part - confirm against the
 * kernel source.
 */
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                const struct __kernel_timespec __user *, utime,
                u32 __user *, uaddr2, u32, val3)
{
...
        /* The raw utime pointer value is also passed in do_futex()'s val2 slot. */
        return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}

在用户态调用futex后系统调用函数会调用do_futex完成具体任务。

/*
 * Dispatch a futex operation to its handler (abridged: "..." marks code
 * omitted from this excerpt, including the remaining switch cases and the
 * closing brace of the switch).
 *
 * Only the wait/wake commands are shown. The plain FUTEX_WAIT and
 * FUTEX_WAKE commands are handled as their *_BITSET variants with the
 * bitset forced to FUTEX_BITSET_MATCH_ANY.
 *
 * Returns the handler's result, or -ENOSYS on the fall-out path shown at
 * the end.
 */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        u32 __user *uaddr2, u32 val2, u32 val3)
{
    int cmd = op & FUTEX_CMD_MASK;   /* command part of op, selected by FUTEX_CMD_MASK */
    unsigned int flags = 0;
...
    switch (cmd) {
    case FUTEX_WAIT:
        /* Plain wait: override the caller's val3 with the match-any bitset. */
        val3 = FUTEX_BITSET_MATCH_ANY;
        fallthrough;
    case FUTEX_WAIT_BITSET:
        return futex_wait(uaddr, flags, val, timeout, val3);
    case FUTEX_WAKE:
        /* Plain wake: override the caller's val3 with the match-any bitset. */
        val3 = FUTEX_BITSET_MATCH_ANY;
        fallthrough;
    case FUTEX_WAKE_BITSET:
        return futex_wake(uaddr, flags, val, val3);
    ...
    return -ENOSYS;
}

这里处理所有的op,本文只分析wake和wait。

先来分析wait。

/*
 * Wait on the futex at uaddr (abridged: the trailing "..." stands for the
 * post-wakeup handling, including the `out` label, omitted from this
 * excerpt).
 *
 * Visible steps:
 *   1. reject an empty bitset and record it in the local futex_q;
 *   2. prepare the timeout timer via futex_setup_timer();
 *   3. futex_wait_setup(): on success hb->lock is held and q is initialized;
 *   4. futex_wait_queue(): enqueue q and sleep until wakeup, timeout or
 *      signal.
 *
 * Returns -EINVAL when bitset is 0; other return paths are in the omitted
 * part.
 */
int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
    struct hrtimer_sleeper timeout, *to;
    struct restart_block *restart;
    struct futex_hash_bucket *hb;
    struct futex_q q = futex_q_init;
    int ret;

    /* A wait with no bits set is rejected. */
    if (!bitset)
        return -EINVAL;
    q.bitset = bitset;

    /*
     * Prepare (but do not start) the timer; it is started later in
     * futex_wait_queue(). NOTE(review): `to` is presumably NULL when
     * abs_time is NULL - confirm in futex_setup_timer().
     */
    to = futex_setup_timer(abs_time, &timeout, flags,
                   current->timer_slack_ns);
retry:
    /*
     * Prepare to wait on uaddr. On success, it holds hb->lock and q
     * is initialized.
     */
    ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
    if (ret)
        goto out;

    /* futex_queue and wait for wakeup, timeout, or a signal. */
    futex_wait_queue(hb, &q, to);
...
}

第一步,设置timer,如果timeout不为空;

第二步,获取futex_q和futex_hash_bucket,为在uaddr上等待做准备;

第三步,在futex queue上等待;

其余是唤醒后的操作。

先看一下futex_q和futex_hash_bucket结构。

// One entry in a hashed futex wait queue; there is one per waiting task.
struct futex_q {
    struct plist_node list;  // priority-sorted list node; linked onto the hash bucket's chain

    struct task_struct *task;   // the task this entry belongs to
    spinlock_t *lock_ptr;  // points at the hash bucket's lock
    union futex_key key;   // key identifying the futex this entry waits on
    struct futex_pi_state *pi_state;
    struct rt_mutex_waiter *rt_waiter;
    union futex_key *requeue_pi_key;
    u32 bitset;    // matched against the waker's bitset in futex_wake()
    atomic_t requeue_state;
#ifdef CONFIG_PREEMPT_RT
    struct rcuwait requeue_wait;
#endif
} __randomize_layout;

在一个futex上等待的task可以有多个,每个task都有一个futex_q与之对应,他们组成一个链表,每个链表在哈希桶中有一个key表示。

看一下futex_key结构。

/*
 * Identifies the location of a futex. Each member is a three-field tuple;
 * which one is used depends on whether the futex lives in a private or a
 * shared mapping.
 */
union futex_key {
    /* Shared mapping: per the walkthrough, {inode->i_sequence, page->index, offset within page}. */
    struct {
        u64 i_seq;
        unsigned long pgoff;
        unsigned int offset;
    } shared;
    /* Private mapping: per the walkthrough, {current->mm, address, 0}. */
    struct {
        union {
            struct mm_struct *mm;
            u64 __tmp;
        };
        unsigned long address;
        unsigned int offset;
    } private;
    /* Layout-neutral view of the same three fields; futex_hash() reads both.offset. */
    struct {
        u64 ptr;
        unsigned long word;
        unsigned int offset;
    } both;
};

这是一个union,用来标识futex的位置,根据futex所在位置属于私有还是共享来选择使用哪一项。每一项可以看成是表示位置的三元组。

// All futex keys that hash to the same slot share one bucket. A key may have
// several futex_q entries, each representing one task waiting on that futex.
struct futex_hash_bucket {
    atomic_t waiters;    // number of waiters
    spinlock_t lock;     // protects the chain; taken in futex_q_lock() and futex_wake()
    struct plist_head chain;  // head of the futex_q list
} ____cacheline_aligned_in_smp;

 

futex_wait_setup会准备好futex等待队列或将uaddr加入等待队列。 

/*
 * Prepare to wait on uaddr (abridged: "..." marks code omitted from this
 * excerpt, including local declarations and error handling).
 *
 * Visible steps: compute the futex key into q->key, look up and lock the
 * hash bucket via futex_q_lock(), then read the current futex value.
 * If that value differs from val, the bucket is unlocked again and
 * -EWOULDBLOCK is returned; otherwise the function returns with the
 * bucket still locked and *hb pointing at it.
 */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
             struct futex_q *q, struct futex_hash_bucket **hb)
{
...
retry:
    /* Derive the key for uaddr; FLAGS_SHARED selects shared vs. private handling. */
    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);

retry_private:
    /* Find the bucket for q->key and take its lock. */
    *hb = futex_q_lock(q);

    /* Read the user-space futex word while holding the bucket lock. */
    ret = futex_get_value_locked(&uval, uaddr);
...
	/* The futex word no longer holds the expected value: do not sleep. */
	if (uval != val) {
		futex_q_unlock(*hb);
		ret = -EWOULDBLOCK;
	}

	return ret;
}

第一步针对uaddr获取一个futex_key;

第二步,根据futex_q获取哈希桶hb并加锁;

第三步,获取uaddr指向的值,跟入参val比较,如果不相等就返回-EWOULDBLOCK错误,这也就是在用户态传入参数时如果futex指向的值不等于val就会立即返回的原因。

get_futex_key会根据uaddr是私有还是共享找出表示该位置的三元组,对于私有:key为{current->mm, address, 0},其中address是uaddr所在虚拟页框的地址;对于共享:key为 {inode->i_sequence, page->index, offset within page}。私有映射很容易得到key,共享映射需要复杂的步骤才能得到key,因此线程模型比进程模型在使用futex上更高效。

futex_q_lock根据上面得到的futex_q中的key找到对应的futex_hash_bucket,并对其加锁。

/*
 * Look up the hash bucket for q->key, account the caller as a waiter,
 * record the bucket lock in q->lock_ptr and return with that lock held.
 */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
    __acquires(&hb->lock)
{
    struct futex_hash_bucket *hb;

    hb = futex_hash(&q->key);   // pick the bucket from the global hash table

    /*
     * Increment the counter before taking the lock so that
     * a potential waker won't miss a to-be-slept task that is
     * waiting for the spinlock. This is safe as all futex_q_lock()
     * users end up calling futex_queue(). Similarly, for housekeeping,
     * decrement the counter at futex_q_unlock() when some error has
     * occurred and we don't end up adding the task to the list.
     */
    futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */  // bump the bucket's waiters count

    q->lock_ptr = &hb->lock;

    spin_lock(&hb->lock); // lock the hash bucket
    return hb;
}

这个哈希桶是从全局的futex_queues中找到的。

/*
 * Map a futex key to its bucket in the global futex_queues table.
 *
 * The key is hashed with jhash2() as an array of u32 words covering
 * everything before both.offset, with both.offset supplied as the initial
 * value. The result is reduced with a mask, which relies on
 * futex_hashsize being a power of two (futex_init() sets it to
 * 1UL << futex_shift).
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
    u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
              key->both.offset);

    return &futex_queues[hash & (futex_hashsize - 1)];
}

futex_queues是在futex_init时初始化的。

/*
 * Boot-time initialization of the global futex hash table; registered
 * through core_initcall(). Always returns 0.
 */
static int __init futex_init(void)
{
    unsigned int futex_shift;
    unsigned long i;

    /*
     * Requested table size: 16 buckets when CONFIG_BASE_SMALL is set,
     * otherwise 256 buckets per possible CPU rounded up to a power of two.
     */
#if CONFIG_BASE_SMALL
    futex_hashsize = 16;
#else
    futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

    futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
                           futex_hashsize, 0, 0,
                           &futex_shift, NULL,
                           futex_hashsize, futex_hashsize);
    /*
     * Recompute the size from futex_shift, which is passed by address to
     * the allocator above (presumably filled in with log2 of the actual
     * table size - confirm in alloc_large_system_hash()).
     */
    futex_hashsize = 1UL << futex_shift;

    /* Every bucket starts with no waiters, an empty chain and a fresh lock. */
    for (i = 0; i < futex_hashsize; i++) {
        atomic_set(&futex_queues[i].waiters, 0);
        plist_head_init(&futex_queues[i].chain);
        spin_lock_init(&futex_queues[i].lock);
    }

    return 0;
}
core_initcall(futex_init);

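这个全局futex_queues的项数是在初始化的时候决定的:如果配置了CONFIG_BASE_SMALL就固定为16项;否则按256 * num_possible_cpus()向上取整到2的幂来申请,例如一个拥有256个cpu的机器会申请65536项,最终的futex_hashsize由alloc_large_system_hash返回的futex_shift计算得到。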

现在已经准备好哈希桶和futex_q,可以wait了。

/*
 * Enqueue q on its hash bucket and sleep until woken, timed out or
 * interrupted.
 *
 * @hb:      hash bucket for q; futex_queue() calls spin_unlock() on
 *           completion (see the comment below)
 * @q:       the futex_q to enqueue
 * @timeout: prepared hrtimer sleeper, or NULL when no timeout was requested
 *
 * On return the task state has been set back to TASK_RUNNING.
 */
void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
                struct hrtimer_sleeper *timeout)
{
    /*
     * The task state is guaranteed to be set before another task can
     * wake it. set_current_state() is implemented using smp_store_mb() and
     * futex_queue() calls spin_unlock() upon completion, both serializing
     * access to the hash list and forcing another memory barrier.
     */
    set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);    // mark the current task interruptible and freezable
    futex_queue(q, hb);       // add the futex_q to the wait queue

    /* Arm the timer */
    if (timeout)               // a timeout was requested: start the timer
        hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);

    /*
     * If we have been removed from the hash list, then another task
     * has tried to wake us, and we can skip the call to schedule().
     */
    if (likely(!plist_node_empty(&q->list))) {         // normally we are still queued (nobody woke us yet)
        /*
         * If the timer has already expired, current will already be
         * flagged for rescheduling. Only call schedule if there
         * is no timeout, or if it has yet to expire.
         */
        if (!timeout || timeout->task)
            schedule();
    }
    __set_current_state(TASK_RUNNING); // we are running again: restore the task state to TASK_RUNNING
}

第一步,设置当前进程的state为可中断和可冻结;

第二步,将futex_q加入哈希桶的链上;

第三步,如果设置了超时,开启timer;

第四步,如果现在还没被唤醒,并且没有设置超时或者超时还没到,就主动调度;

第五步,再次醒来,设置本task的state为TASK_RUNNING;

futex_queue会调用__futex_queue将当前futex_q加入哈希桶的队列。

/*
 * Insert q into hb's priority-sorted chain on behalf of the current task
 * and record current as the owner of q.
 */
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
    int prio;

    /*
     * The priority used to register this element is
     * - either the real thread-priority for the real-time threads
     * (i.e. threads with a priority lower than MAX_RT_PRIO)
     * - or MAX_RT_PRIO for non-RT threads.
     * Thus, all RT-threads are woken first in priority order, and
     * the others are woken last, in FIFO order.
     */
    prio = min(current->normal_prio, MAX_RT_PRIO);

    plist_node_init(&q->list, prio);      // initialize q's node in the priority-sorted list
    plist_add(&q->list, &hb->chain);     // link q onto the hash bucket's chain
    q->task = current;
}

futex_wait在进程醒来之后的代码暂时不分析了。至此我们已经了解了futex的等待机制。搞清楚几个数据结构基本就了解了其中的原理,futex_q, futex_hash_bucket, futex_key。画一张图表示一下:

再看一下唤醒的情况。

/* Userspace futex() prototype, shown again for the wake-up discussion. */
long futex(uint32_t *uaddr, int futex_op, uint32_t val,
                 const struct timespec *timeout,   /* or: uint32_t val2 */
                 uint32_t *uaddr2, uint32_t val3);

futex_op为FUTEX_WAKE, val代表要唤醒的进程的个数。其他的参数会被忽略。

当我们在用户态以FUTEX_WAKE作为futex_op调用futex后,内核中的调用链是do_futex->futex_wake。

/*
 * Wake up to nr_wake tasks waiting on the futex at uaddr whose bitset
 * shares at least one bit with @bitset.
 *
 * Returns the number of waiters marked for wakeup, 0 when the bucket has
 * no pending waiters, the get_futex_key() error if the key lookup fails,
 * or -EINVAL when bitset is 0 or a matching waiter has pi_state or
 * rt_waiter set.
 */
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
    struct futex_hash_bucket *hb;
    struct futex_q *this, *next;
    union futex_key key = FUTEX_KEY_INIT;
    int ret;
    DEFINE_WAKE_Q(wake_q);     // local wake queue collecting the tasks to wake

    if (!bitset)
        return -EINVAL;

    ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);    // derive the futex_key for uaddr
    if (unlikely(ret != 0))
        return ret;

    hb = futex_hash(&key);      // find the hash bucket for that key

    /* Make sure we really have tasks to wakeup */
    if (!futex_hb_waiters_pending(hb))      // nobody is waiting: return (ret is 0 here)
        return ret;

    spin_lock(&hb->lock);        // lock the hash bucket

    plist_for_each_entry_safe(this, next, &hb->chain, list) {      // walk every futex_q on the bucket's chain
        if (futex_match (&this->key, &key)) {                      // only entries with the same futex_key qualify
            if (this->pi_state || this->rt_waiter) {
                ret = -EINVAL;
                break;
            }

            /* Check if one of the bits is set in both bitsets */
            if (!(this->bitset & bitset))                       // the bitsets must overlap as well
                continue;

            futex_wake_mark(&wake_q, this);          // take this futex_q off the bucket chain and add its task to wake_q
            if (++ret >= nr_wake)                    // stop once nr_wake waiters have been marked
                break;
        }
    }

    spin_unlock(&hb->lock);
    wake_up_q(&wake_q);                             // the actual wakeups happen here, after the bucket lock is dropped
    return ret;
}

 第一步,获取futex_key和哈希桶;

第二步,在哈希桶上循环找对应futex_key的futex_q,找到后将futex_q从哈希桶上取下来,将对应的task加入到wake_q

第三步,调用wake_up_q唤醒wake_q中的task。

/*
 * Wake every task collected on a wake queue.
 *
 * Walks the list from head->first until the WAKE_Q_TAIL sentinel. For each
 * node it recovers the owning task_struct, detaches the node, wakes the
 * task and then drops a task reference with put_task_struct().
 * NOTE(review): that reference is presumably the one taken when the task
 * was added to the queue - confirm in wake_q_add().
 */
void wake_up_q(struct wake_q_head *head)
{
    struct wake_q_node *node = head->first;

    while (node != WAKE_Q_TAIL) {
        struct task_struct *task;

        /* The node is embedded in task_struct as its wake_q member. */
        task = container_of(node, struct task_struct, wake_q);
        /* Task can safely be re-inserted now: */
        node = node->next;      /* advance before clearing the link below */
        task->wake_q.next = NULL;

        /*
         * wake_up_process() executes a full barrier, which pairs with
         * the queueing in wake_q_add() so as not to miss wakeups.
         */
        wake_up_process(task);
        put_task_struct(task);
    }
}

wake_up_q会循环查找wake_q中的task,调用wake_up_process执行唤醒动作,这是另外一个话题了,futex的wait和wake就分析到这里。

posted on 2024-11-07 16:35  半山随笔  阅读(79)  评论(0编辑  收藏  举报

导航