关键词：

工作队列的原理是把work(需要推迟执行的函数)交由一个内核线程来执行，它总是在进程上下文中执行。

工作队列的优点是利用进程上下文来执行中断下半部操作，因此工作队列允许重新调度和睡眠，是异步执行的进程上下文，它还能解决软中断和tasklet执行时间过长导致系统实时性下降等问题。

当驱动程序或者内核子系统在进程上下文中有异步执行的工作任务时，可以使用work item来描述工作任务，包括该工作任务的执行回调函数，把work item添加到一个队列中，然后一个内核线程会去执行这个工作任务的回调函数。

这里work item被称为工作，队列被称为workqueue，即工作队列，内核线程被称为worker。

CMWQ(Concurrency Managed Workqueues)

执行work item任务的线程被称为worker或者工作线程。工作线程会串行化地执行挂入到队列中所有的work item。如果队列中没有work，那么该工作线程就会变成idle态。

为了管理众多工作线程，CMWQ提出了工作线程池(worker-pool)概念，worker-pool有两种：

一是bound型，可以理解为Per-CPU类型，每个CPU都有worker-pool；

另一种是unbound型，即不和具体CPU绑定。

这两种worker-pool都会定义两个线程池，一个给普通优先级的work使用，另一个给高优先级的work使用。

1. 初始化工作队列

1.1 工作、工作队列、工作线程池、工作线程数据结构

workqueue机制最小的调度单元是work_struct，即工作任务。

struct work_struct {
    atomic_long_t data;---------------低比特位部分是work的标志位，剩余比特位通常用于存放上一次运行的worker_pool ID或pool_workqueue的指针。存放的内容由WORK_STRUCT_PWQ标志位来决定
    struct list_head entry;-----------用于把work挂到其他队列上。
    work_func_t func;-----------------工作任务的处理函数
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
}

工作队列由struct workqueue_struct数据结构描述：

struct workqueue_struct {
    struct list_head    pwqs;        /* WR: all pwqs of this wq */--------------------该workqueue所在的所有pool_workqueue链表
    struct list_head    list;        /* PL: list of all workqueues */-----------------系统所有workqueue_struct的全局链表

    struct mutex        mutex;        /* protects this wq */
    int            work_color;    /* WQ: current work color */
    int            flush_color;    /* WQ: current flush color */
    atomic_t        nr_pwqs_to_flush; /* flush in progress */
    struct wq_flusher    *first_flusher;    /* WQ: first flusher */
    struct list_head    flusher_queue;    /* WQ: flush waiters */
    struct list_head    flusher_overflow; /* WQ: flush overflow list */

    struct list_head    maydays;    /* MD: pwqs requesting rescue */-------------------所有rescue状态下的pool_workqueue数据结构链表
    struct worker        *rescuer;    /* I: rescue worker */---------------------------rescue内核线程，内存紧张时创建新的工作线程可能会失败，如果创建workqueue时设置了WQ_MEM_RECLAIM，那么rescuer线程会接管这种情况。

    int            nr_drainers;    /* WQ: drain in progress */
    int            saved_max_active; /* WQ: saved pwq max_active */

    struct workqueue_attrs    *unbound_attrs;    /* WQ: only for unbound wqs */---------UNBOUND类型属性
    struct pool_workqueue    *dfl_pwq;    /* WQ: only for unbound wqs */----------------unbound类型的pool_workqueue

#ifdef CONFIG_SYSFS
    struct wq_device    *wq_dev;    /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
    struct lockdep_map    lockdep_map;
#endif
    char            name[WQ_NAME_LEN]; /* I: workqueue name */--------------------------该workqueue的名字

    /* hot fields used during command issue, aligned to cacheline */
    unsigned int        flags ____cacheline_aligned; /* WQ: WQ_* flags */---------------经常被不同CPU访问，因此要和cache line对齐。
    struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */---------------------指向per-cpu类型的pool_workqueue
    struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
}

运行work_struct的内核线程被称为worker，即工作线程。

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
    /* on idle list while idle, on busy hash table while busy */
    union {
        struct list_head    entry;    /* L: while idle */
        struct hlist_node    hentry;    /* L: while busy */
    };

    struct work_struct    *current_work;    /* L: work being processed */----当前正在处理的work
    work_func_t        current_func;    /* L: current_work's fn */-----------当前正在执行的work回调函数
    struct pool_workqueue    *current_pwq; /* L: current_work's pwq */-------当前work所属的pool_workqueue
    bool            desc_valid;    /* ->desc is valid */
    struct list_head    scheduled;    /* L: scheduled works */---------------所有被调度并正准备执行的work_struct都挂入该链表中

    /* 64 bytes boundary on 64bit, 32 on 32bit */

    struct task_struct    *task;        /* I: worker task */-----------------该工作线程的task_struct数据结构
    struct worker_pool    *pool;        /* I: the associated pool */---------该工作线程所属的worker_pool
                        /* L: for rescuers */
    struct list_head    node;        /* A: anchored at pool->workers */------可以把该worker挂入到worker_pool->workers链表中
                        /* A: runs through worker->node */

    unsigned long        last_active;    /* L: last active timestamp */
    unsigned int        flags;        /* X: flags */
    int            id;        /* I: worker id */

    /*
     * Opaque string set with work_set_desc().  Printed out with task
     * dump for debugging - WARN, BUG, panic or sysrq.
     */
    char            desc[WORKER_DESC_LEN];

    /* used only by rescuers to point to the target workqueue */
    struct workqueue_struct    *rescue_wq;    /* I: the workqueue to rescue */
}

CMWQ提出了工作线程池的概念，struct worker_pool数据结构用于描述工作线程池。

worker_pool是per-cpu变量，每个CPU都有worker_pool，而且有两个worker_pool。

一个用于普通优先级工作线程，另一个用于高优先级工作线程。

struct worker_pool {
    spinlock_t        lock;        /* the pool lock */-----------------------用于保护worker_pool的自旋锁
    int            cpu;        /* I: the associated cpu */-------------------对于unbound类型为-1；对于bound类型workqueue表示绑定的CPU ID。
    int            node;        /* I: the associated node ID */
    int            id;        /* I: pool ID */-------------------------------该worker_pool的ID号
    unsigned int        flags;        /* X: flags */

    struct list_head    worklist;    /* L: list of pending works */----------挂入pending状态的work_struct
    int            nr_workers;    /* L: total number of workers */-----------工作线程的数量

    /* nr_idle includes the ones off idle_list for rebinding */
    int            nr_idle;    /* L: currently idle ones */------------------处于idle状态的工作线程的数量

    struct list_head    idle_list;    /* X: list of idle workers */----------处于idle状态的工作线程链表
    struct timer_list    idle_timer;    /* L: worker idle timeout */
    struct timer_list    mayday_timer;    /* L: SOS timer for workers */

    /* a workers is either on busy_hash or idle_list, or the manager */
    DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                        /* L: hash of busy workers */

    /* see manage_workers() for details on the two manager mutexes */
    struct mutex        manager_arb;    /* manager arbitration */
    struct mutex        attach_mutex;    /* attach/detach exclusion */
    struct list_head    workers;    /* A: attached workers */---------------该worker_pool管理的工作线程链表
    struct completion    *detach_completion; /* all workers detached */

    struct ida        worker_ida;    /* worker IDs for task name */

    struct workqueue_attrs    *attrs;        /* I: worker attributes */-----工作线程属性
    struct hlist_node    hash_node;    /* PL: unbound_pool_hash node */
    int            refcnt;        /* PL: refcnt for unbound pools */

    /*
     * The current concurrency level.  As it's likely to be accessed
     * from other CPUs during try_to_wake_up(), put it in a separate
     * cacheline.
     */
    atomic_t        nr_running ____cacheline_aligned_in_smp;----------------用于管理worker的创建和销毁的统计计数，表示运行中的worker数量。该变量可能被多CPU同时访问，因此独占一个缓存行，避免多核读写造成“颠簸”现象。

    /*
     * Destruction of pool is sched-RCU protected to allow dereferences
     * from get_work_pool().
     */
    struct rcu_head        rcu;---------------------------------------------RCU回调头(并不是锁)，用于通过sched-RCU延迟销毁该worker_pool
}

struct pool_workqueue用于链接workqueue和worker_pool。

struct pool_workqueue {
    struct worker_pool    *pool;        /* I: the associated pool */-----------指向worker_pool结构
    struct workqueue_struct *wq;        /* I: the owning workqueue */----------指向workqueue_struct结构
    int            work_color;    /* L: current color */
    int            flush_color;    /* L: flushing color */
    int            refcnt;        /* L: reference count */
    int            nr_in_flight[WORK_NR_COLORS];
                        /* L: nr of in_flight works */
    int            nr_active;    /* L: nr of active works */------------------活跃的work_struct数量
    int            max_active;    /* L: max active works */-------------------最大活跃work_struct数量
    struct list_head    delayed_works;    /* L: delayed works */--------------延迟执行work_struct链表
    struct list_head    pwqs_node;    /* WR: node on wq->pwqs */
    struct list_head    mayday_node;    /* MD: node on wq->maydays */

    /*
     * Release of unbound pwq is punted to system_wq.  See put_pwq()
     * and pwq_unbound_release_workfn() for details.  pool_workqueue
     * itself is also sched-RCU protected so that the first pwq can be
     * determined without grabbing wq->mutex.
     */
    struct work_struct    unbound_release_work;
    struct rcu_head        rcu;------------------------------------------------RCU回调头(并不是锁)，pool_workqueue本身受sched-RCU保护，通过它延迟释放
}

上面几个数据结构的关系图？

1.2 初始化工作队列

首先看一下对创建工作队列有重要影响的flags。

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/workqueue.txt.
 */
enum {
    WQ_UNBOUND        = 1 << 1, /* not bound to any cpu */-----------------不绑定到任何一个CPU执行
    WQ_FREEZABLE        = 1 << 2, /* freeze during suspend */--------------在suspend进行进程冻结的时候，需要让工作线程完成当前所有的work才完成进程冻结，并且这个过程不会再新开始一个work的执行，直到进程被解冻。
    WQ_MEM_RECLAIM        = 1 << 3, /* may be used for memory reclaim */---在内存紧张导致创建新进程失败，系统通过rescuer内核线程去接管这种情况。
    WQ_HIGHPRI        = 1 << 4, /* high priority */------------------------属于高优先级的worker_pool
    WQ_CPU_INTENSIVE    = 1 << 5, /* cpu intensive workqueue */------------属于特别消耗CPU资源的一类work，这类work不计入worker_pool的并发度，其执行由调度器监管，因此不会阻止同一worker_pool中其他work开始执行；但这类work的启动仍受并发管理约束，处于可运行状态的non-CPU-intensive类型work可能会推迟这类work的执行
    WQ_SYSFS        = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */

    WQ_POWER_EFFICIENT    = 1 << 7,-----------------根据wq_power_efficient来决定此类型的工作队列是bound还是unbound类型，bound型可能导致处于idle的CPU被唤醒，而unbound型则不会必然唤醒idle的CPU。

    __WQ_DRAINING        = 1 << 16, /* internal: workqueue is draining */
    __WQ_ORDERED        = 1 << 17, /* internal: workqueue is ordered */----表示同一时间只能执行一个work_item。
    __WQ_ORDERED_EXPLICIT    = 1 << 19, /* internal: alloc_ordered_workqueue() */

    WQ_MAX_ACTIVE        = 512,      /* I like 512, better ideas? */
    WQ_MAX_UNBOUND_PER_CPU    = 4,      /* 4 * #cpus for unbound wq */
    WQ_DFL_ACTIVE        = WQ_MAX_ACTIVE / 2,
};

内核启动的时候，调用init_workqueues()创建工作线程，同时创建了一些常用的工作队列。

init_workqueues()由early_initcall(init_workqueues)在early阶段调用。

1.2.1 谁？都创建了哪些工作线程？

对于4核SMP系统来说，必然创建的工作线程有：每个CPU的kworker/x:0、kworker/x:0H、以及unbound类型的kworker/u8:0。

init_workqueues()创建CPU0以及unbound工作线程

kworker/0:0和kworker/0:0H以及kworker/u8:0都是由init_workqueues创建的，调用轨迹如下。

kworker/0:0、kworker/0:0H：kernel_init()->kernel_init_freeable()->do_one_initcall()->init_workqueues()->create_worker()

kworker/u8:0：kernel_init()->kernel_init_freeable()->do_one_initcall()->init_workqueues()->__alloc_workqueue_key()->apply_workqueue_attrs()->alloc_unbound_pwq()->create_worker()

unbound工作线程之所以被创建，是因为init_workqueues()中创建了一系列的workqueue，调用alloc_workqueue()->__alloc_workqueue_key()->alloc_and_link_pwqs()->apply_workqueue_attrs()->alloc_unbound_pwq()导致的。

这里的init_workqueues()为什么不将CPU1~3的工作线程一起创建了？

虽然此处init_workqueues()是在do_one_initcall中执行，但是此处的do_one_initcall较特殊。

static noinline void __init kernel_init_freeable(void)
{
...
    smp_prepare_cpus(setup_max_cpus);

    do_pre_smp_initcalls();-------------------------------------此处调用的initcall是在__initcall_start~__initcall0_start之间的函数，也即early_initcall()。所以init_workqueues()在smp_init之前被调用。
    lockup_detector_init();

    smp_init();-------------------------------------------------将剩余CPU1~3进行up操作。
    sched_init_smp();

    do_basic_setup();-------------------------------------------执行__initcall0_start之后的initcall函数
...
}

在初始化pool的时候，是按照possible的CPU来进行初始化的。而在创建工作线程的时候是按照online的CPU来创建的。

在init_workqueues()的时刻，CPU1~3还没有online。所以会先创建kworker/0:0、kworker/0:0H、kworker/u8:0三个工作线程。

unbound工作线程的pool->id为8也就不难理解了，因为前面4核的8个worker_pool已经分配了0~7。

workqueue_cpu_up_callback()创建了其他CPU工作线程

kernel_init()->kernel_init_freeable()->smp_init()->cpu_up()->_cpu_up()->__raw_notifier_call_chain()->workqueue_cpu_up_callback()->create_worker()

在init_workqueues()开头就注册了CPU_PRI_WORKQUEUE_UP处理函数，所以在smp_init()->cpu_up()将CPU启动之后就会为每个CPU创建两个工作线程

1.2.2 init_workqueues()初始化worker_pool、worker、workqueue

static int __init init_workqueues(void)
{
    int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };---------------这里HIGHPRI_NICE_LEVEL为-20，对应的prio为100，是普通进程里面的最高优先级。
    int i, cpu;

    WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

    pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

    cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);--------------跟随CPU_UP/CPU_DOWN动态创建工作线程的接口。
    hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);

    wq_numa_init();

    /* initialize CPU pools */
    for_each_possible_cpu(cpu) {------------------------------------------------遍历每个possible状态的CPU
        struct worker_pool *pool;

        i = 0;
        for_each_cpu_worker_pool(pool, cpu) {-----------------------------------每个CPU两个worker_pool，分别对应per-cpu变量nice值为0的cpu_worker_pool[0]和nice值为-20的cpu_worker_pool[1]。
            BUG_ON(init_worker_pool(pool));-------------------------------------初始化worker_pool
            pool->cpu = cpu;
            cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
            pool->attrs->nice = std_nice[i++];----------------------------------设置nice值
            pool->node = cpu_to_node(cpu);

            /* alloc pool ID */
            mutex_lock(&wq_pool_mutex);
            BUG_ON(worker_pool_assign_id(pool));
            mutex_unlock(&wq_pool_mutex);
        }
    }

    /* create the initial worker */
    for_each_online_cpu(cpu) {--------------------------------------------------遍历所有online状态CPU，对于SMP多核CPU，此时只对boot cpu创建了工作线程。其他CPU工作线程稍后在cpu_up中创建。
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {-----------------------------------使用create_worker为该CPU的两个worker_pool各创建一个内核线程，分别对应cpu_worker_pool[0]和cpu_worker_pool[1]
            pool->flags &= ~POOL_DISASSOCIATED;
            BUG_ON(!create_worker(pool));
        }
    }

    /* create default unbound and ordered wq attrs */
    for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
        struct workqueue_attrs *attrs;

        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        unbound_std_wq_attrs[i] = attrs;---------------------------------------设置Unbound类型workqueue的属性

        /*
         * An ordered wq should have only one pwq as ordering is
         * guaranteed by max_active which is enforced by pwqs.
         * Turn off NUMA so that dfl_pwq is used for all nodes.
         */
        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        attrs->no_numa = true;
        ordered_wq_attrs[i] = attrs;-------------------------------------------设置ordered类型workqueue的属性，ordered类型workqueue同一时刻只能有一个work item在运行。
    }

    system_wq = alloc_workqueue("events", 0, 0);-------------------------------普通优先级bound类型工作队列system_wq
    system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);------高优先级bound类型工作队列system_highpri_wq
    system_long_wq = alloc_workqueue("events_long", 0, 0);---------------------
    system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,----------普通优先级unbound类型工作队列system_unbound_wq
                        WQ_UNBOUND_MAX_ACTIVE);
    system_freezable_wq = alloc_workqueue("events_freezable",------------------freezable类型工作队列system_freezable_wq
                          WQ_FREEZABLE, 0);
    system_power_efficient_wq = alloc_workqueue("events_power_efficient",------省电类型的工作队列system_power_efficient_wq
                          WQ_POWER_EFFICIENT, 0);
    system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                          WQ_FREEZABLE | WQ_POWER_EFFICIENT,-------------------freezable并且省电类型的工作队列system_freezable_power_efficient_wq
                          0);
    BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
           !system_unbound_wq || !system_freezable_wq ||
           !system_power_efficient_wq ||
           !system_freezable_power_efficient_wq);
    return 0;
}

static int workqueue_cpu_up_callback(struct notifier_block *nfb,
                           unsigned long action,
                           void *hcpu)
{
    int cpu = (unsigned long)hcpu;
    struct worker_pool *pool;
    struct workqueue_struct *wq;
    int pi;

    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_UP_PREPARE:
        for_each_cpu_worker_pool(pool, cpu) {
            if (pool->nr_workers)
                continue;
            if (!create_worker(pool))
                return NOTIFY_BAD;
        }
        break;

    case CPU_DOWN_FAILED:
    case CPU_ONLINE:
        mutex_lock(&wq_pool_mutex);

        for_each_pool(pool, pi) {
            mutex_lock(&pool->attach_mutex);

            if (pool->cpu == cpu)
                rebind_workers(pool);
            else if (pool->cpu < 0)
                restore_unbound_workers_cpumask(pool, cpu);

            mutex_unlock(&pool->attach_mutex);
        }

        /* update NUMA affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list)
            wq_update_unbound_numa(wq, cpu, true);

        mutex_unlock(&wq_pool_mutex);
        break;
    }
    return NOTIFY_OK;
}

static int workqueue_cpu_down_callback(struct notifier_block *nfb,
                         unsigned long action,
                         void *hcpu)
{
    int cpu = (unsigned long)hcpu;
    struct work_struct unbind_work;
    struct workqueue_struct *wq;

    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_DOWN_PREPARE:
        /* unbinding per-cpu workers should happen on the local CPU */
        INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
        queue_work_on(cpu, system_highpri_wq, &unbind_work);

        /* update NUMA affinity of unbound workqueues */
        mutex_lock(&wq_pool_mutex);
        list_for_each_entry(wq, &workqueues, list)
            wq_update_unbound_numa(wq, cpu, false);
        mutex_unlock(&wq_pool_mutex);

        /* wait for per-cpu unbinding to finish */
        flush_work(&unbind_work);
        destroy_work_on_stack(&unbind_work);
        break;
    }
    return NOTIFY_OK;
}

init_worker_pool()初始化一个worker_pool。

static int init_worker_pool(struct worker_pool *pool)
{
    spin_lock_init(&pool->lock);
    pool->id = -1;
    pool->cpu = -1;----------------------------------------------初始值-1表示当前worker_pool是unbound型的
    pool->node = NUMA_NO_NODE;
    pool->flags |= POOL_DISASSOCIATED;
    INIT_LIST_HEAD(&pool->worklist);
    INIT_LIST_HEAD(&pool->idle_list);
    hash_init(pool->busy_hash);

    init_timer_deferrable(&pool->idle_timer);
    pool->idle_timer.function = idle_worker_timeout;-------------销毁多余worker，每IDLE_WORKER_TIMEOUT(300秒)执行一次。
    pool->idle_timer.data = (unsigned long)pool;

    setup_timer(&pool->mayday_timer, pool_mayday_timeout,
            (unsigned long)pool);--------------------------------设置mayday_timer，周期为MAYDAY_INTERVAL，即HZ的1/10，也就是100ms。若判断worker_pool执行异常，则让rescuer worker介入。

    mutex_init(&pool->manager_arb);
    mutex_init(&pool->attach_mutex);
    INIT_LIST_HEAD(&pool->workers);

    ida_init(&pool->worker_ida);
    INIT_HLIST_NODE(&pool->hash_node);
    pool->refcnt = 1;

    /* shouldn't fail above this point */
    pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
    if (!pool->attrs)
        return -ENOMEM;
    return 0;
}

何时创建新kworker？何时销毁kworker？

1.2.2.1 销毁kworker

一个worker被创建后首先进入worker_enter_idle()，里面启动了pool->idle_timer，定时IDLE_WORKER_TIMEOUT即300*HZ个jiffies，也就是300秒。

如果一个worker进入idle超过300*HZ个jiffies(300秒)，即会执行idle_worker_timeout()。

static void idle_worker_timeout(unsigned long __pool)
{
    struct worker_pool *pool = (void *)__pool;

    spin_lock_irq(&pool->lock);

    while (too_many_workers(pool)) {------------------判断当前worker_pool中的worker数量是否过剩，如果过剩则选中一个worker销毁。直到worker_pool中没有worker过剩。
        struct worker *worker;
        unsigned long expires;

        /* idle_list is kept in LIFO order, check the last one */
        worker = list_entry(pool->idle_list.prev, struct worker, entry);
        expires = worker->last_active + IDLE_WORKER_TIMEOUT;

        if (time_before(jiffies, expires)) {
            mod_timer(&pool->idle_timer, expires);
            break;
        }

        destroy_worker(worker);-----------------------销毁选中的worker。
    }

    spin_unlock_irq(&pool->lock);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
    bool managing = mutex_is_locked(&pool->manager_arb);
    int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
    int nr_busy = pool->nr_workers - nr_idle;

    return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;-----首先满足idle worker数量大于2；并且除去两个idle worker线程外的idle worker数量达到或超过busy worker的1/4(MAX_IDLE_WORKERS_RATIO为4)，才认为worker过剩。所以每个workerpool至少会保留两个idle worker线程。如果workerpool中有4个worker(3idle+1busy)，则3>2并且(3-2)*4>=1，即会选择一个idle销毁。
}

static void destroy_worker(struct worker *worker)
{
    struct worker_pool *pool = worker->pool;

    lockdep_assert_held(&pool->lock);

    /* sanity check frenzy */
    if (WARN_ON(worker->current_work) ||
        WARN_ON(!list_empty(&worker->scheduled)) ||
        WARN_ON(!(worker->flags & WORKER_IDLE)))
        return;

    pool->nr_workers--;---------------------------------------------------------更新对应worker_pool的nr_workers和nr_idle数量，并将worker从worker_pool的idle_list链表中摘除。
    pool->nr_idle--;

    list_del_init(&worker->entry);
    worker->flags |= WORKER_DIE;------------------------------------------------在worker_thread()中，判断当前worker->flags，如果为WORKER_DIE则销毁线程。
    wake_up_process(worker->task);
}

1.2.2.2 rescue worker

mayday_timer每100ms触发一次，检查当前worker_pool中是否存在allocation deadlock异常；如果存在，则唤醒rescuer worker进行处理。

static void pool_mayday_timeout(unsigned long __pool)
{
    struct worker_pool *pool = (void *)__pool;
    struct work_struct *work;

    spin_lock_irq(&pool->lock);
    spin_lock(&wq_mayday_lock);        /* for wq->maydays */

    if (need_to_create_worker(pool)) {
        /*
         * We've been trying to create a new worker but
         * haven't been successful.  We might be hitting an
         * allocation deadlock.  Send distress signals to
         * rescuers.
         */
        list_for_each_entry(work, &pool->worklist, entry)
            send_mayday(work);
    }

    spin_unlock(&wq_mayday_lock);
    spin_unlock_irq(&pool->lock);

    mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

static bool need_more_worker(struct worker_pool *pool)
{
    return !list_empty(&pool->worklist) && __need_more_worker(pool);--------------当前pool->worklist等待work队列不为空，但是pool->nr_running为0。说明有work在等待执行，但是并没有worker在运行，需要更多worker来处理；在pool_mayday_timeout()的场景下，如果此时又迟迟无法创建出新worker，就需要rescuer worker介入处理。
}

static bool __need_more_worker(struct worker_pool *pool)
{
    return !atomic_read(&pool->nr_running);
}

static void send_mayday(struct work_struct *work)
{
    struct pool_workqueue *pwq = get_work_pwq(work);
    struct workqueue_struct *wq = pwq->wq;

    lockdep_assert_held(&wq_mayday_lock);

    if (!wq->rescuer)
        return;

    /* mayday mayday mayday */
    if (list_empty(&pwq->mayday_node)) {
        /*
         * If @pwq is for an unbound wq, its base ref may be put at
         * any time due to an attribute change.  Pin @pwq until the
         * rescuer is done with it.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
    }
}

1.2.2.3 创建worker

create_worker()创建内核的工作线程。

static struct worker *create_worker(struct worker_pool *pool)
{
    struct worker *worker = NULL;
    int id = -1;
    char id_buf[16];

    /* ID is needed to determine kthread name */
    id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);----------------从当前worker_pool->worker_ida获取一个空闲id。
    if (id < 0)
        goto fail;

    worker = alloc_worker(pool->node);---------------------------------------分配一个worker结构体
    if (!worker)
        goto fail;

    worker->pool = pool;-----------------------------------------------------worker_pool关联到worker
    worker->id = id;---------------------------------------------------------递增的id

    if (pool->cpu >= 0)------------------------------------------------------初始值为-1表示unbound，当>=0的时候就指定了cpu，说明是bound型的。
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
             pool->attrs->nice < 0  ? "H" : "");-----------------------------nice为0表示普通优先级，nice为-20是高优先级。
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

    worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                          "kworker/%s", id_buf);----------------------------worker和创建的内核工作线程关联上，线程处理函数是worker_thread。
    if (IS_ERR(worker->task))
        goto fail;

    set_user_nice(worker->task, pool->attrs->nice);-------------------------设置内核工作线程的优先级相关

    /* prevent userland from meddling with cpumask of workqueue workers */
    worker->task->flags |= PF_NO_SETAFFINITY;-------------------------------阻止用户修改其CPU亲和性

    /* successful, attach the worker to the pool */
    worker_attach_to_pool(worker, pool);------------------------------------将worker附着到worker_pool上

    /* start the newly created worker */
    spin_lock_irq(&pool->lock);
    worker->pool->nr_workers++;---------------------------------------------统计当前worker对应worker_pool中工作线程数目
    worker_enter_idle(worker);----------------------------------------------让该工作线程进入idle状态。
    wake_up_process(worker->task);------------------------------------------唤醒刚创建的工作线程
    spin_unlock_irq(&pool->lock);

    return worker;

fail:
    if (id >= 0)
        ida_simple_remove(&pool->worker_ida, id);
    kfree(worker);
    return NULL;
}

worker_attach_to_pool()主要是将worker工作线程加入到worker_pool->workers链表中。

static void worker_attach_to_pool(struct worker *worker,
                   struct worker_pool *pool)
{
    mutex_lock(&pool->attach_mutex);

    set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
    if (pool->flags & POOL_DISASSOCIATED)------------------表示worker_pool没有绑定到某个CPU上，所以worker也不会绑定到某个CPU。
        worker->flags |= WORKER_UNBOUND;

    list_add_tail(&worker->node, &pool->workers);----------将当前worker加入到worker_pool->workers链表末尾。

    mutex_unlock(&pool->attach_mutex);
}

1.2.3 工作线程执行函数

worker_thread()是工作线程的处理函数，不管其所在worker_pool是bound还是unbound型。

worker_thread()处理了大部分work_item，除了属于rescuer的work_item由rescuer_thread()进行处理。

通过worker找到对应的worker_pool，然后遍历worker_pool中的work_struct。

static int worker_thread(void *__worker)
{
    struct worker *worker = __worker;
    struct worker_pool *pool = worker->pool;

    /* tell the scheduler that this is a workqueue worker */
    worker->task->flags |= PF_WQ_WORKER;------------------PF_WQ_WORKER告诉调度器这是一个worker类型的线程。
woke_up:
    spin_lock_irq(&pool->lock);

    /* am I supposed to die? */
    if (unlikely(worker->flags & WORKER_DIE)) {-----------WORKER_DIE表示此工作线程将要被销毁。
        spin_unlock_irq(&pool->lock);
        WARN_ON_ONCE(!list_empty(&worker->entry));
        worker->task->flags &= ~PF_WQ_WORKER;

        set_task_comm(worker->task, "kworker/dying");
        ida_simple_remove(&pool->worker_ida, worker->id);
        worker_detach_from_pool(worker, pool);
        kfree(worker);
        return 0;
    }

    worker_leave_idle(worker);----------------------------清除WORKER_IDLE标志位，并退出idle状态链表
recheck:
    /* no more worker necessary? */
    if (!need_more_worker(pool))--------------------------如果当前worker_pool->worklist中有pending任务，但是当前pool中没有正在运行的线程，need_more_worker()返回true。
        goto sleep;

    /* do we need to manage? */
    if (unlikely(!may_start_working(pool)) && manage_workers(worker))------may_start_working()判断pool中是否有idle状态工作线程。如果没有，那么manage_workers()创建一些工作线程。
        goto recheck;------------------------------------------------------manage_workers()创建新工作线程之后，还需要跳转到recheck标签处再检查一遍，有可能在创建工作线程过程中整个线程池发生了变化。

    /*
     * ->scheduled list can only be filled while a worker is
     * preparing to process a work or actually processing it.
     * Make sure nobody diddled with it while I was sleeping.
     */
    WARN_ON_ONCE(!list_empty(&worker->scheduled));-------------------------scheduled链表表示工作线程准备处理一个work或者正在执行一个work时才会有work添加到该链表中。

    /*
     * Finish PREP stage.  We're guaranteed to have at least one idle
     * worker or that someone else has already assumed the manager
     * role.  This is where @worker starts participating in concurrency
     * management if applicable and concurrency management is restored
     * after being rebound.  See rebind_workers() for details.
     */
    worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);--------因为马上就要开始执行work的回调函数了，对于bound类型增加worker_pool->nr_running计数

    do {-----------------------------------------------------------遍历当前worker_pool->worklist中的工作，调用process_one_work()进行处理。
        struct work_struct *work =
            list_first_entry(&pool->worklist,
                     struct work_struct, entry);

        if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
            /* optimization path, not strictly necessary */
            process_one_work(worker, work);------------------------单独处理一个work
            if (unlikely(!list_empty(&worker->scheduled)))
                process_scheduled_works(worker);-------------------处理worker->scheduled链表上的work_struct。
        } else {---------------------------------------------------如果当前work_struct置位WORK_STRUCT_LINKED表示work后面还串上其它work，把这些work迁移到worker->scheduled中，然后一并再用process_one_work()函数处理。
            move_linked_works(work, &worker->scheduled, NULL);
            process_scheduled_works(worker);
        }
    } while (keep_working(pool));----------------------------------判断当前worker_pool->worklist不为空，且工作线程池活跃线程小于等于1，那么保持当前工作线程继续工作，以防止工作线程泛滥。

    worker_set_flags(worker, WORKER_PREP);
sleep:
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);
    __set_current_state(TASK_INTERRUPTIBLE);
    spin_unlock_irq(&pool->lock);
    schedule();
    goto woke_up;
}

manage_workers()是动态管理、创建工作线程的函数。

maybe_create_worker()函数中while循环首先调用create_worker()来创建新的工作线程。

static bool manage_workers(struct worker *worker)
{
    struct worker_pool *pool = worker->pool;

    if (pool->flags & POOL_MANAGER_ACTIVE)
        return false;

    pool->flags |= POOL_MANAGER_ACTIVE;
    pool->manager = worker;

    maybe_create_worker(pool);

    pool->manager = NULL;
    pool->flags &= ~POOL_MANAGER_ACTIVE;
    wake_up(&wq_manager_wait);
    return true;
}

static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
    spin_unlock_irq(&pool->lock);

    /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
    mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);------------一般为MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2，即10ms。

    while (true) {
        if (create_worker(pool) || !need_to_create_worker(pool))-----------------create_worker()创建成功则退出while循环；或者通过need_to_create_worker()判断是否需要继续创建新线程。
            break;

        schedule_timeout_interruptible(CREATE_COOLDOWN);

        if (!need_to_create_worker(pool))----------------------------------------再次判断是否需要继续创建新线程。
            break;
    }

    del_timer_sync(&pool->mayday_timer);
    spin_lock_irq(&pool->lock);
    /*
     * This is necessary even after a new worker was just successfully
     * created as @pool->lock was dropped and the new worker might have
     * already become busy.
     */
    if (need_to_create_worker(pool))
        goto restart;
}

process_scheduled_works()专门处理worker->scheduled上面的工作，具体处理还是交给process_one_work()。

/*
 * process_scheduled_works - run everything queued on @worker->scheduled
 * @worker: worker whose scheduled list is drained
 *
 * Repeatedly hands the entry at the head of @worker->scheduled to
 * process_one_work() and stops once the list is observed empty.
 */
static void process_scheduled_works(struct worker *worker)
{
    struct list_head *pending = &worker->scheduled;

    for (;;) {
        struct work_struct *head;

        if (list_empty(pending))
            break;

        head = list_first_entry(pending, struct work_struct, entry);
        process_one_work(worker, head);
    }
}


static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
    struct pool_workqueue *pwq = get_work_pwq(work);
    struct worker_pool *pool = worker->pool;
    bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;----------------判断当前的workqueue是否是CPU_INTENSIVE，会对其所在工作线程进行特殊设置。
    int work_color;
    struct worker *collision;
#ifdef CONFIG_LOCKDEP
    /*
     * It is permissible to free the struct work_struct from
     * inside the function that is called from it, this we need to
     * take into account for lockdep too.  To avoid bogus "held
     * lock freed" warnings as well as problems when looking into
     * work->lockdep_map, make a copy and use that here.
     */
    struct lockdep_map lockdep_map;

    lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
    /* ensure we're on the correct CPU */
    WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
             raw_smp_processor_id() != pool->cpu);

    /*
     * A single work shouldn't be executed concurrently by
     * multiple workers on a single cpu.  Check whether anyone is
     * already processing the work.  If so, defer the work to the
     * currently executing one.
     */
    collision = find_worker_executing_work(pool, work);--------------------查询当前work是否在worker_pool->busy_hash表中正在运行，如果在就移到当前work正在执行的worker->scheduled并退出当前处理。
    if (unlikely(collision)) {
        move_linked_works(work, &collision->scheduled, NULL);
        return;
    }

    /* claim and dequeue */
    debug_work_deactivate(work);
    hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
    worker->current_work = work;
    worker->current_func = work->func;
    worker->current_pwq = pwq;
    work_color = get_work_color(work);

    list_del_init(&work->entry);

    /*
     * CPU intensive works don't participate in concurrency management.
     * They're the scheduler's responsibility.  This takes @worker out
     * of concurrency management and the next code block will chain
     * execution of the pending work items.
     */
    if (unlikely(cpu_intensive))
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);--------------------设置当前工作线程flags，调度器就知道内核线程属性了，但实际上调度器暂时并没有做特殊处理。

    /*
     * Wake up another worker if necessary.  The condition is always
     * false for normal per-cpu workers since nr_running would always
     * be >= 1 at this point.  This is used to chain execution of the
     * pending work items for WORKER_NOT_RUNNING workers such as the
     * UNBOUND and CPU_INTENSIVE ones.
     */
    if (need_more_worker(pool))-----------------------判断是否需要唤醒更多工作线程，wake_up_worker()去唤醒worker_pool中第一个idle线程。对于bound型worker_pool此时一般nr_running>=1，所以条件不成立。
        wake_up_worker(pool);

    /*
     * Record the last pool and clear PENDING which should be the last
     * update to @work.  Also, do this inside @pool->lock so that
     * PENDING and queued state changes happen together while IRQ is
     * disabled.
     */
    set_work_pool_and_clear_pending(work, pool->id);---------------清除struct work_struct中data成员pending标志位，里面使用了smp_wmb保证了pending之前的写操作完成之后才清除pending。

    spin_unlock_irq(&pool->lock);

    lock_map_acquire_read(&pwq->wq->lockdep_map);
    lock_map_acquire(&lockdep_map);
    trace_workqueue_execute_start(work);
    worker->current_func(work);------------------------------------真正执行work的回调函数
    /*
     * While we must be careful to not use "work" after this, the trace
     * point will only record its address.
     */
    trace_workqueue_execute_end(work);
    lock_map_release(&lockdep_map);
    lock_map_release(&pwq->wq->lockdep_map);

    if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
        pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
               "     last function: %pf\n",
               current->comm, preempt_count(), task_pid_nr(current),
               worker->current_func);
        debug_show_held_locks(current);
        dump_stack();
    }

    /*
     * The following prevents a kworker from hogging CPU on !PREEMPT
     * kernels, where a requeueing work item waiting for something to
     * happen could deadlock with stop_machine as such work item could
     * indefinitely requeue itself while all other CPUs are trapped in
     * stop_machine. At the same time, report a quiescent RCU state so
     * the same condition doesn't freeze RCU.
     */
    cond_resched_rcu_qs();

    spin_lock_irq(&pool->lock);

    /* clear cpu intensive status */
    if (unlikely(cpu_intensive))
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

    /* we're done with it, release */
    hash_del(&worker->hentry);-----------------------------------work回调函数执行完成后的清理工作
    worker->current_work = NULL;
    worker->current_func = NULL;
    worker->current_pwq = NULL;
    worker->desc_valid = false;
    pwq_dec_nr_in_flight(pwq, work_color);
}

2 创建工作队列

2.1 各种创建工作队列API和flags

创建工作队列的API有很多，但最终都通过__alloc_workqueue_key()去实现。不同API之间的主要区别在于使用了不同的flag。

所以看一下这些flag，同时max_active决定每个CPU最多可有多少个work挂入一个工作队列。

对于bound类型工作队列，max_active最大可以是512(WQ_MAX_ACTIVE)；如果max_active为0，则使用默认值256(WQ_DFL_ACTIVE)。

如果需要严格串行执行工作队列，使用max_active=1和WQ_UNBOUND组合。

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/workqueue.txt.
 */
enum {
    WQ_NON_REENTRANT    = 1 << 0, /* guarantee non-reentrance */-----------确保工作在多个CPU上是不可重入的。
    WQ_UNBOUND        = 1 << 1, /* not bound to any cpu */-----------------工作任务会加入unbound工作队列中，unbound类型work不需要额外的同步管理，unbound工作线程池会尝试尽快执行它的work。
    WQ_FREEZABLE        = 1 << 2, /* freeze during suspend */--------------此标记工作队列会参与到系统suspend过程中，会让工作线程处理完成所有的work才完成进程冻结，并且这个过程不会再新开始一个work执行，直到进程被解冻。
    WQ_MEM_RECLAIM        = 1 << 3, /* may be used for memory reclaim */---当内存紧张时，创建新的工作线程可能会失败，系统还有一个rescuer内核线程会去接管这种情况。
    WQ_HIGHPRI        = 1 << 4, /* high priority */------------------------工作队列的任务对应高优先级的worker_pool，即较低nice值。
    WQ_CPU_INTENSIVE    = 1 << 5, /* cpu instensive workqueue */-----------属于特别消耗CPU资源一类work，这类work会得到系统进程调度器的监管，排在这类work后面的non-cpu intensive类型work可能会推迟执行。

WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */

    WQ_RESCUER        = 1 << 7, /* internal: workqueue has rescuer */

__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */-----------同一时间只能执行一个work item。


    WQ_MAX_ACTIVE        = 512,      /* I like 512, better ideas? */
    WQ_MAX_UNBOUND_PER_CPU    = 4,      /* 4 * #cpus for unbound wq */
    WQ_DFL_ACTIVE        = WQ_MAX_ACTIVE / 2,
};

最常见的形式是alloc_workqueue()，其它都是对某些flag的封装。

/*
 * alloc_workqueue - generic workqueue constructor.
 * Forwards @fmt, @flags, @max_active and the printf-style name arguments
 * to __alloc_workqueue_key(), passing NULL for the lockdep key and lock name.
 */
#define alloc_workqueue(fmt, flags, max_active, args...)        \
    __alloc_workqueue_key((fmt), (flags), (max_active),        \
                  NULL, NULL, ##args)
/*
 * alloc_ordered_workqueue - alloc_workqueue() with WQ_UNBOUND and
 * __WQ_ORDERED OR-ed into @flags and max_active fixed at 1.
 */
#define alloc_ordered_workqueue(fmt, flags, args...)            \
    alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)

/* create_workqueue - WQ_MEM_RECLAIM workqueue named @name, max_active 1. */
#define create_workqueue(name)                        \
    alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
/*
 * create_freezable_workqueue - workqueue named @name with
 * WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM and max_active 1.
 */
#define create_freezable_workqueue(name)                \
    alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
            1, (name))
/*
 * create_singlethread_workqueue - ordered workqueue named @name with
 * WQ_MEM_RECLAIM, built on alloc_ordered_workqueue().
 */
#define create_singlethread_workqueue(name)                \
    alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, name)

2.2 __alloc_workqueue_key()

__alloc_workqueue_key分配一个workqueue_struct数据结构并进行初始化，和pool_workqueue进行关联等操作。

struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                           unsigned int flags,
                           int max_active,
                           struct lock_class_key *key,
                           const char *lock_name, ...)
{
    size_t tbl_size = 0;
    va_list args;
    struct workqueue_struct *wq;
    struct pool_workqueue *pwq;

    /* see the comment above the definition of WQ_POWER_EFFICIENT */
    if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)----------设置unbound类型workqueue后，究竟选择哪个cpu上唤醒交由进程调度器决定。如果是bound类型就会让idle状态的CPU从idle状态唤醒，从而增加了功耗。
        flags |= WQ_UNBOUND;

    /* allocate wq and format name */
    if (flags & WQ_UNBOUND)
        tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);

    wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
    if (!wq)
        return NULL;

    if (flags & WQ_UNBOUND) {
        wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!wq->unbound_attrs)
            goto err_free_wq;
    }

    va_start(args, lock_name);
    vsnprintf(wq->name, sizeof(wq->name), fmt, args);
    va_end(args);

    max_active = max_active ?: WQ_DFL_ACTIVE;
    max_active = wq_clamp_max_active(max_active, flags, wq->name);

    /* init wq */
    wq->flags = flags;
    wq->saved_max_active = max_active;
    mutex_init(&wq->mutex);
    atomic_set(&wq->nr_pwqs_to_flush, 0);
    INIT_LIST_HEAD(&wq->pwqs);
    INIT_LIST_HEAD(&wq->flusher_queue);
    INIT_LIST_HEAD(&wq->flusher_overflow);
    INIT_LIST_HEAD(&wq->maydays);

    lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
    INIT_LIST_HEAD(&wq->list);

    if (alloc_and_link_pwqs(wq) < 0)---------------------为该workqueue分配pool_workqueue数据结构，并把它们和workqueue_struct关联起来
        goto err_free_wq;

    /*
     * Workqueues which may be used during memory reclaim should
     * have a rescuer to guarantee forward progress.
     */
    if (flags & WQ_MEM_RECLAIM) {
        struct worker *rescuer;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer)
            goto err_destroy;

        rescuer->rescue_wq = wq;
        rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
                           wq->name);
        if (IS_ERR(rescuer->task)) {
            kfree(rescuer);
            goto err_destroy;
        }

        wq->rescuer = rescuer;
        rescuer->task->flags |= PF_NO_SETAFFINITY;
        wake_up_process(rescuer->task);
    }

    if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
        goto err_destroy;

    /*
     * wq_pool_mutex protects global freeze state and workqueues list.
     * Grab it, adjust max_active and add the new @wq to workqueues
     * list.
     */
    mutex_lock(&wq_pool_mutex);

    mutex_lock(&wq->mutex);
    for_each_pwq(pwq, wq)
        pwq_adjust_max_active(pwq);
    mutex_unlock(&wq->mutex);

    list_add(&wq->list, &workqueues);

    mutex_unlock(&wq_pool_mutex);

    return wq;

err_free_wq:
    free_workqueue_attrs(wq->unbound_attrs);
    kfree(wq);
    return NULL;
err_destroy:
    destroy_workqueue(wq);
    return NULL;
}

static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
    bool highpri = wq->flags & WQ_HIGHPRI;
    int cpu, ret;

    if (!(wq->flags & WQ_UNBOUND)) {------------------------处理bound类型workqueue
        wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);-cpu_pwqs是一个per-cpu类型，为每个cpu分配一个pool_workqueue数据结构，是动态分配的。cpu_worker_pools是静态定义的per-cpu类型worker_pool数据结构。
        if (!wq->cpu_pwqs)
            return -ENOMEM;

        for_each_possible_cpu(cpu) {
            struct pool_workqueue *pwq =
                per_cpu_ptr(wq->cpu_pwqs, cpu);
            struct worker_pool *cpu_pools =
                per_cpu(cpu_worker_pools, cpu);

            init_pwq(pwq, wq, &cpu_pools[highpri]);--------init_pwq()将动态分配的cpu_pwqs和静态定义的cpu_worker_pools关联起来。

            mutex_lock(&wq->mutex);
            link_pwq(pwq);---------------------------------把pool_workqueue添加到workqueue_struct->pwqs链表中。
            mutex_unlock(&wq->mutex);
        }
        return 0;
    } else if (wq->flags & __WQ_ORDERED) {-----------------处理ordered类型workqueue
        ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
        /* there should only be single pwq for ordering guarantee */
        WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
                  wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
             "ordering guarantee broken for workqueue %s\n", wq->name);
        return ret;
    } else {-----------------------------------------------处理unbound类型workqueue
        return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
    }
}

int apply_workqueue_attrs(struct workqueue_struct *wq,
              const struct workqueue_attrs *attrs)
{
    struct workqueue_attrs *new_attrs, *tmp_attrs;
    struct pool_workqueue **pwq_tbl, *dfl_pwq;
    int node, ret;

    /* only unbound workqueues can change attributes */
    if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
        return -EINVAL;

    /* creating multiple pwqs breaks ordering guarantee */
    if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
        return -EINVAL;

    pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
    new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
    tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
    if (!pwq_tbl || !new_attrs || !tmp_attrs)
        goto enomem;

    /* make a copy of @attrs and sanitize it */
    copy_workqueue_attrs(new_attrs, attrs);
    cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);

    /*
     * We may create multiple pwqs with differing cpumasks.  Make a
     * copy of @new_attrs which will be modified and used to obtain
     * pools.
     */
    copy_workqueue_attrs(tmp_attrs, new_attrs);

    /*
     * CPUs should stay stable across pwq creations and installations.
     * Pin CPUs, determine the target cpumask for each node and create
     * pwqs accordingly.
     */
    get_online_cpus();

    mutex_lock(&wq_pool_mutex);

    /*
     * If something goes wrong during CPU up/down, we'll fall back to
     * the default pwq covering whole @attrs->cpumask.  Always create
     * it even if we don't use it immediately.
     */
    dfl_pwq = alloc_unbound_pwq(wq, new_attrs);---------------------分配一个pool_workqueue数据结构
    if (!dfl_pwq)
        goto enomem_pwq;

    for_each_node(node) {
        if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
            pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);-------查找或者新建一个pool_workqueue
            if (!pwq_tbl[node])
                goto enomem_pwq;
        } else {
            dfl_pwq->refcnt++;
            pwq_tbl[node] = dfl_pwq;
        }
    }

    mutex_unlock(&wq_pool_mutex);

    /* all pwqs have been created successfully, let's install'em */
    mutex_lock(&wq->mutex);

    copy_workqueue_attrs(wq->unbound_attrs, new_attrs);

    /* save the previous pwq and install the new one */
    for_each_node(node)
        pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);

    /* @dfl_pwq might not have been used, ensure it's linked */
    link_pwq(dfl_pwq);
    swap(wq->dfl_pwq, dfl_pwq);

    mutex_unlock(&wq->mutex);

    /* put the old pwqs */
    for_each_node(node)
        put_pwq_unlocked(pwq_tbl[node]);
    put_pwq_unlocked(dfl_pwq);

    put_online_cpus();
    ret = 0;
    /* fall through */
out_free:
    free_workqueue_attrs(tmp_attrs);
    free_workqueue_attrs(new_attrs);
    kfree(pwq_tbl);
    return ret;

enomem_pwq:
    free_unbound_pwq(dfl_pwq);
    for_each_node(node)
        if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
            free_unbound_pwq(pwq_tbl[node]);
    mutex_unlock(&wq_pool_mutex);
    put_online_cpus();
enomem:
    ret = -ENOMEM;
    goto out_free;
}

static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                    const struct workqueue_attrs *attrs)
{
    struct worker_pool *pool;
    struct pool_workqueue *pwq;

    lockdep_assert_held(&wq_pool_mutex);

    pool = get_unbound_pool(attrs);----------------------------首先查找一个worker_pool，如果没有则创建一个新的worker_pool。
    if (!pool)
        return NULL;

    pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);-----分配一个pool_workqueue数据结构
    if (!pwq) {
        put_unbound_pool(pool);
        return NULL;
    }

    init_pwq(pwq, wq, pool);
    return pwq;
}

static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
    u32 hash = wqattrs_hash(attrs);
    struct worker_pool *pool;
    int node;

    lockdep_assert_held(&wq_pool_mutex);

    /* do we already have a matching pool? */
    hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {------系统定义了一个哈希表unbound_pool_hash，用于管理所有的unbound类型worker_pool
        if (wqattrs_equal(pool->attrs, attrs)) {----------------------------通过wqattrs_equal()判断系统中是否已经有属性相同的worker_pool
            pool->refcnt++;
            return pool;
        }
    }

    /* nope, create a new one */
    pool = kzalloc(sizeof(*pool), GFP_KERNEL);------------------------------如果没有找到，重新分配和初始化一个worker_pool
    if (!pool || init_worker_pool(pool) < 0)
        goto fail;
...
    /* create and start the initial worker */
    if (!create_worker(pool))
        goto fail;

    /* install */
    hash_add(unbound_pool_hash, &pool->hash_node, hash);

    return pool;
...
}

/*
 * put_pwq - drop one reference on a pool_workqueue
 * @pwq: pool_workqueue whose refcnt is decremented
 *
 * The caller must hold @pwq->pool->lock (checked by lockdep).  If the
 * count is still non-zero after the decrement nothing else happens.  When
 * it reaches zero, the release is deferred by scheduling
 * @pwq->unbound_release_work; a pwq whose workqueue is not WQ_UNBOUND
 * hitting zero only triggers a one-time warning.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
    lockdep_assert_held(&pwq->pool->lock);
    /* common case: other references remain */
    if (likely(--pwq->refcnt))
        return;
    /* only pwqs of WQ_UNBOUND workqueues are expected to reach zero */
    if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
        return;
    /* hand the actual release over to the unbound_release_work item */
    schedule_work(&pwq->unbound_release_work);
}

static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
             struct worker_pool *pool)
{
    BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

    memset(pwq, 0, sizeof(*pwq));

    pwq->pool = pool;-------------------------------------------pwq->pool指向worker_pool
    pwq->wq = wq;-----------------------------------------------pwq->wq指向workqueue_struct
    pwq->flush_color = -1;
    pwq->refcnt = 1;
    INIT_LIST_HEAD(&pwq->delayed_works);
    INIT_LIST_HEAD(&pwq->pwqs_node);
    INIT_LIST_HEAD(&pwq->mayday_node);
    INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);------------用于释放pool_workqueue
}


static void pwq_unbound_release_workfn(struct work_struct *work)
{
    struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                          unbound_release_work);-----------从work找到pool_workqueue数据结构指针pwq
    struct workqueue_struct *wq = pwq->wq;
    struct worker_pool *pool = pwq->pool;
    bool is_last;

    if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
        return;

    mutex_lock(&wq->mutex);
    list_del_rcu(&pwq->pwqs_node);
    is_last = list_empty(&wq->pwqs);
    mutex_unlock(&wq->mutex);

    mutex_lock(&wq_pool_mutex);
    put_unbound_pool(pool);
    mutex_unlock(&wq_pool_mutex);

    call_rcu_sched(&pwq->rcu, rcu_free_pwq);

    /*
     * If we're the last pwq going away, @wq is already dead and no one
     * is gonna access it anymore.  Free it.
     */
    if (is_last) {
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
    }
}

3. 调度一个work

一般情况下使用默认的workqueue，首先需要初始化一个work，然后使用schedule_work()把work挂入默认的workqueue中。

3.1 初始化一个work

初始化一个work的API有各种不同形式，但最终都调用__INIT_WORK()。

/* Timer flag bits OR-ed into the timer flags by the delayed-work initializers below. */
#define TIMER_DEFERRABLE 0x1LU
#define TIMER_IRQSAFE 0x2LU


/*
 * __INIT_WORK - common work_struct initializer.
 * Calls __init_work() with the @_onstack hint, resets ->data to
 * WORK_DATA_INIT(), initializes the ->entry list head and stores @_func
 * as the callback.
 */
#define __INIT_WORK(_work, _func, _onstack)                \
    do {                                \
        __init_work((_work), _onstack);                \
        (_work)->data = (atomic_long_t) WORK_DATA_INIT();    \
        INIT_LIST_HEAD(&(_work)->entry);            \
        (_work)->func = (_func);                \
    } while (0)

/* INIT_WORK - __INIT_WORK() with _onstack = 0. */
#define INIT_WORK(_work, _func)                        \
    __INIT_WORK((_work), (_func), 0)

/* INIT_WORK_ONSTACK - __INIT_WORK() with _onstack = 1. */
#define INIT_WORK_ONSTACK(_work, _func)                    \
    __INIT_WORK((_work), (_func), 1)

/*
 * __INIT_DELAYED_WORK - initialize a delayed_work.
 * Initializes the embedded ->work with @_func, then sets up ->timer with
 * delayed_work_timer_fn as handler and the delayed_work pointer as its
 * data.  TIMER_IRQSAFE is always OR-ed into @_tflags.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)            \
    do {                                \
        INIT_WORK(&(_work)->work, (_func));            \
        __setup_timer(&(_work)->timer, delayed_work_timer_fn,    \
                  (unsigned long)(_work),            \
                  (_tflags) | TIMER_IRQSAFE);        \
    } while (0)

/*
 * __INIT_DELAYED_WORK_ONSTACK - same as __INIT_DELAYED_WORK() but uses
 * INIT_WORK_ONSTACK() and __setup_timer_on_stack().
 */
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)        \
    do {                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func));        \
        __setup_timer_on_stack(&(_work)->timer,            \
                       delayed_work_timer_fn,        \
                       (unsigned long)(_work),        \
                       (_tflags) | TIMER_IRQSAFE);    \
    } while (0)

/* INIT_DELAYED_WORK - delayed work with no extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                    \
    __INIT_DELAYED_WORK(_work, _func, 0)

/* INIT_DELAYED_WORK_ONSTACK - on-stack variant with no extra timer flags. */
#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                \
    __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* INIT_DEFERRABLE_WORK - delayed work whose timer carries TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                \
    __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

/* INIT_DEFERRABLE_WORK_ONSTACK - on-stack variant with TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)            \
    __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

当data字段包含WORK_STRUCT_PWQ_BIT标志位时，高位存放上一次pool_workqueue指针，低8位存放标志位；没有包含时，高比特位存放上次worker_pool的ID号，低5位存放标志位。

常见标志位如下：

enum {
    WORK_STRUCT_PENDING_BIT    = 0,    /* work item is pending execution */----表示该work正在pending执行。
    WORK_STRUCT_DELAYED_BIT    = 1,    /* work item is delayed */--------------表示该work被延迟执行了。
    WORK_STRUCT_PWQ_BIT    = 2,    /* data points to pwq */
    WORK_STRUCT_LINKED_BIT    = 3,    /* next work is linked to this one */----表示一个work连接到该work上。
#ifdef CONFIG_DEBUG_OBJECTS_WORK
    WORK_STRUCT_STATIC_BIT    = 4,    /* static initializer (debugobjects) */
    WORK_STRUCT_COLOR_SHIFT    = 5,    /* color for workqueue flushing */
#else
    WORK_STRUCT_COLOR_SHIFT    = 4,    /* color for workqueue flushing */
#endif
    WORK_STRUCT_COLOR_BITS    = 4,
...
}

3.2 schedule_work

在初始化完work之后，调用schedule_work()函数把work挂入系统默认workqueue中。

schedule_work()的默认的工作队列是system_wq，最终将工作交给__queue_work()。

/*
 * schedule_work - queue @work on the system default workqueue
 * @work: work item to queue
 *
 * Thin wrapper around queue_work() that always targets system_wq.
 *
 * Return: the value returned by queue_work().
 */
static inline bool schedule_work(struct work_struct *work)
{
    return queue_work(system_wq, work);
}

static inline bool queue_work(struct workqueue_struct *wq,
                  struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);------------WORK_CPU_UNBOUND并不表示unbound类型的workqueue，而是表示不指定CPU；__queue_work()遇到它时会使用当前CPU(raw_smp_processor_id())。
}

bool queue_work_on(int cpu, struct workqueue_struct *wq,
           struct work_struct *work)
{
    bool ret = false;
    unsigned long flags;

    local_irq_save(flags);---------------------------------------把work加入工作队列是在关本地中断下运行的。

    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {------------设置WORK_STRUCT_PENDING_BIT并返回旧值。
        __queue_work(cpu, wq, work);
        ret = true;
    }

    local_irq_restore(flags);
    return ret;
}

static void __queue_work(int cpu, struct workqueue_struct *wq,
             struct work_struct *work)
{
    struct pool_workqueue *pwq;
    struct worker_pool *last_pool;
    struct list_head *worklist;
    unsigned int work_flags;
    unsigned int req_cpu = cpu;

    WARN_ON_ONCE(!irqs_disabled());----------------------------是否处于关中断状态

    debug_work_activate(work);

    /* if draining, only works from the same workqueue are allowed */
    if (unlikely(wq->flags & __WQ_DRAINING) &&
        WARN_ON_ONCE(!is_chained_work(wq)))--------------------__WQ_DRAINING表示要销毁workqueue，那么挂入workqueue中所有的work都要处理完毕才能把这个workqueue销毁。在销毁过程中，一般不允许再有新的work加入队列中。有一种特殊例外是正在清空work时触发了一个queue work操作，这种情况被称为chained work。
        return;
retry:
    if (req_cpu == WORK_CPU_UNBOUND)
        cpu = raw_smp_processor_id();

    /* pwq which will be used unless @work is executing elsewhere */
    if (!(wq->flags & WQ_UNBOUND))
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);-----------------对于bound型的workqueue，直接使用本地CPU对应pool_workqueue。
    else
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));------对于unbound型，调用unbound_pwq_by_node()寻找本地node节点对应的unbound类型的pool_workqueue。

    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     */
    last_pool = get_work_pool(work);--------------------------通过work_struct的成员data查询该work上一次是在哪个worker_pool中运行的。
    if (last_pool && last_pool != pwq->pool) {----------------如果上次运行的worker_pool和本次不一致
        struct worker *worker;

        spin_lock(&last_pool->lock);

        worker = find_worker_executing_work(last_pool, work);--判断一个work是否正在last_pool上运行，也即不在当前worker_pool运行，如果是返回这个正在执行的工作线程worker

        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;-------------------------该work仍在last_pool上执行，沿用其正在使用的pool_workqueue，把work排到同一个worker_pool上以保证不可重入。
        } else {
            /* meh... not running there, queue here */
            spin_unlock(&last_pool->lock);
            spin_lock(&pwq->pool->lock);
        }
    } else {
        spin_lock(&pwq->pool->lock);
    }

    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {-------------------对unbound类型pool_workqueue释放是异步的，当refcnt减少到0时，说明该pool_workqueue已经被释放，那么需要跳转到retry处重新选择pool_workqueue。
            spin_unlock(&pwq->pool->lock);
            cpu_relax();
            goto retry;
        }
        /* oops */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
              wq->name, cpu);
    }

    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);

    if (WARN_ON(!list_empty(&work->entry))) {
        spin_unlock(&pwq->pool->lock);
        return;
    }

    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);

    if (likely(pwq->nr_active < pwq->max_active)) {-------判断当前pool_workqueue的work活跃数量，如果少于最高限值，就加入pending状态链表worker_pool->worklist,否则加入delayed_works链表中。
        trace_workqueue_activate_work(work);
        pwq->nr_active++;
        worklist = &pwq->pool->worklist;
    } else {
        work_flags |= WORK_STRUCT_DELAYED;
        worklist = &pwq->delayed_works;
    }

    insert_work(pwq, work, worklist, work_flags);---------将当前work加入到上面选定的链表(worker_pool->worklist或pool_workqueue->delayed_works)尾部。

    spin_unlock(&pwq->pool->lock);
}

get_work_pool()通过work_struct找到该work上一次在哪个worker_pool中运行。

static struct worker_pool *get_work_pool(struct work_struct *work)
{
    unsigned long data = atomic_long_read(&work->data);
    int pool_id;

    assert_rcu_or_pool_mutex();

    if (data & WORK_STRUCT_PWQ)----------------------------如果定义了WORK_STRUCT_PWQ，那么直接得到pool_workqueue地址，进而找到worker_pool。
        return ((struct pool_workqueue *)
            (data & WORK_STRUCT_WQ_DATA_MASK))->pool;

    pool_id = data >> WORK_OFFQ_POOL_SHIFT;----------------如果没定义WORK_STRUCT_PWQ，那么可以得到对应的pool_id。
    if (pool_id == WORK_OFFQ_POOL_NONE)
        return NULL;

    return idr_find(&worker_pool_idr, pool_id);------------根据pool_id从worker_pool_idr中找到对应的worker_pool。
}

insert_work()将work加入到指定链表的尾部，必要时唤醒一个idle工作线程。

static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
            struct list_head *head, unsigned int extra_flags)
{
    struct worker_pool *pool = pwq->pool;

    /* we own @work, set data and link */
    set_work_pwq(work, pwq, extra_flags);------------把pool_workqueue指针的值和一些flag设置到data成员中，方便下次调用queue_work()知道本次使用哪个pool_workqueue()。
    list_add_tail(&work->entry, head);---------------将work加入到worker_pool->worklist尾部。
    get_pwq(pwq);------------------------------------增加pool_workqueue->refcnt成员引用计数。

    /*
     * Ensure either wq_worker_sleeping() sees the above
     * list_add_tail() or we see zero nr_running to avoid workers lying
     * around lazily while there are works to be processed.
     */
    smp_mb();----------------------------------------保证wake_up_worker()唤醒worker时，在__schedule()->wq_worker_sleeping()时，这里的list_add_tail()已经完成。同时保证下面__need_more_worker()读取nr_running时list_add_tail()链表已经完成。

    if (__need_more_worker(pool))--------------------如果当前nr_running为0，表示当前worker可能并没有处于运行状态。那么需要wake_up_worker()强行唤醒一次。
        wake_up_worker(pool);
}


/*
 * wake_up_worker - wake the worker returned by first_idle_worker()
 * @pool: worker pool to look for an idle worker in
 *
 * Does nothing when first_idle_worker() returns no worker for @pool.
 */
static void wake_up_worker(struct worker_pool *pool)
{
    struct worker *idle = first_idle_worker(pool);

    /* no idle worker available, nothing to wake */
    if (unlikely(!idle))
        return;

    wake_up_process(idle->task);
}

调用schedule_work()只是把work加入到workqueue中，但并没有开始实质的调度工作。

加入workqueue的pending链表是关中断环境下进行的
设置work->data成员的WORK_STRUCT_PENDING_BIT标志位
寻找合适的pool_workqueue，优先选择本地CPU对应的pool_workqueue；如果该work正在另一个CPU工作线程池中运行，则优先选择此工作线程池。
找到pool_workqueue，就找到了对应的worker_pool和对应的pending链表

那么work真正执行的地方在哪里呢？参见worker_thread()。

其它基于system_wq的变种还包括如下系列，_on表示指定某个CPU，_delayed表示延时工作。

/*
 * schedule_work_on - queue @work on system_wq for a specific CPU
 * @cpu: CPU passed through to queue_work_on()
 * @work: work item to queue
 *
 * Return: the value returned by queue_work_on(), converted to int.
 */
int schedule_work_on(int cpu, struct work_struct *work)
{
    return queue_work_on(cpu, system_wq, work);
}

/*
 * schedule_delayed_work - queue delayed work on system_wq
 * @dwork: delayed work item to queue
 * @delay: delay passed through to queue_delayed_work()
 *         (presumably in jiffies -- confirm against queue_delayed_work())
 *
 * Return: the value returned by queue_delayed_work().
 */
int schedule_delayed_work(struct delayed_work *dwork,
                    unsigned long delay)
{
    return queue_delayed_work(system_wq, dwork, delay);
}

/*
 * schedule_delayed_work_on - queue delayed work on system_wq for a given CPU
 * @cpu: CPU passed through to queue_delayed_work_on()
 * @dwork: delayed work item to queue
 * @delay: delay passed through to queue_delayed_work_on()
 *         (presumably in jiffies -- confirm against queue_delayed_work_on())
 *
 * Return: the value returned by queue_delayed_work_on().
 */
int schedule_delayed_work_on(int cpu,
            struct delayed_work *dwork, unsigned long delay)
{
    return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}

3.3 其它系统默认workqueue

上面介绍了schedule_work()，其默认将work放入system_wq上。

系统还有其它很多默认workqueue，这些workqueue也都是通过queue_work()将work放入其上。

下面介绍一些其它系统全局workqueue的使用。

system_highpri_wq 和system_wq的区别在于WQ_HIGHPRI，这些work对应的工作线程位于cpu_worker_pools[1]中。工作线程的nice为-20，要比system_wq对应的工作线程优先级要高。

system_long_wq和system_wq类似，但是一般system_long_wq用于执行时间较长的work，而system_wq放执行较短的work。

这两个workqueue没有明显的区别，更多的是靠使用者自觉。

system_nrt_wq相对于system_wq使用了WQ_NON_REENTRANT。默认情况下工作队列只是确保在同一CPU不可重入，即工作在同一CPU上不会被多个工作线程并发执行，但容许在多个CPU上并发执行。

该标志表明在多个CPU上也是不可重入的：work被排入不可重入的workqueue后，保证在整个系统范围内同一时刻至多只有一个工作线程在执行它。

system_unbound_wq相对于system_wq的区别是被设置为WQ_UNBOUND，没有并发管理，且work最大活跃数不超过WQ_UNBOUND_MAX_ACTIVE，一般为WQ_MAX_ACTIVE=512。

system_unbound_wq对应的工作线程不会被绑定到特定CPU，所有排队的work会被立即执行，只要资源足够并且不超过最大活跃数。

system_freezable_wq 相对于system_wq多了WQ_FREEZABLE标志，表示该workqueue可以被冻结并参与系统的暂停操作；冻结期间该workqueue的工作将被暂停，在被解冻之前不会有新的work被执行。

system_power_efficient_wq相对于system_wq多了WQ_POWER_EFFICIENT标志，将工作队列标识为unbound以达到节省功耗的目的，并且还需要wq_power_efficient打开。否则和system_wq没啥区别。

system_freezable_power_efficient_wq兼具system_freezable_wq的freezable和system_power_efficient_wq的power efficient两个特性。

4. 取消一个work

取消一个work的接口是cancel_work_sync()，该函数通常会取消一个work，但会等待该work执行完毕。

/*
 * cancel_work_sync - cancel a plain (non-delayed) work item
 * @work: work item to cancel
 *
 * Thin wrapper: calls __cancel_work_timer() with is_dwork set to false.
 *
 * Return: the value returned by __cancel_work_timer().
 */
bool cancel_work_sync(struct work_struct *work)
{
    return __cancel_work_timer(work, false);
}


static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
    static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);---------------------等待队列cancel_waitq
    unsigned long flags;
    int ret;

    do {
        ret = try_to_grab_pending(work, is_dwork, &flags);------------判断当前work的状态，需要特殊处理-ENOENT情况。

        if (unlikely(ret == -ENOENT)) {
            struct cwt_wait cwait;

            init_wait(&cwait.wait);
            cwait.wait.func = cwt_wakefn;
            cwait.work = work;

            prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
                          TASK_UNINTERRUPTIBLE);
            if (work_is_canceling(work))
                schedule();
            finish_wait(&cancel_waitq, &cwait.wait);
        }
    } while (unlikely(ret < 0));

    /* tell other tasks trying to grab @work to back off */
    mark_work_canceling(work);
    local_irq_restore(flags);

    flush_work(work);-------------------------------------------------会去等待work执行完成
    clear_work_data(work);--------------------------------------------清除work标志位

    /*
     * Paired with prepare_to_wait() above so that either
     * waitqueue_active() is visible here or !work_is_canceling() is
     * visible there.
     */
    smp_mb();
    if (waitqueue_active(&cancel_waitq))
        __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);

    return ret;
}

try_to_grab_pending()判断当前的work可否被取消，返回不同状态。__cancel_work_timer()根据不同状态采取不同操作。

static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
                   unsigned long *flags)
{
    struct worker_pool *pool;
    struct pool_workqueue *pwq;

    local_irq_save(*flags);-----------------------------------------------关本地中断，主要工作都在关中断下进行。

    /* try to steal the timer if it exists */
    if (is_dwork) {
        struct delayed_work *dwork = to_delayed_work(work);

        if (likely(del_timer(&dwork->timer)))
            return 1;
    }

    /* try to claim PENDING the normal way */
    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))----如果PENDING_BIT为0，说明该work处于idle状态，那么可以轻松的把work取出来。此处重新设置PENDING_BIT位，后续还需要等待该work执行完成。
        return 0;

    /*
     * The queueing is in progress, or it is already queued. Try to
     * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
     */----------------------------------------------------------------------下面的情况说明work正在被执行或者已经在worklist链表中，那么尝试去工作池中把work偷出来，成功后返回1.
    pool = get_work_pool(work);
    if (!pool)
        goto fail;

    spin_lock(&pool->lock);

    pwq = get_work_pwq(work);
    if (pwq && pwq->pool == pool) {
        debug_work_deactivate(work);

        if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
            pwq_activate_delayed_work(work);

        list_del_init(&work->entry);-----------------------------------------将当前work从worker_pool->worklist中移除
        pwq_dec_nr_in_flight(pwq, get_work_color(work));

        /* work->data points to pwq iff queued, point to pool */
        set_work_pool_and_keep_pending(work, pool->id);

        spin_unlock(&pool->lock);
        return 1;
    }
    spin_unlock(&pool->lock);
fail:
    local_irq_restore(*flags);
    if (work_is_canceling(work))--------------------------通过该work->data判断该work正在被取消，返回-ENOENT。__cancel_work_timer()会睡眠等待并继续完成。
        return -ENOENT;
    cpu_relax();
    return -EAGAIN;---------------------------------------返回__cancel_work_timer()重试
}

flush_work()等待work执行完成：返回false表示当前work既没有在排队也没有处于执行状态，无需等待；返回true表示已经等到work执行完成。

bool flush_work(struct work_struct *work)
{
    struct wq_barrier barr;

    lock_map_acquire(&work->lockdep_map);
    lock_map_release(&work->lockdep_map);

    if (start_flush_work(work, &barr)) {
        wait_for_completion(&barr.done);
        destroy_work_on_stack(&barr.work);
        return true;
    } else {
        return false;
    }
}

static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
    struct worker *worker = NULL;
    struct worker_pool *pool;
    struct pool_workqueue *pwq;

    might_sleep();

    local_irq_disable();
    pool = get_work_pool(work);-----------------由work_struct找到worker_pool
    if (!pool) {
        local_irq_enable();
        return false;
    }

    spin_lock(&pool->lock);
    /* see the comment in try_to_grab_pending() with the same code */
    pwq = get_work_pwq(work);-------------------由work_struct找到pool_workqueue
    if (pwq) {
        if (unlikely(pwq->pool != pool))--------表示当前work已经被执行完
            goto already_gone;
    } else {
        worker = find_worker_executing_work(pool, work);-----------返回正在执行work的worker，如果没有则返回NULL，表示已经被执行完毕。
        if (!worker)
            goto already_gone;
        pwq = worker->current_pwq;
    }

    insert_wq_barrier(pwq, barr, work, worker);
    spin_unlock_irq(&pool->lock);

    /*
     * If @max_active is 1 or rescuer is in use, flushing another work
     * item on the same workqueue may lead to deadlock.  Make sure the
     * flusher is not running on the same workqueue by verifying write
     * access.
     */
    if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
        lock_map_acquire(&pwq->wq->lockdep_map);
    else
        lock_map_acquire_read(&pwq->wq->lockdep_map);
    lock_map_release(&pwq->wq->lockdep_map);

    return true;
already_gone:
    spin_unlock_irq(&pool->lock);
    return false;
}

static void insert_wq_barrier(struct pool_workqueue *pwq,
                  struct wq_barrier *barr,
                  struct work_struct *target, struct worker *worker)
{
    struct list_head *head;
    unsigned int linked = 0;

    /*
     * debugobject calls are safe here even with pool->lock locked
     * as we know for sure that this will not trigger any of the
     * checks and call back into the fixup functions where we
     * might deadlock.
     */
    INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);-----------------初始化一个新的barr->work，执行函数是wq_barrier_func，里面complete完成量barr->done。
    __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
    init_completion(&barr->done);------------------------------------初始化barr->done完成量

    /*
     * If @target is currently being executed, schedule the
     * barrier to the worker; otherwise, put it after @target.
     */
    if (worker)------------------------------------------------------当前work正在被执行，放在worker->scheduled.next之后
        head = worker->scheduled.next;
    else {
        unsigned long *bits = work_data_bits(target);----------------否则放在target->entry.next

        head = target->entry.next;
        /* there can already be other linked works, inherit and set */
        linked = *bits & WORK_STRUCT_LINKED;
        __set_bit(WORK_STRUCT_LINKED_BIT, bits);
    }

    debug_work_activate(&barr->work);
    insert_work(pwq, &barr->work, head,
            work_color_to_flags(WORK_NO_COLOR) | linked);------------将barr->work加入到head后
}

关于PENDING_BIT何时被设置以及被清0：

当一个work已经加入到workqueue队列中，schedule_work()->queue_work()->queue_work_on()时被设置。
当一个work在工作线程里马上要执行，worker_thread()->process_one_work()->set_work_pool_and_clear_pending()时清0。
上述设置和清0都是在关闭本地中断情况下执行的。

5. 和调度器的交互

假设某个work回调函数执行了睡眠操作，在wait_event_interruptible()中设置当前进程state为TASK_INTERRUPTIBLE，然后执行schedule()进行进程切换，调用轨迹是schedule()->__schedule()。

static void __sched __schedule(void)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq *rq;
    int cpu;

    preempt_disable();
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    rcu_note_context_switch();
    prev = rq->curr;------------------------------------------------prev指当前进程，即执行work的工作线程，state状态为TASK_INTERRUPTIBLE。
...
    if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {-------work回调函数中调度不是中断返回前抢占调度，preempt_count也没有设置PREEMPT_ACTIVE。
        if (unlikely(signal_pending_state(prev->state, prev))) {
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP);
            prev->on_rq = 0;

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev, cpu);---------当一个工作线程要被调度器换出时，调用wq_worker_sleeping()看看是否需要唤醒同一个线程池中的其它内核线程。
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup);---------------去唤醒to_wakeup线程
            }
        }
        switch_count = &prev->nvcsw;
    }
...
}

wq_worker_sleeping()在当前工作线程即将睡眠时被调用：它先将所在worker_pool的nr_running计数减1，如果计数减到0并且worklist中仍有待处理的work，则从idle_list中取出第一个空闲工作线程并返回其task_struct；否则返回NULL。

在wq_worker_sleeping()返回不为NULL的情况下，调用try_to_wake_up_local()。

try_to_wake_up_local()是执行唤醒进程的操作。

struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
{
    struct worker *worker = kthread_data(task), *to_wakeup = NULL;
    struct worker_pool *pool;

    /*
     * Rescuers, which may not have all the fields set up like normal
     * workers, also reach here, let's not access anything before
     * checking NOT_RUNNING.
     */
    if (worker->flags & WORKER_NOT_RUNNING)
        return NULL;

    pool = worker->pool;

    /* this can only happen on the local cpu */
    if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
        return NULL;

    /*
     * The counterpart of the following dec_and_test, implied mb,
     * worklist not empty test sequence is in insert_work().
     * Please read comment there.
     *
     * NOT_RUNNING is clear.  This means that we're bound to and
     * running on the local cpu w/ rq lock held and preemption
     * disabled, which in turn means that none else could be
     * manipulating idle_list, so dereferencing idle_list without pool
     * lock is safe.
     */
    if (atomic_dec_and_test(&pool->nr_running) &&
        !list_empty(&pool->worklist))
        to_wakeup = first_idle_worker(pool);-------------------从worker_pool->idle_list中找到第一个worker工作线程。
    return to_wakeup ? to_wakeup->task : NULL;
}

static struct worker *first_idle_worker(struct worker_pool *pool)
{
    if (unlikely(list_empty(&pool->idle_list)))
        return NULL;

    return list_first_entry(&pool->idle_list, struct worker, entry);
}


static void try_to_wake_up_local(struct task_struct *p)
{
    struct rq *rq = task_rq(p);

    if (WARN_ON_ONCE(rq != this_rq()) ||
        WARN_ON_ONCE(p == current))
        return;

    lockdep_assert_held(&rq->lock);

    if (!raw_spin_trylock(&p->pi_lock)) {
        raw_spin_unlock(&rq->lock);
        raw_spin_lock(&p->pi_lock);
        raw_spin_lock(&rq->lock);
    }

    if (!(p->state & TASK_NORMAL))
        goto out;

    if (!task_on_rq_queued(p))
        ttwu_activate(rq, p, ENQUEUE_WAKEUP);

    ttwu_do_wakeup(rq, p, 0);------------------------设置进程状态为TASK_RUNNING，并且调用sched_class->task_woken执行进程唤醒抢占操作。
    ttwu_stat(p, smp_processor_id(), 0);
out:
    raw_spin_unlock(&p->pi_lock);
}


static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
    activate_task(rq, p, en_flags);
    p->on_rq = TASK_ON_RQ_QUEUED;

    /* if a worker is waking up, notify workqueue */
    if (p->flags & PF_WQ_WORKER)
        wq_worker_waking_up(p, cpu_of(rq));---------------增加nr_running计数，表示有一个工作线程马上就会被唤醒。
}

6. 小结

posted on 2018-05-29 23:50 ArnoldLu 阅读(15196) 评论(1) 编辑收藏举报

刷新页面返回顶部

Arnold Lu@南京

Linux中断管理 (3)workqueue工作队列