Linux : task work 机制
task work机制可以在内核中向指定的进程添加一些任务函数,这些任务函数会在进程返回用户态时执行,使用的是该进程的上下文。包括下面的这些API:
- task_work_add
- task_work_cancel
- task_work_run
进程对象task_struct中有一个字段task_works,用来存储待执行任务链表的表头;链表节点的类型是struct callback_head,它包含一个next指针和一个需要执行的函数指针func。
205 /** 206 * struct callback_head - callback structure for use with RCU and task_work 207 * @next: next update requests in a list 208 * @func: actual update function to call after the grace period. 209 */ 210 struct callback_head { 211 struct callback_head *next; 212 void (*func)(struct callback_head *head); 213 };
4 5 static struct callback_head work_exited; /* all we need is ->next == NULL */ 6 7 /** 8 * task_work_add - ask the @task to execute @work->func() 9 * @task: the task which should run the callback 10 * @work: the callback to run 11 * @notify: send the notification if true 12 * 13 * Queue @work for task_work_run() below and notify the @task if @notify. 14 * Fails if the @task is exiting/exited and thus it can't process this @work. 15 * Otherwise @work->func() will be called when the @task returns from kernel 16 * mode or exits. 17 * 18 * This is like the signal handler which runs in kernel mode, but it doesn't 19 * try to wake up the @task. 20 * 21 * RETURNS: 22 * 0 if succeeds or -ESRCH. 23 */ 24 int 25 task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 26 { 27 struct callback_head *head; 28 29 do { 30 head = ACCESS_ONCE(task->task_works); 31 if (unlikely(head == &work_exited)) 32 return -ESRCH; 33 work->next = head; 34 } while (cmpxchg(&task->task_works, head, work) != head); 35 36 if (notify) 37 set_notify_resume(task); 38 return 0; 39 }
主要工作:
1. 通过CAS以无锁的形式添加了一个链表元素。(新元素排在原有链表头部)
2. 如果参数notify为真,则调用set_notify_resume函数,给指定的进程设置_TIF_NOTIFY_RESUME标记。
task_work_run执行时机
在返回用户态之前,内核会检查当前进程的标记,如果_TIF_DO_NOTIFY_MASK中的任一标记置位,则会调用do_notify_resume。
595 int_signal: 596 testl $_TIF_DO_NOTIFY_MASK,%edx 597 jz 1f 598 movq %rsp,%rdi # &ptregs -> arg1 599 xorl %esi,%esi # oldset -> arg2 600 call do_notify_resume 601 1: movl $_TIF_WORK_MASK,%edi 602 int_restore_rest: 603 RESTORE_REST 604 DISABLE_INTERRUPTS(CLBR_NONE) 605 TRACE_IRQS_OFF 606 jmp int_with_check 607 CFI_ENDPROC 608 END(system_call)
以上代码出自entry_64.S,而这些标记定义在头文件thread_info.h中
130 /* work to do on interrupt/exception return */ 131 #define _TIF_WORK_MASK \ 132 (0x0000FFFF & \ 133 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ 134 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
70 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ 71 #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ 72 #define TIF_SIGPENDING 2 /* signal pending */ 73 #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ 74 #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ 75 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ 76 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 77 #define TIF_SECCOMP 8 /* secure computing */ 78 #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 79 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ 80 #define TIF_UPROBE 12 /* breakpointed or singlestepping */ 81 #define TIF_NOTSC 16 /* TSC is not accessible in userland */ 82 #define TIF_IA32 17 /* IA32 compatibility process */ 83 #define TIF_FORK 18 /* ret_from_fork */ 84 #define TIF_NOHZ 19 /* in adaptive nohz mode */ 85 #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ 86 #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ 87 #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 88 #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ 89 #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ 90 #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ 91 #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ 92 #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ 93 #define TIF_X32 30 /* 32-bit native x86-64 binary */ 94
即_TIF_WORK_MASK表示低16位(0x0000FFFF)标记中,除开(_TIF_SYSCALL_TRACE, _TIF_SYSCALL_AUDIT, _TIF_SINGLESTEP, _TIF_SECCOMP, _TIF_SYSCALL_EMU)之外的所有标记。TIF_NOTIFY_RESUME是第1位,自然包括在其中。
do_notify_resume函数
729 /* 730 * notification of userspace execution resumption 731 * - triggered by the TIF_WORK_MASK flags 732 */ 733 __visible void 734 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 735 { 736 user_exit(); 737 738 #ifdef CONFIG_X86_MCE 739 /* notify userspace of pending MCEs */ 740 if (thread_info_flags & _TIF_MCE_NOTIFY) 741 mce_notify_process(); 742 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 743 744 if (thread_info_flags & _TIF_UPROBE) 745 uprobe_notify_resume(regs); 746 747 /* deal with pending signal delivery */ 748 if (thread_info_flags & _TIF_SIGPENDING) 749 do_signal(regs); 750 751 if (thread_info_flags & _TIF_NOTIFY_RESUME) { 752 clear_thread_flag(TIF_NOTIFY_RESUME); 753 tracehook_notify_resume(regs); 754 } 755 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 756 fire_user_return_notifiers(); 757 758 user_enter(); 759 }
可以看到,当_TIF_NOTIFY_RESUME标记置位时,其中会先清除该标记,再调用tracehook_notify_resume函数;此外还会调用信号处理(do_signal)等其他相关函数。
tracehook_notify_resume
174 /** 175 * tracehook_notify_resume - report when about to return to user mode 176 * @regs: user-mode registers of @current task 177 * 178 * This is called when %TIF_NOTIFY_RESUME has been set. Now we are 179 * about to return to user mode, and the user state in @regs can be 180 * inspected or adjusted. The caller in arch code has cleared 181 * %TIF_NOTIFY_RESUME before the call. If the flag gets set again 182 * asynchronously, this will be called again before we return to 183 * user mode. 184 * 185 * Called without locks. 186 */ 187 static inline void tracehook_notify_resume(struct pt_regs *regs) 188 { 189 /* 190 * The caller just cleared TIF_NOTIFY_RESUME. This barrier 191 * pairs with task_work_add()->set_notify_resume() after 192 * hlist_add_head(task->task_works); 193 */ 194 smp_mb__after_atomic(); 195 if (unlikely(current->task_works)) 196 task_work_run(); 197 }
只有在当前进程的task_works不为NULL(即有待执行的任务)的情况下,才会调用task_work_run。
task_work_run
77 /** 78 * task_work_run - execute the works added by task_work_add() 79 * 80 * Flush the pending works. Should be used by the core kernel code. 81 * Called before the task returns to the user-mode or stops, or when 82 * it exits. In the latter case task_work_add() can no longer add the 83 * new work after task_work_run() returns. 84 */ 85 void task_work_run(void) 86 { 87 struct task_struct *task = current; 88 struct callback_head *work, *head, *next; 89 90 for (;;) { 91 /* 92 * work->func() can do task_work_add(), do not set 93 * work_exited unless the list is empty. 94 */ 95 do { 96 work = ACCESS_ONCE(task->task_works); 97 head = !work && (task->flags & PF_EXITING) ? 98 &work_exited : NULL; 99 } while (cmpxchg(&task->task_works, work, head) != work); 100 101 if (!work) 102 break; 103 /* 104 * Synchronize with task_work_cancel(). It can't remove 105 * the first entry == work, cmpxchg(task_works) should 106 * fail, but it can play with *work and other entries. 107 */ 108 raw_spin_unlock_wait(&task->pi_lock); 109 smp_mb(); 110 111 /* Reverse the list to run the works in fifo order */ 112 head = NULL; 113 do { 114 next = work->next; 115 work->next = head; 116 head = work; 117 work = next; 118 } while (work); 119 120 work = head; 121 do { 122 next = work->next; 123 work->func(work); 124 work = next; 125 cond_resched(); 126 } while (work); 127 } 128 }
1. 通过CAS,以无锁的方式取下整个task_works链表,同时把链表头置为NULL(若进程正在退出且链表已空,则置为work_exited,此后task_work_add会返回-ESRCH)
2. 因为原链表是按元素添加到链表的时间逆序排列的(见task_work_add),先把链表反转一遍
3. 反转链表后,遍历链表,执行各个元素的任务函数即work->func(work)