Linux内核架构读书笔记- 2.4.1 进程复制
Linux 下进程复制三个函数
- fork 建立父进程的一个副本,采用了cow 技术
- vfork 类似于fork,但是不创建父进程数据的副本,父子之间共享数据
- clone 产生线程,可以对父子进程之间的共享、复制进行精确控制
1 Cow ( copy on write )
2 执行系统调用
系统相关 系统无关
fork -> sys_fork -> do_fork
vfork -> sys_vfork -> do_fork
clone -> sys_clone -> do_fork
do_fork 原型:
1 /* 2 * Ok, this is the main fork-routine. 3 * 4 * It copies the process, and if successful kick-starts 5 * it and waits for it to finish using the VM if required. 6 */ 7 long do_fork(unsigned long clone_flags, 8 unsigned long stack_start, 9 struct pt_regs *regs, 10 unsigned long stack_size, 11 int __user *parent_tidptr, 12 int __user *child_tidptr)
clone_flags: 标志集合,用于控制复制过程中的一些属性。最低字节指定子进程终止时发送给父进程的信号,其余高位字节保存一些常数(标志)
stack_start:用户栈的起始地址
regs :指向寄存器集合的指针,struct pt_regs 是特定于体系结构
stack_size:用户态栈的大小,该参数没必要,设置为0
parent_tidptr 、child_tidptr :指向用户空间中的两个地址,分别用于保存父子进程的TID(pid tid 区别 参考 http://stackoverflow.com/questions/4517301/difference-between-pid-and-tid )
sys_fork 体系相关,
/arch/x86/kernel/process_32.c
1 asmlinkage int sys_fork(struct pt_regs regs) 2 { 3 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 4 }
sys_vfork 与 sys_fork 略微不同,前者使用额外的标志
/arch/x86/kernel/process_32.c
1 /* 2 * This is trivial, and on the face of it looks like it 3 * could equally well be done in user mode. 4 * 5 * Not so, for quite unobvious reasons - register pressure. 6 * In user mode vfork() cannot have a stack frame, and if 7 * done by calling the "clone()" system call directly, you 8 * do not have enough call-clobbered registers to hold all 9 * the information you need. 10 */ 11 asmlinkage int sys_vfork(struct pt_regs regs) 12 { 13 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 14 }
sys_clone类似
/arch/x86/kernel/process_32.c
1 asmlinkage int sys_clone(struct pt_regs regs) 2 { 3 unsigned long clone_flags; 4 unsigned long newsp; 5 int __user *parent_tidptr, *child_tidptr; 6 7 clone_flags = regs.ebx; 8 newsp = regs.ecx; 9 parent_tidptr = (int __user *)regs.edx; 10 child_tidptr = (int __user *)regs.edi; 11 if (!newsp) 12 newsp = regs.esp; 13 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 14 }
3 do_fork
copy_process 后文描述
确定pid:如果设置了CLONE_NEWPID,调用 task_pid_nr_ns;否则调用 task_pid_vnr 获取局部id,代码如下
kernel/fork.c
1 /* 2 * this is enough to call pid_nr_ns here, but this if 3 * improves optimisation of regular fork() 4 */ 5 nr = (clone_flags & CLONE_NEWPID) ? 6 task_pid_nr_ns(p, current->nsproxy->pid_ns) : 7 task_pid_vnr(p);
如果使用ptrace 监控新的进程,创建新的进程后会向其发送 SIGSTOP信号,便于调试器检查数据
kernel/fork.c
1 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { 2 /* 3 * We'll start up with an immediate SIGSTOP. 4 */ 5 sigaddset(&p->pending.signal, SIGSTOP); 6 set_tsk_thread_flag(p, TIF_SIGPENDING); 7 }
子进程使用wake_up_new_task唤醒,即将其task_struct 加入到调度器队列
如果子进程在父进程之前开始运行,可以大大减少复制内存页的工作量
kernel/fork.c
1 if (!(clone_flags & CLONE_STOPPED)) 2 wake_up_new_task(p, clone_flags); 3 else 4 p->state = TASK_STOPPED;
kernel/sched.c
/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); p->prio = effective_prio(p); if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { /* * Let the scheduling class do new task startup * management (if any): */ p->sched_class->task_new(rq, p); inc_nr_running(p, rq); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); }
如果使用vfork,需要启用子进程的完成机制,子进程task_struct 的 vfork_done 成员即用于此。借助于wait_for_completion,父进程一直睡眠直到子进程退出;在子进程退出时,内核调用complete(vfork_done),唤醒因该变量睡眠的进程。通过采用这种方法,内核可以确保vfork生成的子进程的父进程一直处于不活动状态,直至子进程退出或执行一个新的程序。父进程的临时睡眠,也确保了两个进程不会彼此干扰或操作对方的地址空间
kernel/fork.c
1 if (clone_flags & CLONE_VFORK) { 2 freezer_do_not_count(); 3 wait_for_completion(&vfork); 4 freezer_count(); 5 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { 6 current->ptrace_message = nr; 7 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 8 } 9 }
4 进程复制
主要由copy_process 完成,下面是简化版本
copy_process 定义
kernel/fork.c
1 /* 2 * This creates a new process as a copy of the old one, 3 * but does not actually start it yet. 4 * 5 * It copies the registers, and all the appropriate 6 * parts of the process environment (as per the clone 7 * flags). The actual kick-off is left to the caller. 8 */ 9 static struct task_struct *copy_process(unsigned long clone_flags, 10 unsigned long stack_start, 11 struct pt_regs *regs, 12 unsigned long stack_size, 13 int __user *child_tidptr, 14 struct pid *pid)
复制受到许多标志的控制,可以参考 clone(2)
有些标志的组合是没有意义的,
kernel/fork.c
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL);
注意
linux/err.h
1 static inline void *ERR_PTR(long error) 2 { 3 return (void *) error; 4 }
dup_task_struct 建立父进程的副本
父子进程的task_struct 实例只有一个不同,新进程分配了一个新的核心态栈,即task_struct->stack,栈和thread_info一同保存在一个联合中,thread_info保存了线程需要的特定于处理器的底层信息。
kernel/sched.c
1 union thread_union { 2 struct thread_info thread_info; 3 unsigned long stack[THREAD_SIZE/sizeof(long)]; 4 };
在大多数体系结构上,使用一两个内存页来保存thread_union实例。
thread_info 的定义特定于体系结构,但是大部分体系结构上的内容类似,例如:
<asm-alpha/thread_info.h>
1 struct thread_info { 2 struct pcb_struct pcb; /* palcode state */ 3 4 struct task_struct *task; /* main task structure */ 5 unsigned int flags; /* low level flags */ 6 unsigned int ieee_state; /* see fpu.h */ 7 8 struct exec_domain *exec_domain; /* execution domain */ 9 mm_segment_t addr_limit; /* thread address space */ 10 unsigned cpu; /* current CPU */ 11 int preempt_count; /* 0 => preemptable, <0 => BUG */ 12 13 int bpt_nsaved; 14 unsigned long bpt_addr[2]; /* breakpoint handling */ 15 unsigned int bpt_insn[2]; 16 17 struct restart_block restart_block; 18 };
task 指向进程的task_struct
exec_domain 用于实现执行域(execution domain),后者用于在一类计算机上实现多种ABI(应用程序二进制接口),例如:在64位系统上运行32位程序
flags:特定于进程的标志,我们只关心两个
TIF_SIGPENDING:如果进程有待决信号则置位
TIF_NEED_RESCHED:表示该进程应该或者想要调度器选择另一个进程替换本进程执行。
其他都是与硬件相关的,几乎不使用
cpu:进程正在其上执行的CPU编号
preempt_count实现内核抢占所需要的一个计数器
addr_limit :指定进程可以使用的虚拟地址上限,该限制只限制普通进程,内核线程可以访问整个虚拟地址空间
restart_block 用于实现信号机制
task_struct 、thread_info 和内核栈的关系
内核组件使用了过多的栈空间时,内核栈会溢出,内核提供了kstack_end 函数,用于判断给定的地址是否位于栈的有效部分
dup_task_struct 复制父进程task_struct 和thread_info ,此时父子进程的task_struct除了stack以外都是一样的,子进程的task_struct 会在copy_process过程中修改
current && current_thread_info 一般体系都定义成宏。current 用于获取当前task_struct,current_thread_info 用于获取当前 thread_info
继续copy_process
dup_task_struct成功之后,检查特定用户是否超过创建最大进程数目
kernel/fork.c
1 if (atomic_read(&p->user->processes) >= 2 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 3 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 4 p->user != current->nsproxy->user_ns->root_user) 5 goto bad_fork_free; 6 }
接下来调用sched_fork ,对子进程的状态进行初始化,为挂到运行队列上作准备(后面分析调度,待续 。。。)
接下来调用copy_xyz 函数
kernel/fork.c
1 if ((retval = security_task_alloc(p))) 2 goto bad_fork_cleanup_policy; 3 if ((retval = audit_alloc(p))) 4 goto bad_fork_cleanup_security; 5 /* copy all the process information */ 6 if ((retval = copy_semundo(clone_flags, p))) 7 goto bad_fork_cleanup_audit; 8 if ((retval = copy_files(clone_flags, p))) 9 goto bad_fork_cleanup_semundo; 10 if ((retval = copy_fs(clone_flags, p))) 11 goto bad_fork_cleanup_files; 12 if ((retval = copy_sighand(clone_flags, p))) 13 goto bad_fork_cleanup_fs; 14 if ((retval = copy_signal(clone_flags, p))) 15 goto bad_fork_cleanup_sighand; 16 if ((retval = copy_mm(clone_flags, p))) 17 goto bad_fork_cleanup_signal; 18 if ((retval = copy_keys(clone_flags, p))) 19 goto bad_fork_cleanup_mm; 20 if ((retval = copy_namespaces(clone_flags, p))) 21 goto bad_fork_cleanup_keys; 22 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
- 如果CLONE_SYSVSEM 置位,则copy_semundo 使用父进程的System V 信号量
- 如果CLONE_FILES置位,则copy_files使用父进程的文件描述符,否则创建新的files结构,包含的信息与父进程相同
- 如果CLONE_FS置位,则copy_fs 使用父进程的文件系统上下文
- 如果CLONE_SIGHAND或CLONE_THREAD置位,则使用父进程的信号处理程序
- 如果CLONE_VM置位,copy_mm让父子进程共享同一地址空间,此时两个进程共享同一个mm_struct
- 如果CLONE_VM没有置位,也并不一定要立即复制进程的整个地址空间,而是采用cow
- copy_namespaces 创建子进程的命名空间
- copy_thread特定于体系结构
继续copy_process ,内核需要填充task_struct父子进程不同的各个成员,包含如下
task_struct 的链表元素
间隔定时器成员,cpu_timers
待决信号列表(TODO。。。)
设置id
对于线程,线程组ID与分支进程相同:
kernel/fork.c
1 p->pid = pid_nr(pid); 2 p->tgid = p->pid; 3 if (clone_flags & CLONE_THREAD) 4 p->tgid = current->tgid;
对于普通进程,父进程是分支进程;对于线程,由于线程被视为分支进程内部的第二(第三、第四等)个执行序列,其父进程应该是分支进程的父进程:
kernel/fork.c
1 /* CLONE_PARENT re-uses the old parent */ 2 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 3 p->real_parent = current->real_parent; 4 else 5 p->real_parent = current; 6 p->parent = p->real_parent;
非线程的普通进程可以通过设置CLONE_PARENT 触发同样的行为。另外,普通进程的线程组组长是进程本身;对于线程,其组长是当前进程的组长
kernel/fork.c
1 p->group_leader = p; 2 if (clone_flags & CLONE_THREAD) { 3 p->group_leader = current->group_leader; 4 }
新进程必须通过children链表与父进程连接起来,通过宏add_parent处理的 ,此外新进程必须加入到ID数据结构体系中
kernel/fork.c
1 add_parent(p); 2 if (unlikely(p->ptrace & PT_PTRACED)) 3 __ptrace_link(p, current->parent); 4 5 if (thread_group_leader(p)) { 6 if (clone_flags & CLONE_NEWPID) 7 p->nsproxy->pid_ns->child_reaper = p; 8 9 p->signal->tty = current->signal->tty; 10 set_task_pgrp(p, task_pgrp_nr(current)); 11 set_task_session(p, task_session_nr(current)); 12 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 13 attach_pid(p, PIDTYPE_SID, task_session(current)); 14 list_add_tail_rcu(&p->tasks, &init_task.tasks); 15 __get_cpu_var(process_counts)++; 16 } 17 attach_pid(p, PIDTYPE_PID, pid);
5创建线程时的特别问题
讲一下用户线程库用于实现多线程功能的标志
CLONE_PARENT_SETTID将生成线程的pid复制到clone调用指定的用户空间的某个地址
1 if (clone_flags & CLONE_PARENT_SETTID) 2 put_user(nr, parent_tidptr);
CLONE_CHILD_SETTID将另一个传递到CLONE的用户空间指针保存在新进程的task_struct
1 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
在进程第一次执行时,内核会调用schedule_tail函数将当前PID复制到该地址
kernel/sched.c
/** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); #endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); }
CLONE_CHILD_CLEARTID首先会在copy_process将用户空间指针child_tidptr保存在task_struct 中,注意这次是另一个不同成员
kernel/fork.c
1 /* 2 * Clear TID on mm_release()? 3 */ 4 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
/* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. * * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* notify parent sleeping on vfork() */ if (vfork_done) { tsk->vfork_done = NULL; complete(vfork_done); } /* * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary * trouble otherwise. Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid && !(tsk->flags & PF_SIGNALED) && atomic_read(&mm->mm_users) > 1) { u32 __user * tidptr = tsk->clear_child_tid; tsk->clear_child_tid = NULL; /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ put_user(0, tidptr); sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0); } }
上述标志可以用于从用户空间检测内核中线程的产生和销毁,CLONE_CHILD_SETTID和CLONE_PARENT_SETTID用于检测线程的生成,CLONE_CHILD_CLEARTID用于在线程结束时从内核向用户空间传递消息。