关于linux的一点好奇心(五):进程线程的创建

  一直以来,进程和线程的区别都是面试官爱拿来考察面试者的问题,可见这事并不简单。简单说,差异在于:进程拥有独立的内存等资源,而线程则共享其所属进程的资源。也就是说线程不单独拥有内存资源,对系统的消耗更小,所以线程也有轻量级进程的说法。

  除了资源消耗,进程和线程还有一个值得说明的差别:内存的可见性。进程与进程之间的资源是相互隔离的,想共享变量很难做到;而多个线程共享同一份内存资源,一个线程对内存的修改,其他线程是可以感知到的,这给应用做数据共享和并发控制带来了很大方便(进程之间其实也能共享信息,即进程间通信,只是开销太大,应用层难以承受)。所以,一般的编程语言都会提供多线程的创建和使用方法。
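  下面是一个示意性的小例子(使用 POSIX 线程接口,编译时需加 -pthread):两个线程并发累加同一个全局变量,彼此的修改相互可见;也正因为共享内存,需要加锁做并发控制。

#include <pthread.h>
#include <stdio.h>

static int counter = 0;                          /* 进程内全局变量,所有线程共享可见 */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
    for (int i = 0; i < 100000; i++) {
        pthread_mutex_lock(&lock);               /* 共享内存带来竞争,需要加锁保护 */
        counter++;
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, worker, NULL);
    pthread_create(&t2, NULL, worker, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("counter = %d\n", counter);           /* 两个线程的修改彼此可见,结果为 200000 */
    return 0;
}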

  那么,进程和线程到底是如何创建的呢?我们知道,所有上层语言都依赖操作系统底层的基础实现,进线程的创建作为非常核心的操作,自然最终也要交给操作系统处理。所以,我们来看看操作系统是如何创建进程和线程的。

 

1. linux创建进程概述

  linux中进程和线程使用同一个数据结构 task_struct 来描述,也就是说内核并没有特别区分进程和线程。这样做的好处是,进程和线程都作为独立的调度单元对待,不需要特殊处理,减少了调度算法的复杂性。

  一般地,我们可以通过 fork() 函数直接创建进程,通过 pthread_create() 函数创建线程,当然这些都是外层的api函数。我们就以此二者作为入口,研究进线程的创建过程。

  进线程作为调度的基本单元,自然有一个对应的描述结构,即 task_struct。它描述了一个进程到底有哪些内容和能力,是一个非常重要的抽象。

// include/linux/sched.h
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /*
     * For reasons of header soup (see current_thread_info()), this
     * must be the first element of task_struct.
     */
    struct thread_info        thread_info;
#endif
    /* -1 unrunnable, 0 runnable, >0 stopped: */
    volatile long            state;

    /*
     * This begins the randomizable portion of task_struct. Only
     * scheduling-critical items should be added above here.
     */
    randomized_struct_fields_start

    void                *stack;
    atomic_t            usage;
    /* Per task flags (PF_*), defined further below: */
    unsigned int            flags;
    unsigned int            ptrace;

#ifdef CONFIG_SMP
    struct llist_node        wake_entry;
    int                on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /* Current CPU: */
    unsigned int            cpu;
#endif
    unsigned int            wakee_flips;
    unsigned long            wakee_flip_decay_ts;
    struct task_struct        *last_wakee;

    /*
     * recent_used_cpu is initially set as the last CPU used by a task
     * that wakes affine another task. Waker/wakee relationships can
     * push tasks around a CPU where each wakeup moves to the next one.
     * Tracking a recently used CPU allows a quick search for a recently
     * used CPU that may be idle.
     */
    int                recent_used_cpu;
    int                wake_cpu;
#endif
    int                on_rq;

    int                prio;
    int                static_prio;
    int                normal_prio;
    unsigned int            rt_priority;

    const struct sched_class    *sched_class;
    struct sched_entity        se;
    struct sched_rt_entity        rt;
#ifdef CONFIG_CGROUP_SCHED
    struct task_group        *sched_task_group;
#endif
    struct sched_dl_entity        dl;

#ifdef CONFIG_PREEMPT_NOTIFIERS
    /* List of struct preempt_notifier: */
    struct hlist_head        preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
    unsigned int            btrace_seq;
#endif

    unsigned int            policy;
    int                nr_cpus_allowed;
    cpumask_t            cpus_allowed;

#ifdef CONFIG_PREEMPT_RCU
    int                rcu_read_lock_nesting;
    union rcu_special        rcu_read_unlock_special;
    struct list_head        rcu_node_entry;
    struct rcu_node            *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
    unsigned long            rcu_tasks_nvcsw;
    u8                rcu_tasks_holdout;
    u8                rcu_tasks_idx;
    int                rcu_tasks_idle_cpu;
    struct list_head        rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

    struct sched_info        sched_info;

    struct list_head        tasks;
#ifdef CONFIG_SMP
    struct plist_node        pushable_tasks;
    struct rb_node            pushable_dl_tasks;
#endif

    struct mm_struct        *mm;
    struct mm_struct        *active_mm;

    /* Per-thread vma caching: */
    struct vmacache            vmacache;

#ifdef SPLIT_RSS_COUNTING
    struct task_rss_stat        rss_stat;
#endif
    int                exit_state;
    int                exit_code;
    int                exit_signal;
    /* The signal sent when the parent dies: */
    int                pdeath_signal;
    /* JOBCTL_*, siglock protected: */
    unsigned long            jobctl;

    /* Used for emulating ABI behavior of previous Linux versions: */
    unsigned int            personality;

    /* Scheduler bits, serialized by scheduler locks: */
    unsigned            sched_reset_on_fork:1;
    unsigned            sched_contributes_to_load:1;
    unsigned            sched_migrated:1;
    unsigned            sched_remote_wakeup:1;
    /* Force alignment to the next boundary: */
    unsigned            :0;

    /* Unserialized, strictly 'current' */

    /* Bit to tell LSMs we're in execve(): */
    unsigned            in_execve:1;
    unsigned            in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
    unsigned            restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
    unsigned            memcg_may_oom:1;
#ifndef CONFIG_SLOB
    unsigned            memcg_kmem_skip_account:1;
#endif
#endif
#ifdef CONFIG_COMPAT_BRK
    unsigned            brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
    /* disallow userland-initiated cgroup migration */
    unsigned            no_cgroup_migration:1;
#endif

    unsigned long            atomic_flags; /* Flags requiring atomic access. */

    struct restart_block        restart_block;

    pid_t                pid;
    pid_t                tgid;

#ifdef CONFIG_STACKPROTECTOR
    /* Canary value for the -fstack-protector GCC feature: */
    unsigned long            stack_canary;
#endif
    /*
     * Pointers to the (original) parent process, youngest child, younger sibling,
     * older sibling, respectively.  (p->father can be replaced with
     * p->real_parent->pid)
     */

    /* Real parent process: */
    struct task_struct __rcu    *real_parent;

    /* Recipient of SIGCHLD, wait4() reports: */
    struct task_struct __rcu    *parent;

    /*
     * Children/sibling form the list of natural children:
     */
    struct list_head        children;
    struct list_head        sibling;
    struct task_struct        *group_leader;

    /*
     * 'ptraced' is the list of tasks this task is using ptrace() on.
     *
     * This includes both natural children and PTRACE_ATTACH targets.
     * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
     */
    struct list_head        ptraced;
    struct list_head        ptrace_entry;

    /* PID/PID hash table linkage. */
    struct pid_link            pids[PIDTYPE_MAX];
    struct list_head        thread_group;
    struct list_head        thread_node;

    struct completion        *vfork_done;

    /* CLONE_CHILD_SETTID: */
    int __user            *set_child_tid;

    /* CLONE_CHILD_CLEARTID: */
    int __user            *clear_child_tid;

    u64                utime;
    u64                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    u64                utimescaled;
    u64                stimescaled;
#endif
    u64                gtime;
    struct prev_cputime        prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    struct vtime            vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
    atomic_t            tick_dep_mask;
#endif
    /* Context switch counts: */
    unsigned long            nvcsw;
    unsigned long            nivcsw;

    /* Monotonic time in nsecs: */
    u64                start_time;

    /* Boot based time in nsecs: */
    u64                real_start_time;

    /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
    unsigned long            min_flt;
    unsigned long            maj_flt;

#ifdef CONFIG_POSIX_TIMERS
    struct task_cputime        cputime_expires;
    struct list_head        cpu_timers[3];
#endif

    /* Process credentials: */

    /* Tracer's credentials at attach: */
    const struct cred __rcu        *ptracer_cred;

    /* Objective and real subjective task credentials (COW): */
    const struct cred __rcu        *real_cred;

    /* Effective (overridable) subjective task credentials (COW): */
    const struct cred __rcu        *cred;

    /*
     * executable name, excluding path.
     *
     * - normally initialized setup_new_exec()
     * - access it with [gs]et_task_comm()
     * - lock it with task_lock()
     */
    char                comm[TASK_COMM_LEN];

    struct nameidata        *nameidata;

#ifdef CONFIG_SYSVIPC
    struct sysv_sem            sysvsem;
    struct sysv_shm            sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
    unsigned long            last_switch_count;
#endif
    /* Filesystem information: */
    struct fs_struct        *fs;

    /* Open file information: */
    struct files_struct        *files;

    /* Namespaces: */
    struct nsproxy            *nsproxy;

    /* Signal handlers: */
    struct signal_struct        *signal;
    struct sighand_struct        *sighand;
    sigset_t            blocked;
    sigset_t            real_blocked;
    /* Restored if set_restore_sigmask() was used: */
    sigset_t            saved_sigmask;
    struct sigpending        pending;
    unsigned long            sas_ss_sp;
    size_t                sas_ss_size;
    unsigned int            sas_ss_flags;

    struct callback_head        *task_works;

    struct audit_context        *audit_context;
#ifdef CONFIG_AUDITSYSCALL
    kuid_t                loginuid;
    unsigned int            sessionid;
#endif
    struct seccomp            seccomp;

    /* Thread group tracking: */
    u32                parent_exec_id;
    u32                self_exec_id;

    /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
    spinlock_t            alloc_lock;

    /* Protection of the PI data structures: */
    raw_spinlock_t            pi_lock;

    struct wake_q_node        wake_q;

#ifdef CONFIG_RT_MUTEXES
    /* PI waiters blocked on a rt_mutex held by this task: */
    struct rb_root_cached        pi_waiters;
    /* Updated under owner's pi_lock and rq lock */
    struct task_struct        *pi_top_task;
    /* Deadlock detection and priority inheritance handling: */
    struct rt_mutex_waiter        *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    /* Mutex deadlock detection: */
    struct mutex_waiter        *blocked_on;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
    unsigned int            irq_events;
    unsigned long            hardirq_enable_ip;
    unsigned long            hardirq_disable_ip;
    unsigned int            hardirq_enable_event;
    unsigned int            hardirq_disable_event;
    int                hardirqs_enabled;
    int                hardirq_context;
    unsigned long            softirq_disable_ip;
    unsigned long            softirq_enable_ip;
    unsigned int            softirq_disable_event;
    unsigned int            softirq_enable_event;
    int                softirqs_enabled;
    int                softirq_context;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH            48UL
    u64                curr_chain_key;
    int                lockdep_depth;
    unsigned int            lockdep_recursion;
    struct held_lock        held_locks[MAX_LOCK_DEPTH];
#endif

#ifdef CONFIG_UBSAN
    unsigned int            in_ubsan;
#endif

    /* Journalling filesystem info: */
    void                *journal_info;

    /* Stacked block device info: */
    struct bio_list            *bio_list;

#ifdef CONFIG_BLOCK
    /* Stack plugging: */
    struct blk_plug            *plug;
#endif

    /* VM state: */
    struct reclaim_state        *reclaim_state;

    struct backing_dev_info        *backing_dev_info;

    struct io_context        *io_context;

    /* Ptrace state: */
    unsigned long            ptrace_message;
    siginfo_t            *last_siginfo;

    struct task_io_accounting    ioac;
#ifdef CONFIG_TASK_XACCT
    /* Accumulated RSS usage: */
    u64                acct_rss_mem1;
    /* Accumulated virtual memory usage: */
    u64                acct_vm_mem1;
    /* stime + utime since last update: */
    u64                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
    /* Protected by ->alloc_lock: */
    nodemask_t            mems_allowed;
    /* Seqence number to catch updates: */
    seqcount_t            mems_allowed_seq;
    int                cpuset_mem_spread_rotor;
    int                cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
    /* Control Group info protected by css_set_lock: */
    struct css_set __rcu        *cgroups;
    /* cg_list protected by css_set_lock and tsk->alloc_lock: */
    struct list_head        cg_list;
#endif
#ifdef CONFIG_INTEL_RDT
    u32                closid;
    u32                rmid;
#endif
#ifdef CONFIG_FUTEX
    struct robust_list_head __user    *robust_list;
#ifdef CONFIG_COMPAT
    struct compat_robust_list_head __user *compat_robust_list;
#endif
    struct list_head        pi_state_list;
    struct futex_pi_state        *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
    struct perf_event_context    *perf_event_ctxp[perf_nr_task_contexts];
    struct mutex            perf_event_mutex;
    struct list_head        perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
    unsigned long            preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
    /* Protected by alloc_lock: */
    struct mempolicy        *mempolicy;
    short                il_prev;
    short                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
    int                numa_scan_seq;
    unsigned int            numa_scan_period;
    unsigned int            numa_scan_period_max;
    int                numa_preferred_nid;
    unsigned long            numa_migrate_retry;
    /* Migration stamp: */
    u64                node_stamp;
    u64                last_task_numa_placement;
    u64                last_sum_exec_runtime;
    struct callback_head        numa_work;

    struct list_head        numa_entry;
    struct numa_group        *numa_group;

    /*
     * numa_faults is an array split into four regions:
     * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
     * in this precise order.
     *
     * faults_memory: Exponential decaying average of faults on a per-node
     * basis. Scheduling placement decisions are made based on these
     * counts. The values remain static for the duration of a PTE scan.
     * faults_cpu: Track the nodes the process was running on when a NUMA
     * hinting fault was incurred.
     * faults_memory_buffer and faults_cpu_buffer: Record faults per node
     * during the current scan window. When the scan completes, the counts
     * in faults_memory and faults_cpu decay and these values are copied.
     */
    unsigned long            *numa_faults;
    unsigned long            total_numa_faults;

    /*
     * numa_faults_locality tracks if faults recorded during the last
     * scan window were remote/local or failed to migrate. The task scan
     * period is adapted based on the locality of the faults with different
     * weights depending on whether they were shared or private faults
     */
    unsigned long            numa_faults_locality[3];

    unsigned long            numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
    struct rseq __user *rseq;
    u32 rseq_len;
    u32 rseq_sig;
    /*
     * RmW on rseq_event_mask must be performed atomically
     * with respect to preemption.
     */
    unsigned long rseq_event_mask;
#endif

    struct tlbflush_unmap_batch    tlb_ubc;

    struct rcu_head            rcu;

    /* Cache last used pipe for splice(): */
    struct pipe_inode_info        *splice_pipe;

    struct page_frag        task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
    struct task_delay_info        *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
    int                make_it_fail;
    unsigned int            fail_nth;
#endif
    /*
     * When (nr_dirtied >= nr_dirtied_pause), it's time to call
     * balance_dirty_pages() for a dirty throttling pause:
     */
    int                nr_dirtied;
    int                nr_dirtied_pause;
    /* Start of a write-and-pause period: */
    unsigned long            dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
    int                latency_record_count;
    struct latency_record        latency_record[LT_SAVECOUNT];
#endif
    /*
     * Time slack values; these are used to round up poll() and
     * select() etc timeout values. These are in nanoseconds.
     */
    u64                timer_slack_ns;
    u64                default_timer_slack_ns;

#ifdef CONFIG_KASAN
    unsigned int            kasan_depth;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
    /* Index of current stored address in ret_stack: */
    int                curr_ret_stack;

    /* Stack of return addresses for return function tracing: */
    struct ftrace_ret_stack        *ret_stack;

    /* Timestamp for last schedule: */
    unsigned long long        ftrace_timestamp;

    /*
     * Number of functions that haven't been traced
     * because of depth overrun:
     */
    atomic_t            trace_overrun;

    /* Pause tracing: */
    atomic_t            tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
    /* State flags for use by tracers: */
    unsigned long            trace;

    /* Bitmask and counter of trace recursion: */
    unsigned long            trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
    /* Coverage collection mode enabled for this task (0 if disabled): */
    unsigned int            kcov_mode;

    /* Size of the kcov_area: */
    unsigned int            kcov_size;

    /* Buffer for coverage collection: */
    void                *kcov_area;

    /* KCOV descriptor wired with this task or NULL: */
    struct kcov            *kcov;
#endif

#ifdef CONFIG_MEMCG
    struct mem_cgroup        *memcg_in_oom;
    gfp_t                memcg_oom_gfp_mask;
    int                memcg_oom_order;

    /* Number of pages to reclaim on returning to userland: */
    unsigned int            memcg_nr_pages_over_high;
#endif

#ifdef CONFIG_UPROBES
    struct uprobe_task        *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
    unsigned int            sequential_io;
    unsigned int            sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
    unsigned long            task_state_change;
#endif
    int                pagefault_disabled;
#ifdef CONFIG_MMU
    struct task_struct        *oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
    struct vm_struct        *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /* A live task holds one reference: */
    atomic_t            stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
    int patch_state;
#endif
#ifdef CONFIG_SECURITY
    /* Used by LSM modules for access restriction: */
    void                *security;
#endif

    /*
     * New fields for task_struct should be added above here, so that
     * they are included in the randomized portion of task_struct.
     */
    randomized_struct_fields_end

    /* CPU-specific state of this task: */
    struct thread_struct        thread;

    /*
     * WARNING: on x86, 'thread_struct' contains a variable-sized
     * structure.  It *MUST* be at the end of 'task_struct'.
     *
     * Do not put anything below here!
     */
};

  可以看到 task_struct 涉及的数据结构非常多,因为进程后续的各种操作:内存、io、cpu调度、文件等,都要基于它进行,自然任务艰巨。
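  一个最小的示意片段(以本文对应的 4.x 内核为例,个别字段名在更新的内核版本中可能有变化):内核代码中通过 current 宏即可拿到当前任务的 task_struct,进而访问上面这些字段。

#include <linux/sched.h>
#include <linux/printk.h>

/* current 宏指向当前 CPU 上正在运行任务的 task_struct */
static void show_current_task(void)
{
    pr_info("pid=%d tgid=%d comm=%s state=%ld\n",
            current->pid, current->tgid, current->comm, current->state);
}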

 

2. 进程的创建 fork

  这里说的fork,是应用层的api fork()。用法很简单:pid_t pid = fork(); 就可以得到新的进程,而且新进程创建后即自动开始运行。

  fork() 被调用一次,成功则会返回两次:在父进程中返回一次,返回值是子进程的 pid(大于0);在子进程中返回一次,返回值为 0;若创建失败则返回 -1,不会产生子进程。

  glibc 的 fork() 只是内核 fork/clone 系统调用的薄封装,内核侧的实现下文第 4 节会详细讲解。
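  下面是一个最小的示意例子,演示 fork() 的两次返回,以及父子进程地址空间相互独立(子进程对变量的修改,父进程看不到)。

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int value = 1;                    /* fork 后父子各持一份独立拷贝(写时复制) */
    pid_t pid = fork();

    if (pid < 0) {                    /* 创建失败 */
        perror("fork");
        exit(1);
    } else if (pid == 0) {            /* 子进程:fork 返回 0 */
        value = 100;
        printf("child:  pid=%d value=%d\n", getpid(), value);
        exit(0);
    }

    /* 父进程:fork 返回子进程的 pid */
    waitpid(pid, NULL, 0);
    printf("parent: child=%d value=%d\n", pid, value);   /* 仍为 1,不受子进程影响 */
    return 0;
}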

 

3. 线程的创建 pthread_create

  线程的创建方法为 pthread_create(),其在 glibc(NPTL)中的实现源码见: https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/pthread_create.c;h=308db65cd4c148f8a119ed9025af194946aa2c80;hb=HEAD

  首先,pthread_create() 函数是通过 versioned_symbol 导出的,其内部实现函数为 __pthread_create_2_1(),如下:

versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);

  所以我们可以从 __pthread_create_2_1 入手,研究线程的创建过程。

int
__pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
              void *(*start_routine) (void *), void *arg)
{
  STACK_VARIABLES;

  const struct pthread_attr *iattr = (struct pthread_attr *) attr;
  struct pthread_attr default_attr;
  bool free_cpuset = false;
  bool c11 = (attr == ATTR_C11_THREAD);
  if (iattr == NULL || c11)
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      default_attr = __default_pthread_attr;
      size_t cpusetsize = default_attr.cpusetsize;
      if (cpusetsize > 0)
    {
      cpu_set_t *cpuset;
      if (__glibc_likely (__libc_use_alloca (cpusetsize)))
        cpuset = __alloca (cpusetsize);
      else
        {
          cpuset = malloc (cpusetsize);
          if (cpuset == NULL)
        {
          lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
          return ENOMEM;
        }
          free_cpuset = true;
        }
      memcpy (cpuset, default_attr.cpuset, cpusetsize);
      default_attr.cpuset = cpuset;
    }
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
      iattr = &default_attr;
    }

  struct pthread *pd = NULL;
  int err = ALLOCATE_STACK (iattr, &pd);
  int retval = 0;

  if (__glibc_unlikely (err != 0))
    /* Something went wrong.  Maybe a parameter of the attributes is
       invalid or we could not allocate memory.  Note we have to
       translate error codes.  */
    {
      retval = err == ENOMEM ? EAGAIN : err;
      goto out;
    }


  /* Initialize the TCB.  All initializations with zero should be
     performed in 'get_cached_stack'.  This way we avoid doing this if
     the stack freshly allocated with 'mmap'.  */

#if TLS_TCB_AT_TP
  /* Reference to the TCB itself.  */
  pd->header.self = pd;

  /* Self-reference for TLS.  */
  pd->header.tcb = pd;
#endif

  /* Store the address of the start routine and the parameter.  Since
     we do not start the function directly the stillborn thread will
     get the information from its thread descriptor.  */
  pd->start_routine = start_routine;
  pd->arg = arg;
  pd->c11 = c11;

  /* Copy the thread attribute flags.  */
  struct pthread *self = THREAD_SELF;
  pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
           | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)));

  /* Initialize the field for the ID of the thread which is waiting
     for us.  This is a self-reference in case the thread is created
     detached.  */
  pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL;

  /* The debug events are inherited from the parent.  */
  pd->eventbuf = self->eventbuf;


  /* Copy the parent's scheduling parameters.  The flags will say what
     is valid and what is not.  */
  pd->schedpolicy = self->schedpolicy;
  pd->schedparam = self->schedparam;

  /* Copy the stack guard canary.  */
#ifdef THREAD_COPY_STACK_GUARD
  THREAD_COPY_STACK_GUARD (pd);
#endif

  /* Copy the pointer guard value.  */
#ifdef THREAD_COPY_POINTER_GUARD
  THREAD_COPY_POINTER_GUARD (pd);
#endif

  /* Setup tcbhead.  */
  tls_setup_tcbhead (pd);

  /* Verify the sysinfo bits were copied in allocate_stack if needed.  */
#ifdef NEED_DL_SYSINFO
  CHECK_THREAD_SYSINFO (pd);
#endif

  /* Inform start_thread (above) about cancellation state that might
     translate into inherited signal state.  */
  pd->parent_cancelhandling = THREAD_GETMEM (THREAD_SELF, cancelhandling);

  /* Determine scheduling parameters for the thread.  */
  if (__builtin_expect ((iattr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0, 0)
      && (iattr->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != 0)
    {
      /* Use the scheduling parameters the user provided.  */
      if (iattr->flags & ATTR_FLAG_POLICY_SET)
        {
          pd->schedpolicy = iattr->schedpolicy;
          pd->flags |= ATTR_FLAG_POLICY_SET;
        }
      if (iattr->flags & ATTR_FLAG_SCHED_SET)
        {
          /* The values were validated in pthread_attr_setschedparam.  */
          pd->schedparam = iattr->schedparam;
          pd->flags |= ATTR_FLAG_SCHED_SET;
        }

      if ((pd->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
          != (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
        collect_default_sched (pd);
    }

  if (__glibc_unlikely (__nptl_nthreads == 1))
    _IO_enable_locks ();

  /* Pass the descriptor to the caller.  */
  *newthread = (pthread_t) pd;

  LIBC_PROBE (pthread_create, 4, newthread, attr, start_routine, arg);

  /* One more thread.  We cannot have the thread do this itself, since it
     might exist but not have been scheduled yet by the time we've returned
     and need to check the value to behave correctly.  We must do it before
     creating the thread, in case it does get scheduled first and then
     might mistakenly think it was the only thread.  In the failure case,
     we momentarily store a false value; this doesn't matter because there
     is no kosher thing a signal handler interrupting us right here can do
     that cares whether the thread count is correct.  */
  atomic_increment (&__nptl_nthreads);

  /* Our local value of stopped_start and thread_ran can be accessed at
     any time. The PD->stopped_start may only be accessed if we have
     ownership of PD (see CONCURRENCY NOTES above).  */
  bool stopped_start = false; bool thread_ran = false;

  /* Start the thread.  */
  if (__glibc_unlikely (report_thread_creation (pd)))
    {
      stopped_start = true;

      /* We always create the thread stopped at startup so we can
     notify the debugger.  */
      retval = create_thread (pd, iattr, &stopped_start,
                  STACK_VARIABLES_ARGS, &thread_ran);
      if (retval == 0)
    {
      /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
         above).  */

      /* Assert stopped_start is true in both our local copy and the
         PD copy.  */
      assert (stopped_start);
      assert (pd->stopped_start);

      /* Now fill in the information about the new thread in
         the newly created thread's data structure.  We cannot let
         the new thread do this since we don't know whether it was
         already scheduled when we send the event.  */
      pd->eventbuf.eventnum = TD_CREATE;
      pd->eventbuf.eventdata = pd;

      /* Enqueue the descriptor.  */
      do
        pd->nextevent = __nptl_last_event;
      while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
                               pd, pd->nextevent)
         != 0);

      /* Now call the function which signals the event.  See
         CONCURRENCY NOTES for the nptl_db interface comments.  */
      __nptl_create_event ();
    }
    }
  else
    retval = create_thread (pd, iattr, &stopped_start,
                STACK_VARIABLES_ARGS, &thread_ran);

  if (__glibc_unlikely (retval != 0))
    {
      if (thread_ran)
    /* State (c) or (d) and we may not have PD ownership (see
       CONCURRENCY NOTES above).  We can assert that STOPPED_START
       must have been true because thread creation didn't fail, but
       thread attribute setting did.  */
    /* See bug 19511 which explains why doing nothing here is a
       resource leak for a joinable thread.  */
    assert (stopped_start);
      else
    {
      /* State (e) and we have ownership of PD (see CONCURRENCY
         NOTES above).  */

      /* Oops, we lied for a second.  */
      atomic_decrement (&__nptl_nthreads);

      /* Perhaps a thread wants to change the IDs and is waiting for this
         stillborn thread.  */
      if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0)
                == -2))
        futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);

      /* Free the resources.  */
      __deallocate_stack (pd);
    }

      /* We have to translate error codes.  */
      if (retval == ENOMEM)
    retval = EAGAIN;
    }
  else
    {
      /* We don't know if we have PD ownership.  Once we check the local
         stopped_start we'll know if we're in state (a) or (b) (see
     CONCURRENCY NOTES above).  */
      if (stopped_start)
    /* State (a), we own PD. The thread blocked on this lock either
       because we're doing TD_CREATE event reporting, or for some
       other reason that create_thread chose.  Now let it run
       free.  */
    lll_unlock (pd->lock, LLL_PRIVATE);

      /* We now have for sure more than one thread.  The main thread might
     not yet have the flag set.  No need to set the global variable
     again if this is what we use.  */
      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
    }

 out:
  if (__glibc_unlikely (free_cpuset))
    free (default_attr.cpuset);

  return retval;
}

  以上主要是构建创建线程所需的必要信息(线程描述符 pd、栈、调度属性等),然后调用 create_thread() 真正创建线程。create_thread() 仍然不是内核调用,我们需要再深入一层。

// sysdeps/unix/sysv/linux/createthread.c
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
           bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
{
  /* Determine whether the newly created threads has to be started
     stopped since we have to set the scheduling parameters or set the
     affinity.  */
  if (attr != NULL
      && (__glibc_unlikely (attr->cpuset != NULL)
      || __glibc_unlikely ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)))
    *stopped_start = true;

  pd->stopped_start = *stopped_start;
  if (__glibc_unlikely (*stopped_start))
    /* See CONCURRENCY NOTES in nptl/pthread_creat.c.  */
    lll_lock (pd->lock, LLL_PRIVATE);

  /* We rely heavily on various flags the CLONE function understands:

     CLONE_VM, CLONE_FS, CLONE_FILES
    These flags select semantics with shared address space and
    file descriptors according to what POSIX requires.

     CLONE_SIGHAND, CLONE_THREAD
    This flag selects the POSIX signal semantics and various
    other kinds of sharing (itimers, POSIX timers, etc.).

     CLONE_SETTLS
    The sixth parameter to CLONE determines the TLS area for the
    new thread.

     CLONE_PARENT_SETTID
    The kernels writes the thread ID of the newly created thread
    into the location pointed to by the fifth parameters to CLONE.

    Note that it would be semantically equivalent to use
    CLONE_CHILD_SETTID but it is be more expensive in the kernel.

     CLONE_CHILD_CLEARTID
    The kernels clears the thread ID of a thread that has called
    sys_exit() in the location pointed to by the seventh parameter
    to CLONE.

     The termination signal is chosen to be zero which means no signal
     is sent.  */
  const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
               | CLONE_SIGHAND | CLONE_THREAD
               | CLONE_SETTLS | CLONE_PARENT_SETTID
               | CLONE_CHILD_CLEARTID
               | 0);

  TLS_DEFINE_INIT_TP (tp, pd);
  // ARCH_CLONE: 架构相关的 __clone 封装,最终触发 clone 系统调用
  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
                    clone_flags, pd, &pd->tid, tp, &pd->tid)
            == -1))
    return errno;

  /* It's started now, so if we fail below, we'll have to cancel it
     and let it clean itself up.  */
  *thread_ran = true;

  /* Now we have the possibility to set scheduling parameters etc.  */
  if (attr != NULL)
    {
      INTERNAL_SYSCALL_DECL (err);
      int res;

      /* Set the affinity mask if necessary.  */
      if (attr->cpuset != NULL)
    {
      assert (*stopped_start);

      res = INTERNAL_SYSCALL (sched_setaffinity, err, 3, pd->tid,
                  attr->cpusetsize, attr->cpuset);

      if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err)))
      err_out:
        {
          /* The operation failed.  We have to kill the thread.
         We let the normal cancellation mechanism do the work.  */

          pid_t pid = __getpid ();
          INTERNAL_SYSCALL_DECL (err2);
          (void) INTERNAL_SYSCALL_CALL (tgkill, err2, pid, pd->tid,
                        SIGCANCEL);

          return INTERNAL_SYSCALL_ERRNO (res, err);
        }
    }

      /* Set the scheduling parameters.  */
      if ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)
    {
      assert (*stopped_start);

      res = INTERNAL_SYSCALL (sched_setscheduler, err, 3, pd->tid,
                  pd->schedpolicy, &pd->schedparam);

      if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res, err)))
        goto err_out;
    }
    }

  return 0;
}

  可见最终调用到了 __clone()(ARCH_CLONE 即架构相关的 __clone 封装),它会触发 clone 系统调用,并根据传入的 CLONE_* 标志位,创建一个与当前进程共享地址空间、文件等资源的任务,也就是线程。
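  下面是一个直接使用 glibc 提供的 clone() 封装的最小示意(仅演示 CLONE_VM 等标志带来的共享语义,省略了 pthread 所需的 CLONE_THREAD、TLS、栈回收等处理,真实线程库要复杂得多):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int shared_value = 0;                 /* CLONE_VM 共享地址空间,子任务的修改父任务可见 */

static int child_fn(void *arg)
{
    shared_value = 42;
    return 0;
}

int main(void)
{
    const size_t stack_size = 1024 * 1024;
    char *stack = malloc(stack_size);        /* 子任务的栈需要调用者自己分配 */
    if (stack == NULL) { perror("malloc"); exit(1); }

    /* 栈向低地址增长,传入栈顶;终止信号设为 SIGCHLD,父任务可以直接 waitpid */
    pid_t tid = clone(child_fn, stack + stack_size,
                      CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, NULL);
    if (tid == -1) { perror("clone"); exit(1); }

    waitpid(tid, NULL, 0);
    printf("shared_value = %d\n", shared_value);   /* 输出 42 */
    free(stack);
    return 0;
}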

 

4. 系统创建进程 do_fork

  上面的 fork() 和 pthread_create(),其实都是上层的封装;在内核层面,创建进程和线程最终都会走到 do_fork() / _do_fork():

// kernel/fork.c
/* For compatibility with architectures that call do_fork directly rather than
 * using the syscall entry points below. */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    // 调用统一的创建进程方法
    return _do_fork(clone_flags, stack_start, stack_size,
            parent_tidptr, child_tidptr, 0);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct completion vfork;
    struct pid *pid;
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }
    // 复制当前进程上下文信息
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
    add_latent_entropy();

    if (IS_ERR(p))
        return PTR_ERR(p);

    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    trace_sched_process_fork(current, p);
    // 获取新创建的进程id
    pid = get_task_pid(p, PIDTYPE_PID);
    nr = pid_vnr(pid);

    if (clone_flags & CLONE_PARENT_SETTID)
        put_user(nr, parent_tidptr);

    if (clone_flags & CLONE_VFORK) {
        p->vfork_done = &vfork;
        init_completion(&vfork);
        get_task_struct(p);
    }
    // 将新进程放入调度队列,以便后续可以被执行
    wake_up_new_task(p);

    /* forking complete and child started to run, tell ptracer */
    if (unlikely(trace))
        ptrace_event_pid(trace, pid);

    if (clone_flags & CLONE_VFORK) {
        if (!wait_for_vfork_done(p, &vfork))
            ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
    }
    // 释放对 struct pid 的引用(引用计数减一)
    put_pid(pid);
    return nr;
}

  框架简单清晰:先通过 copy_process() 复制当前进程,再用 wake_up_new_task() 将其唤醒等待调度,最后返回进程 id。所以重点是进程如何复制和唤醒;其中 clone_flags 标志位的设置非常重要,它决定了创建出来的到底是进程还是线程。
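  作为对照,内核中 fork/vfork/clone 几个系统调用入口(以下为 4.x 内核 kernel/fork.c 的节选示意,不同架构的 clone 参数顺序和条件编译略有差异)传给 _do_fork() 的标志各不相同:fork 只带 SIGCHLD,vfork 额外带 CLONE_VFORK|CLONE_VM,而 pthread_create 走的 clone 则由调用方带上前面看到的 CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD 等一整套共享标志。

// kernel/fork.c(节选)
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
    // 普通 fork:只指定子进程退出时向父进程发送 SIGCHLD,其余资源按需复制
    return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
    /* can not support in nommu mode */
    return -EINVAL;
#endif
}

SYSCALL_DEFINE0(vfork)
{
    // vfork:共享地址空间,且父进程等待子进程 exec 或退出
    return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
            0, NULL, NULL, 0);
}

SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         unsigned long, tls)
{
    // clone:标志位完全由调用方(如 pthread_create)指定
    return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}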

4.1 进程都复制了些什么?

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
                    unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace,
                    unsigned long tls,
                    int node)
{
    int retval;
    struct task_struct *p;

    /*
     * Don't allow sharing the root directory with processes in a different
     * namespace
     */
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

    /*
     * If the new process will be in a different pid or user namespace
     * do not allow it to share a thread group with the forking task.
     */
    if (clone_flags & CLONE_THREAD) {
        if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
            (task_active_pid_ns(current) !=
                current->nsproxy->pid_ns_for_children))
            return ERR_PTR(-EINVAL);
    }

    retval = -ENOMEM;
    // 复制当前进程的 task_struct 结构体
    p = dup_task_struct(current, node);
    if (!p)
        goto fork_out;

    /*
     * This _must_ happen before we call free_task(), i.e. before we jump
     * to any of the bad_fork_* labels. This is to avoid freeing
     * p->set_child_tid which is (ab)used as a kthread's data pointer for
     * kernel threads (PF_KTHREAD).
     */
    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

    ftrace_graph_init_task(p);

    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    retval = -EAGAIN;
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
    p->flags |= PF_FORKNOEXEC;
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);

    init_sigpending(&p->pending);

    p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    p->utimescaled = p->stimescaled = 0;
#endif
    prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqcount_init(&p->vtime.seqcount);
    p->vtime.starttime = 0;
    p->vtime.state = VTIME_INACTIVE;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    posix_cpu_timers_init(p);

    p->start_time = ktime_get_ns();
    p->real_start_time = ktime_get_boot_ns();
    p->io_context = NULL;
    audit_set_context(p, NULL);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_threadgroup_lock;
    }
#endif
#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif

    p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
    lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
    p->sequential_io    = 0;
    p->sequential_io_avg    = 0;
#endif

    // 复制其他独立信息到新进程中
    /* Perform scheduler related setup. Assign this task to a CPU. */
    // 设置调度器,及cpu分配
    retval = sched_fork(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_policy;

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_perf;
    /* copy all the process information */
    shm_init_task(p);
    retval = security_task_alloc(p, clone_flags);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_security;
    // 文件复制,实际上是 fd 的复制
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    // 复制fs结构
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    // 复制信号监听,如果是线程则不需要
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    // 复制内存,重量级操作,创建线程时则不会真的复制
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    // 复制各大命名空间, pid,uts,ipc,net,cgroup
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    // 复制线程本地存储(thread local storage)
    retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
    if (retval)
        goto bad_fork_cleanup_io;

    if (pid != &init_struct_pid) {
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
        if (IS_ERR(pid)) {
            retval = PTR_ERR(pid);
            goto bad_fork_cleanup_thread;
        }
    }

#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        sas_ss_reset(p);

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->exit_signal = -1;
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    p->pdeath_signal = 0;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    cgroup_threadgroup_change_begin(current);
    /*
     * Ensure that the cgroup subsystem policies allow the new process to be
     * forked. It should be noted the the new process's css_set can be changed
     * between here and cgroup_post_fork() if an organisation operation is in
     * progress.
     */
    retval = cgroup_can_fork(p);
    if (retval)
        goto bad_fork_free_pid;

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    klp_copy_process(p);

    spin_lock(&current->sighand->siglock);

    /*
     * Copy seccomp details explicitly here, in case they were changed
     * before holding sighand lock.
     */
    copy_seccomp(p);

    rseq_fork(p, clone_flags);

    /*
     * Process group and session signals need to be delivered to just the
     * parent before the fork or both the parent and the child after the
     * fork. Restart if a signal comes in before we add the new process to
     * it's process group.
     * A fatal signal pending means that current will exit, so the new
     * thread can't slip out of an OOM kill (or normal SIGKILL).
    */
    recalc_sigpending();
    if (signal_pending(current)) {
        retval = -ERESTARTNOINTR;
        goto bad_fork_cancel_cgroup;
    }
    if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
        retval = -ENOMEM;
        goto bad_fork_cancel_cgroup;
    }

    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            /*
             * Inherit has_child_subreaper flag under the same
             * tasklist_lock with adding child to the process tree
             * for propagate_has_child_subreaper optimization.
             */
            p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                             p->real_parent->signal->is_child_subreaper;
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {
            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            atomic_inc(&current->signal->sigcnt);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
            list_add_tail_rcu(&p->thread_node,
                      &p->signal->thread_head);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    syscall_tracepoint_update(p);
    write_unlock_irq(&tasklist_lock);

    proc_fork_connector(p);
    cgroup_post_fork(p);
    cgroup_threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);
    uprobe_copy_process(p, clone_flags);

    return p;

bad_fork_cancel_cgroup:
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    cgroup_cancel_fork(p);
bad_fork_free_pid:
    cgroup_threadgroup_change_end(current);
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_thread:
    exit_thread(p);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm)
        mmput(p->mm);
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_security:
    security_task_free(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_perf:
    perf_event_free_task(p);
bad_fork_cleanup_policy:
    lockdep_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
    delayacct_tsk_free(p);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    p->state = TASK_DEAD;
    put_task_stack(p);
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

  可以看出创建一个新进程是一件相当重的事,要复制大量的信息,必然消耗不少性能和资源,所以不要随意创建不必要的进程,它是宝贵的。下面我们挑几个复制过程看一下,它们都做了什么。

// 复制 task_struct 外壳:分配新的 task_struct 和内核栈,并拷贝父进程的内容
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
    struct task_struct *tsk;
    unsigned long *stack;
    struct vm_struct *stack_vm_area;
    int err;

    if (node == NUMA_NO_NODE)
        node = tsk_fork_get_node(orig);
    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;

    stack = alloc_thread_stack_node(tsk, node);
    if (!stack)
        goto free_tsk;

    stack_vm_area = task_stack_vm_area(tsk);

    err = arch_dup_task_struct(tsk, orig);

    /*
     * arch_dup_task_struct() clobbers the stack-related fields.  Make
     * sure they're properly initialized before using any stack-related
     * functions again.
     */
    tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
    tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
    atomic_set(&tsk->stack_refcount, 1);
#endif

    if (err)
        goto free_stack;

#ifdef CONFIG_SECCOMP
    /*
     * We must handle setting up seccomp filters once we're under
     * the sighand lock in case orig has changed between now and
     * then. Until then, filter must be NULL to avoid messing up
     * the usage counts on the error path calling free_task.
     */
    tsk->seccomp.filter = NULL;
#endif

    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    set_task_stack_end_magic(tsk);

#ifdef CONFIG_STACKPROTECTOR
    tsk->stack_canary = get_random_canary();
#endif

    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    tsk->task_frag.page = NULL;
    tsk->wake_q.next = NULL;

    account_kernel_stack(tsk, 1);

    kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
    tsk->fail_nth = 0;
#endif

    return tsk;

free_stack:
    free_thread_stack(tsk);
free_tsk:
    free_task_struct(tsk);
    return NULL;
}

// 复制内存空间
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
    tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     *
     * We need to steal a active VM for that..
     */
    oldmm = current->mm;
    if (!oldmm)
        return 0;

    /* initialize the new vmacache entries */
    vmacache_flush(tsk);
    // 一般情况下可以直接复用原有的内存信息
    if (clone_flags & CLONE_VM) {
        // 增加内存的使用计数,然后复用原有的指针即可
        mmget(oldmm);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    // 重量级操作,内存复制
    mm = dup_mm(tsk);
    if (!mm)
        goto fail_nomem;

good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

fail_nomem:
    return retval;
}

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm = current->mm;
    int err;

    mm = allocate_mm();
    if (!mm)
        goto fail_nomem;

    memcpy(mm, oldmm, sizeof(*mm));

    if (!mm_init(mm, tsk, mm->user_ns))
        goto fail_nomem;

    err = dup_mmap(mm, oldmm);
    if (err)
        goto free_pt;

    mm->hiwater_rss = get_mm_rss(mm);
    mm->hiwater_vm = mm->total_vm;

    if (mm->binfmt && !try_module_get(mm->binfmt->module))
        goto free_pt;

    return mm;

free_pt:
    /* don't put binfmt in mmput, we haven't got module yet */
    mm->binfmt = NULL;
    mmput(mm);

fail_nomem:
    return NULL;
}

  针对复制过程,每个方法各司其职,这也体现了设计模式中的单一职责原则。dup_task_struct() 复制的是最外层的 task_struct 结构体;而 copy_mm() 则区分两种情况:带 CLONE_VM(创建线程)时只增加引用计数、直接复用原有的 mm_struct,否则通过 dup_mm()/dup_mmap() 复制一份地址空间(页表基于写时复制,真正的物理页要到写入时才会复制)。

 

4.2 将新建进程唤醒

  进程创建好之后,自然是要交给调度系统处理的。理论上,只需要将新建的进程加入到调度队列,后续的事就可以不用 fork() 管了。具体是否是这样呢?

// kernel/sched/core.c
// 将新建进程置为可运行状态并加入调度队列
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
    struct rq_flags rf;
    struct rq *rq;

    raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
    p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
    /*
     * Fork balancing, do it here and not earlier because:
     *  - cpus_allowed can change in the fork path
     *  - any previously selected CPU might disappear through hotplug
     *
     * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
     * as we're not fully set-up yet.
     */
    p->recent_used_cpu = task_cpu(p);
    __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
    rq = __task_rq_lock(p, &rf);
    update_rq_clock(rq);
    post_init_entity_util_avg(&p->se);

    activate_task(rq, p, ENQUEUE_NOCLOCK);
    p->on_rq = TASK_ON_RQ_QUEUED;
    trace_sched_wakeup_new(p);
    check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
    if (p->sched_class->task_woken) {
        /*
         * Nothing relies on rq->lock after this, so its fine to
         * drop it.
         */
        rq_unpin_lock(rq, &rf);
        p->sched_class->task_woken(rq, p);
        rq_repin_lock(rq, &rf);
    }
#endif
    task_rq_unlock(rq, p, &rf);
}
// 新进程加入队列
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
    if (task_contributes_to_load(p))
        rq->nr_uninterruptible--;

    enqueue_task(rq, p, flags);
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
    if (!(flags & ENQUEUE_NOCLOCK))
        update_rq_clock(rq);

    if (!(flags & ENQUEUE_RESTORE))
        sched_info_queued(rq, p);

    p->sched_class->enqueue_task(rq, p, flags);
}

  可以看到,wake_up_new_task() 把新任务置为 TASK_RUNNING、选好 CPU 后,经 activate_task() -> enqueue_task() 放入对应调度类的运行队列,并检查是否需要抢占当前任务。之后新进程何时运行就完全由调度器决定,fork() 的确不用再管了。

4.3 put_pid 是在管理pid吗?

  最后,我们看下 put_pid() 都做了些什么。从名字上看,像是把 pid 写入某个全局空间统一管理,但实际上不是:它是一个引用计数的处理过程,引用归零时释放 struct pid,并递减其所在 pid namespace 的引用。

// kernel/pid.c
// 减少 struct pid 的引用计数,归零时释放,并递减所在 pid namespace 的引用
void put_pid(struct pid *pid)
{
    struct pid_namespace *ns;

    if (!pid)
        return;

    ns = pid->numbers[pid->level].ns;
    if ((atomic_read(&pid->count) == 1) ||
         atomic_dec_and_test(&pid->count)) {
        kmem_cache_free(ns->pid_cachep, pid);
        put_pid_ns(ns);
    }
}
EXPORT_SYMBOL_GPL(put_pid);
// kernel/pid_namespace.c
void put_pid_ns(struct pid_namespace *ns)
{
    struct pid_namespace *parent;
    // 逐级递减 pid namespace 的引用计数,直到 init_pid_ns 为止
    while (ns != &init_pid_ns) {
        parent = ns->parent;
        if (!kref_put(&ns->kref, free_pid_ns))
            break;
        ns = parent;
    }
}
EXPORT_SYMBOL_GPL(put_pid_ns);

static void free_pid_ns(struct kref *kref)
{
    struct pid_namespace *ns;

    ns = container_of(kref, struct pid_namespace, kref);
    destroy_pid_namespace(ns);
}

  

  进程线程的创建,其实包含了大量复杂的操作,每个细节都值得深入研究。此处我们只简单看了几个方向,比如内存的复制过程,从中也可以大致理解进程和线程的差别,相信能在一定程度上为大家解惑。

 
