从库函数fork()起步,探究新进程的诞生

本周在线学习了孟宁老师的《Linux内核分析》,本周的主要内容是进程的描述和创建,针对本次课程的实验现记录于本博文。

 

我们学习过操作系统这门课程,知道PCB(进程控制块)是进程在内核中的唯一标识,PCB结构中包括本进程的全部信息。具体到Linux操作系统,这个PCB结构就是Linux内核中的task_struct结构体,该结构体非常庞大,包含了进程的很多基本信息。当我们使用fork()函数创建新进程的时候,理所当然地会涉及到task_struct结构体,我们下面就从这个结构体开始分析:

1235struct task_struct {
1236    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
1237    void *stack;
1238    atomic_t usage;
1239    unsigned int flags;    /* per process flags, defined below */
1240    unsigned int ptrace;
1241
1242#ifdef CONFIG_SMP
1243    struct llist_node wake_entry;
1244    int on_cpu;
1245    struct task_struct *last_wakee;
1246    unsigned long wakee_flips;
1247    unsigned long wakee_flip_decay_ts;
1248
1249    int wake_cpu;
1250#endif
1251    int on_rq;
1252
1253    int prio, static_prio, normal_prio;
1254    unsigned int rt_priority;
1255    const struct sched_class *sched_class;
1256    struct sched_entity se;
1257    struct sched_rt_entity rt;
1258#ifdef CONFIG_CGROUP_SCHED
1259    struct task_group *sched_task_group;
1260#endif
1261    struct sched_dl_entity dl;
1262
1263#ifdef CONFIG_PREEMPT_NOTIFIERS
1264    /* list of struct preempt_notifier: */
1265    struct hlist_head preempt_notifiers;
1266#endif
1267
1268#ifdef CONFIG_BLK_DEV_IO_TRACE
1269    unsigned int btrace_seq;
1270#endif
1271
1272    unsigned int policy;
1273    int nr_cpus_allowed;
1274    cpumask_t cpus_allowed;
1275
1276#ifdef CONFIG_PREEMPT_RCU
1277    int rcu_read_lock_nesting;
1278    union rcu_special rcu_read_unlock_special;
1279    struct list_head rcu_node_entry;
1280#endif /* #ifdef CONFIG_PREEMPT_RCU */
1281#ifdef CONFIG_TREE_PREEMPT_RCU
1282    struct rcu_node *rcu_blocked_node;
1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1284#ifdef CONFIG_TASKS_RCU
1285    unsigned long rcu_tasks_nvcsw;
1286    bool rcu_tasks_holdout;
1287    struct list_head rcu_tasks_holdout_list;
1288    int rcu_tasks_idle_cpu;
1289#endif /* #ifdef CONFIG_TASKS_RCU */
1290
1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1292    struct sched_info sched_info;
1293#endif
1294
1295    struct list_head tasks;
1296#ifdef CONFIG_SMP
1297    struct plist_node pushable_tasks;
1298    struct rb_node pushable_dl_tasks;
1299#endif
1300
1301    struct mm_struct *mm, *active_mm;
1302#ifdef CONFIG_COMPAT_BRK
1303    unsigned brk_randomized:1;
1304#endif
1305    /* per-thread vma caching */
1306    u32 vmacache_seqnum;
1307    struct vm_area_struct *vmacache[VMACACHE_SIZE];
1308#if defined(SPLIT_RSS_COUNTING)
1309    struct task_rss_stat    rss_stat;
1310#endif
1311/* task state */
1312    int exit_state;
1313    int exit_code, exit_signal;
1314    int pdeath_signal;  /*  The signal sent when the parent dies  */
1315    unsigned int jobctl;    /* JOBCTL_*, siglock protected */
1316
1317    /* Used for emulating ABI behavior of previous Linux versions */
1318    unsigned int personality;
1319
1320    unsigned in_execve:1;    /* Tell the LSMs that the process is doing an
1321                 * execve */
1322    unsigned in_iowait:1;
1323
1324    /* Revert to default priority/policy when forking */
1325    unsigned sched_reset_on_fork:1;
1326    unsigned sched_contributes_to_load:1;
1327
1328    unsigned long atomic_flags; /* Flags needing atomic access. */
1329
1330    pid_t pid;
1331    pid_t tgid;
1332
1333#ifdef CONFIG_CC_STACKPROTECTOR
1334    /* Canary value for the -fstack-protector gcc feature */
1335    unsigned long stack_canary;
1336#endif
1337    /*
1338     * pointers to (original) parent process, youngest child, younger sibling,
1339     * older sibling, respectively.  (p->father can be replaced with
1340     * p->real_parent->pid)
1341     */
1342    struct task_struct __rcu *real_parent; /* real parent process */
1343    struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1344    /*
1345     * children/sibling forms the list of my natural children
1346     */
1347    struct list_head children;    /* list of my children */
1348    struct list_head sibling;    /* linkage in my parent's children list */
1349    struct task_struct *group_leader;    /* threadgroup leader */
1350
1351    /*
1352     * ptraced is the list of tasks this task is using ptrace on.
1353     * This includes both natural children and PTRACE_ATTACH targets.
1354     * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355     */
1356    struct list_head ptraced;
1357    struct list_head ptrace_entry;
1358
1359    /* PID/PID hash table linkage. */
1360    struct pid_link pids[PIDTYPE_MAX];
1361    struct list_head thread_group;
1362    struct list_head thread_node;
1363
1364    struct completion *vfork_done;        /* for vfork() */
1365    int __user *set_child_tid;        /* CLONE_CHILD_SETTID */
1366    int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
1367
1368    cputime_t utime, stime, utimescaled, stimescaled;
1369    cputime_t gtime;
1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1371    struct cputime prev_cputime;
1372#endif
1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1374    seqlock_t vtime_seqlock;
1375    unsigned long long vtime_snap;
1376    enum {
1377        VTIME_SLEEPING = 0,
1378        VTIME_USER,
1379        VTIME_SYS,
1380    } vtime_snap_whence;
1381#endif
1382    unsigned long nvcsw, nivcsw; /* context switch counts */
1383    u64 start_time;        /* monotonic time in nsec */
1384    u64 real_start_time;    /* boot based time in nsec */
1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1386    unsigned long min_flt, maj_flt;
1387
1388    struct task_cputime cputime_expires;
1389    struct list_head cpu_timers[3];
1390
1391/* process credentials */
1392    const struct cred __rcu *real_cred; /* objective and real subjective task
1393                     * credentials (COW) */
1394    const struct cred __rcu *cred;    /* effective (overridable) subjective task
1395                     * credentials (COW) */
1396    char comm[TASK_COMM_LEN]; /* executable name excluding path
1397                     - access with [gs]et_task_comm (which lock
1398                       it with task_lock())
1399                     - initialized normally by setup_new_exec */
1400/* file system info */
1401    int link_count, total_link_count;
1402#ifdef CONFIG_SYSVIPC
1403/* ipc stuff */
1404    struct sysv_sem sysvsem;
1405    struct sysv_shm sysvshm;
1406#endif
1407#ifdef CONFIG_DETECT_HUNG_TASK
1408/* hung task detection */
1409    unsigned long last_switch_count;
1410#endif
1411/* CPU-specific state of this task */
1412    struct thread_struct thread;
1413/* filesystem information */
1414    struct fs_struct *fs;
1415/* open file information */
1416    struct files_struct *files;
1417/* namespaces */
1418    struct nsproxy *nsproxy;
1419/* signal handlers */
1420    struct signal_struct *signal;
1421    struct sighand_struct *sighand;
1422
1423    sigset_t blocked, real_blocked;
1424    sigset_t saved_sigmask;    /* restored if set_restore_sigmask() was used */
1425    struct sigpending pending;
1426
1427    unsigned long sas_ss_sp;
1428    size_t sas_ss_size;
1429    int (*notifier)(void *priv);
1430    void *notifier_data;
1431    sigset_t *notifier_mask;
1432    struct callback_head *task_works;
1433
1434    struct audit_context *audit_context;
1435#ifdef CONFIG_AUDITSYSCALL
1436    kuid_t loginuid;
1437    unsigned int sessionid;
1438#endif
1439    struct seccomp seccomp;
1440
1441/* Thread group tracking */
1442       u32 parent_exec_id;
1443       u32 self_exec_id;
1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1445 * mempolicy */
1446    spinlock_t alloc_lock;
1447
1448    /* Protection of the PI data structures: */
1449    raw_spinlock_t pi_lock;
1450
1451#ifdef CONFIG_RT_MUTEXES
1452    /* PI waiters blocked on a rt_mutex held by this task */
1453    struct rb_root pi_waiters;
1454    struct rb_node *pi_waiters_leftmost;
1455    /* Deadlock detection and priority inheritance handling */
1456    struct rt_mutex_waiter *pi_blocked_on;
1457#endif
1458
1459#ifdef CONFIG_DEBUG_MUTEXES
1460    /* mutex deadlock detection */
1461    struct mutex_waiter *blocked_on;
1462#endif
1463#ifdef CONFIG_TRACE_IRQFLAGS
1464    unsigned int irq_events;
1465    unsigned long hardirq_enable_ip;
1466    unsigned long hardirq_disable_ip;
1467    unsigned int hardirq_enable_event;
1468    unsigned int hardirq_disable_event;
1469    int hardirqs_enabled;
1470    int hardirq_context;
1471    unsigned long softirq_disable_ip;
1472    unsigned long softirq_enable_ip;
1473    unsigned int softirq_disable_event;
1474    unsigned int softirq_enable_event;
1475    int softirqs_enabled;
1476    int softirq_context;
1477#endif
1478#ifdef CONFIG_LOCKDEP
1479# define MAX_LOCK_DEPTH 48UL
1480    u64 curr_chain_key;
1481    int lockdep_depth;
1482    unsigned int lockdep_recursion;
1483    struct held_lock held_locks[MAX_LOCK_DEPTH];
1484    gfp_t lockdep_reclaim_gfp;
1485#endif
1486
1487/* journalling filesystem info */
1488    void *journal_info;
1489
1490/* stacked block device info */
1491    struct bio_list *bio_list;
1492
1493#ifdef CONFIG_BLOCK
1494/* stack plugging */
1495    struct blk_plug *plug;
1496#endif
1497
1498/* VM state */
1499    struct reclaim_state *reclaim_state;
1500
1501    struct backing_dev_info *backing_dev_info;
1502
1503    struct io_context *io_context;
1504
1505    unsigned long ptrace_message;
1506    siginfo_t *last_siginfo; /* For ptrace use.  */
1507    struct task_io_accounting ioac;
1508#if defined(CONFIG_TASK_XACCT)
1509    u64 acct_rss_mem1;    /* accumulated rss usage */
1510    u64 acct_vm_mem1;    /* accumulated virtual memory usage */
1511    cputime_t acct_timexpd;    /* stime + utime since last update */
1512#endif
1513#ifdef CONFIG_CPUSETS
1514    nodemask_t mems_allowed;    /* Protected by alloc_lock */
1515    seqcount_t mems_allowed_seq;    /* Seqence no to catch updates */
1516    int cpuset_mem_spread_rotor;
1517    int cpuset_slab_spread_rotor;
1518#endif
1519#ifdef CONFIG_CGROUPS
1520    /* Control Group info protected by css_set_lock */
1521    struct css_set __rcu *cgroups;
1522    /* cg_list protected by css_set_lock and tsk->alloc_lock */
1523    struct list_head cg_list;
1524#endif
1525#ifdef CONFIG_FUTEX
1526    struct robust_list_head __user *robust_list;
1527#ifdef CONFIG_COMPAT
1528    struct compat_robust_list_head __user *compat_robust_list;
1529#endif
1530    struct list_head pi_state_list;
1531    struct futex_pi_state *pi_state_cache;
1532#endif
1533#ifdef CONFIG_PERF_EVENTS
1534    struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1535    struct mutex perf_event_mutex;
1536    struct list_head perf_event_list;
1537#endif
1538#ifdef CONFIG_DEBUG_PREEMPT
1539    unsigned long preempt_disable_ip;
1540#endif
1541#ifdef CONFIG_NUMA
1542    struct mempolicy *mempolicy;    /* Protected by alloc_lock */
1543    short il_next;
1544    short pref_node_fork;
1545#endif
1546#ifdef CONFIG_NUMA_BALANCING
1547    int numa_scan_seq;
1548    unsigned int numa_scan_period;
1549    unsigned int numa_scan_period_max;
1550    int numa_preferred_nid;
1551    unsigned long numa_migrate_retry;
1552    u64 node_stamp;            /* migration stamp  */
1553    u64 last_task_numa_placement;
1554    u64 last_sum_exec_runtime;
1555    struct callback_head numa_work;
1556
1557    struct list_head numa_entry;
1558    struct numa_group *numa_group;
1559
1560    /*
1561     * Exponential decaying average of faults on a per-node basis.
1562     * Scheduling placement decisions are made based on the these counts.
1563     * The values remain static for the duration of a PTE scan
1564     */
1565    unsigned long *numa_faults_memory;
1566    unsigned long total_numa_faults;
1567
1568    /*
1569     * numa_faults_buffer records faults per node during the current
1570     * scan window. When the scan completes, the counts in
1571     * numa_faults_memory decay and these values are copied.
1572     */
1573    unsigned long *numa_faults_buffer_memory;
1574
1575    /*
1576     * Track the nodes the process was running on when a NUMA hinting
1577     * fault was incurred.
1578     */
1579    unsigned long *numa_faults_cpu;
1580    unsigned long *numa_faults_buffer_cpu;
1581
1582    /*
1583     * numa_faults_locality tracks if faults recorded during the last
1584     * scan window were remote/local. The task scan period is adapted
1585     * based on the locality of the faults with different weights
1586     * depending on whether they were shared or private faults
1587     */
1588    unsigned long numa_faults_locality[2];
1589
1590    unsigned long numa_pages_migrated;
1591#endif /* CONFIG_NUMA_BALANCING */
1592
1593    struct rcu_head rcu;
1594
1595    /*
1596     * cache last used pipe for splice
1597     */
1598    struct pipe_inode_info *splice_pipe;
1599
1600    struct page_frag task_frag;
1601
1602#ifdef    CONFIG_TASK_DELAY_ACCT
1603    struct task_delay_info *delays;
1604#endif
1605#ifdef CONFIG_FAULT_INJECTION
1606    int make_it_fail;
1607#endif
1608    /*
1609     * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1610     * balance_dirty_pages() for some dirty throttling pause
1611     */
1612    int nr_dirtied;
1613    int nr_dirtied_pause;
1614    unsigned long dirty_paused_when; /* start of a write-and-pause period */
1615
1616#ifdef CONFIG_LATENCYTOP
1617    int latency_record_count;
1618    struct latency_record latency_record[LT_SAVECOUNT];
1619#endif
1620    /*
1621     * time slack values; these are used to round up poll() and
1622     * select() etc timeout values. These are in nanoseconds.
1623     */
1624    unsigned long timer_slack_ns;
1625    unsigned long default_timer_slack_ns;
1626
1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1628    /* Index of current stored address in ret_stack */
1629    int curr_ret_stack;
1630    /* Stack of return addresses for return function tracing */
1631    struct ftrace_ret_stack    *ret_stack;
1632    /* time stamp for last schedule */
1633    unsigned long long ftrace_timestamp;
1634    /*
1635     * Number of functions that haven't been traced
1636     * because of depth overrun.
1637     */
1638    atomic_t trace_overrun;
1639    /* Pause for the tracing */
1640    atomic_t tracing_graph_pause;
1641#endif
1642#ifdef CONFIG_TRACING
1643    /* state flags for use by tracers */
1644    unsigned long trace;
1645    /* bitmask and counter of trace recursion */
1646    unsigned long trace_recursion;
1647#endif /* CONFIG_TRACING */
1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1649    unsigned int memcg_kmem_skip_account;
1650    struct memcg_oom_info {
1651        struct mem_cgroup *memcg;
1652        gfp_t gfp_mask;
1653        int order;
1654        unsigned int may_oom:1;
1655    } memcg_oom;
1656#endif
1657#ifdef CONFIG_UPROBES
1658    struct uprobe_task *utask;
1659#endif
1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1661    unsigned int    sequential_io;
1662    unsigned int    sequential_io_avg;
1663#endif
1664};
task_struct

 (点击查看内核代码链接)

 

1235struct task_struct {
//state标识当前进程的状态:Linux中进程的三种状态 -1表示当前进程不可运行, 0表示进程处于可运行状态(即TASK_RUNNING),>0表示当前进程处于停止运行状态
1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
//stack指向struct thread_info
1237 void *stack;
//usage是atomic_t类型的引用计数
1238 atomic_t usage;
//flags对于内核十分重要,保存了该进程的各种标志位(PF_*)
1239 unsigned int flags; /* per process flags, defined below */
1240 unsigned int ptrace;

其中 struct thread_info 是个非常重要的结构,它和进程的内核堆栈位于同一块内存区域,保存了进程的底层状态信息(如线程标志、所在CPU、抢占计数等);各个寄存器的值则保存在thread_struct以及内核堆栈上的pt_regs中。

一些相关的进程状态:

1 #define TASK_RUNNING        0
2 #define TASK_INTERRUPTIBLE    1
3 #define TASK_UNINTERRUPTIBLE    2
4 #define TASK_STOPPED        4
5 #define TASK_TRACED        8

注意到在Linux中进程的就绪和运行状态都用TASK_RUNNING表示。
接下来的代码:

1242#ifdef CONFIG_SMP
1243    struct llist_node wake_entry;
1244    int on_cpu;
1245    struct task_struct *last_wakee;
1246    unsigned long wakee_flips;
1247    unsigned long wakee_flip_decay_ts;
1248
1249    int wake_cpu;
1250#endif

其中CONFIG_SMP下面是用于对称多处理器(SMP)系统的代码,我们这里可以先不用管,接着看下面的代码:

1251    int on_rq;    //用于记录进程是否位于一个运行队列上
1252
//记录当前进程的优先级,包括静态优先级和动态优先级
1253    int prio, static_prio, normal_prio;
1254    unsigned int rt_priority;
//记录该进程所属的调度类及对应的调度实体
1255    const struct sched_class *sched_class;
1256    struct sched_entity se;
1257    struct sched_rt_entity rt;

下面的一些条件编译选项用来对内核进行配置,我们先不考虑,接着往下看:

1301    struct mm_struct *mm, *active_mm;

struct mm_struct结构为内存描述符,一个进程占用的内存资源都是由它描述。

1311/* task state */
//进程退出时的状态
1312    int exit_state;
//进程退出时的返回码和退出信号
1313    int exit_code, exit_signal;
1314    int pdeath_signal;  /*  The signal sent when the parent dies  */
1315    unsigned int jobctl;    /* JOBCTL_*, siglock protected */
1316
1317    /* Used for emulating ABI behavior of previous Linux versions */
1318    unsigned int personality;
1319
1320    unsigned in_execve:1;    /* Tell the LSMs that the process is doing an
1321                 * execve */
1322    unsigned in_iowait:1;
1323
1324    /* Revert to default priority/policy when forking */
1325    unsigned sched_reset_on_fork:1;
1326    unsigned sched_contributes_to_load:1;
1327
1328    unsigned long atomic_flags; /* Flags needing atomic access. */
1329
//进程的唯一编号(pid)以及所属线程组的编号(tgid)
1330    pid_t pid;
1331    pid_t tgid;
1332

pid_t是进程编号的数据类型,在Linux系统中每个进程都有一个唯一的进程编号(pid),如init进程是所有用户进程的祖先进程,它的进程号是1。

下面这段代码用来描述进程的父进程、子进程等进程间的关系:

1337    /*
1338     * pointers to (original) parent process, youngest child, younger sibling,
1339     * older sibling, respectively.  (p->father can be replaced with
1340     * p->real_parent->pid)
1341     */
//保存该进程的父进程PCB结构的指针
1342    struct task_struct __rcu *real_parent; /* real parent process */
1343    struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1344    /*
1345     * children/sibling forms the list of my natural children
1346     */
//链接该进程的子进程
1347    struct list_head children;    /* list of my children */
//链接兄弟进程
1348    struct list_head sibling;    /* linkage in my parent's children list */
//本进程所在线程组的组长
1349    struct task_struct *group_leader;    /* threadgroup leader */
1350
1351    /*
1352     * ptraced is the list of tasks this task is using ptrace on.
1353     * This includes both natural children and PTRACE_ATTACH targets.
1354     * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355     */
1356    struct list_head ptraced;
1357    struct list_head ptrace_entry;
1358
1359    /* PID/PID hash table linkage. */
1360    struct pid_link pids[PIDTYPE_MAX];
1361    struct list_head thread_group;
1362    struct list_head thread_node;
1363
//用于vfork()时同步父子进程的运行顺序
1364    struct completion *vfork_done;        /* for vfork() */
1365    int __user *set_child_tid;        /* CLONE_CHILD_SETTID */
1366    int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
1367
1368    cputime_t utime, stime, utimescaled, stimescaled;
1369    cputime_t gtime;

下面是一些关于CPU状态、文件系统、打开的文件、命名空间、信号处理等信息的数据成员:

1411/* CPU-specific state of this task */
1412    struct thread_struct thread;
1413/* filesystem information */
1414    struct fs_struct *fs;
1415/* open file information */
1416    struct files_struct *files;
1417/* namespaces */
1418    struct nsproxy *nsproxy;
1419/* signal handlers */
1420    struct signal_struct *signal;
1421    struct sighand_struct *sighand;
1422
1423    sigset_t blocked, real_blocked;
1424    sigset_t saved_sigmask;    /* restored if set_restore_sigmask() was used */
1425    struct sigpending pending;
1426
1427    unsigned long sas_ss_sp;
1428    size_t sas_ss_size;
1429    int (*notifier)(void *priv);
1430    void *notifier_data;
1431    sigset_t *notifier_mask;
1432    struct callback_head *task_works;
1433
1434    struct audit_context *audit_context;

这是一些主要的数据,还有很多,更详细的内容参见:内核的总体构成

 

了解了task_struct结构的一些基本内容后,我们转而看看fork这个函数到底是怎么创建新进程的。fork、vfork和clone对应的系统调用(sys_fork、sys_vfork、sys_clone)最终都会调用do_fork这个内核函数,我们看do_fork内部是如何创建新进程的,代码如下:

1623long do_fork(unsigned long clone_flags,
1624          unsigned long stack_start,
1625          unsigned long stack_size,
1626          int __user *parent_tidptr,
1627          int __user *child_tidptr)
1628{
1629    struct task_struct *p;
1630    int trace = 0;
1631    long nr;
1632
1633    /*
1634     * Determine whether and which event to report to ptracer.  When
1635     * called from kernel_thread or CLONE_UNTRACED is explicitly
1636     * requested, no event is reported; otherwise, report if the event
1637     * for the type of forking is enabled.
1638     */
1639    if (!(clone_flags & CLONE_UNTRACED)) {
1640        if (clone_flags & CLONE_VFORK)
1641            trace = PTRACE_EVENT_VFORK;
1642        else if ((clone_flags & CSIGNAL) != SIGCHLD)
1643            trace = PTRACE_EVENT_CLONE;
1644        else
1645            trace = PTRACE_EVENT_FORK;
1646
1647        if (likely(!ptrace_event_enabled(current, trace)))
1648            trace = 0;
1649    }
1650
//这个函数用来拷贝进程,我们在下面讲解该函数
1651    p = copy_process(clone_flags, stack_start, stack_size,
1652             child_tidptr, NULL, trace);
1653    /*
1654     * Do this prior waking up the new thread - the thread pointer
1655     * might get invalid after that point, if the thread exits quickly.
1656     */
1657    if (!IS_ERR(p)) {
1658        struct completion vfork;
1659        struct pid *pid;
1660
1661        trace_sched_process_fork(current, p);
1662
1663        pid = get_task_pid(p, PIDTYPE_PID);
1664        nr = pid_vnr(pid);
1665
1666        if (clone_flags & CLONE_PARENT_SETTID)
1667            put_user(nr, parent_tidptr);
1668
1669        if (clone_flags & CLONE_VFORK) {
1670            p->vfork_done = &vfork;
1671            init_completion(&vfork);
1672            get_task_struct(p);
1673        }
1674
1675        wake_up_new_task(p);
1676
1677        /* forking complete and child started to run, tell ptracer */
1678        if (unlikely(trace))
1679            ptrace_event_pid(trace, pid);
1680
1681        if (clone_flags & CLONE_VFORK) {
1682            if (!wait_for_vfork_done(p, &vfork))
1683                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1684        }
1685
1686        put_pid(pid);
1687    } else {
1688        nr = PTR_ERR(p);
1689    }
1690    return nr;
1691}

我们看copy_process函数的内部实现,详细代码见链接

函数开始是一些错误检测处理,检测完之后出现一个重要的函数dup_task_struct,这个函数就如函数名所表达的意思:复制当前进程的进程控制块(PCB),即task_struct结构。

//为新进程分配task_struct结构的内存空间
312    tsk = alloc_task_struct_node(node);
313    if (!tsk)
314        return NULL;
315
//为新进程分配thread_info结构及内核堆栈的空间
316    ti = alloc_thread_info_node(tsk, node);
317    if (!ti)
318        goto free_tsk;
319
//复制当前进程task_struct 数据到新进程中
320    err = arch_dup_task_struct(tsk, orig);
321    if (err)
322        goto free_ti;
323
324    tsk->stack = ti;

我们可以跟进arch_dup_task_struct函数查看其实现:

290int __weak arch_dup_task_struct(struct task_struct *dst,
291                           struct task_struct *src)
292{
293    *dst = *src;
294    return 0;
295}

下面的代码都是对新进程结构体的初始化和参数值的修改:

1252    retval = -EAGAIN;
1253    if (atomic_read(&p->real_cred->user->processes) >=
1254            task_rlimit(p, RLIMIT_NPROC)) {
1255        if (p->real_cred->user != INIT_USER &&
1256            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1257            goto bad_fork_free;
1258    }
1259    current->flags &= ~PF_NPROC_EXCEEDED;
1260
1261    retval = copy_creds(p, clone_flags);
1262    if (retval < 0)
1263        goto bad_fork_free;
1264
1265    /*
1266     * If multiple threads are within copy_process(), then this check
1267     * triggers too late. This doesn't hurt, the check is only there
1268     * to stop root fork bombs.
1269     */
1270    retval = -EAGAIN;
1271    if (nr_threads >= max_threads)
1272        goto bad_fork_cleanup_count;
1273
1274    if (!try_module_get(task_thread_info(p)->exec_domain->module))
1275        goto bad_fork_cleanup_count;
1276
1277    delayacct_tsk_init(p);    /* Must remain after dup_task_struct() */
1278    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1279    p->flags |= PF_FORKNOEXEC;
1280    INIT_LIST_HEAD(&p->children);
1281    INIT_LIST_HEAD(&p->sibling);
1282    rcu_copy_process(p);
1283    p->vfork_done = NULL;
1284    spin_lock_init(&p->alloc_lock);
1285
1286    init_sigpending(&p->pending);
1287
1288    p->utime = p->stime = p->gtime = 0;
1289    p->utimescaled = p->stimescaled = 0;

以及下面这段代码,完成调度相关的设置,并复制(或共享)文件、信号、内存、命名空间等各类资源的数据结构:

1359    /* Perform scheduler related setup. Assign this task to a CPU. */
1360    retval = sched_fork(clone_flags, p);
1361    if (retval)
1362        goto bad_fork_cleanup_policy;
1363
1364    retval = perf_event_init_task(p);
1365    if (retval)
1366        goto bad_fork_cleanup_policy;
1367    retval = audit_alloc(p);
1368    if (retval)
1369        goto bad_fork_cleanup_perf;
1370    /* copy all the process information */
1371    shm_init_task(p);
1372    retval = copy_semundo(clone_flags, p);
1373    if (retval)
1374        goto bad_fork_cleanup_audit;
1375    retval = copy_files(clone_flags, p);
1376    if (retval)
1377        goto bad_fork_cleanup_semundo;
1378    retval = copy_fs(clone_flags, p);
1379    if (retval)
1380        goto bad_fork_cleanup_files;
1381    retval = copy_sighand(clone_flags, p);
1382    if (retval)
1383        goto bad_fork_cleanup_fs;
1384    retval = copy_signal(clone_flags, p);
1385    if (retval)
1386        goto bad_fork_cleanup_sighand;
1387    retval = copy_mm(clone_flags, p);
1388    if (retval)
1389        goto bad_fork_cleanup_signal;
1390    retval = copy_namespaces(clone_flags, p);
1391    if (retval)
1392        goto bad_fork_cleanup_mm;
1393    retval = copy_io(clone_flags, p);
1394    if (retval)
1395        goto bad_fork_cleanup_namespaces;
1396    retval = copy_thread(clone_flags, stack_start, stack_size, p);
1397    if (retval)
1398        goto bad_fork_cleanup_io;
1399
1400    if (pid != &init_struct_pid) {
1401        retval = -ENOMEM;
1402        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1403        if (!pid)
1404            goto bad_fork_cleanup_io;
1405    }
1406
1407    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1408    /*
1409     * Clear TID on mm_release()?
1410     */
1411    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

我们看copy_thread这个函数,其中拷贝了一些内核堆栈的信息。

132int copy_thread(unsigned long clone_flags, unsigned long sp,
133    unsigned long arg, struct task_struct *p)
134{
//找到压入数据的内核堆栈的基址
135    struct pt_regs *childregs = task_pt_regs(p);
136    struct task_struct *tsk;
137    int err;
138
139    p->thread.sp = (unsigned long) childregs;
140    p->thread.sp0 = (unsigned long) (childregs+1);
141    memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
142
143    if (unlikely(p->flags & PF_KTHREAD)) {
144        /* kernel thread */
145        memset(childregs, 0, sizeof(struct pt_regs));
146        p->thread.ip = (unsigned long) ret_from_kernel_thread;
147        task_user_gs(p) = __KERNEL_STACK_CANARY;
148        childregs->ds = __USER_DS;
149        childregs->es = __USER_DS;
150        childregs->fs = __KERNEL_PERCPU;
151        childregs->bx = sp;    /* function */
152        childregs->bp = arg;
153        childregs->orig_ax = -1;
154        childregs->cs = __KERNEL_CS | get_kernel_rpl();
155        childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
156        p->thread.io_bitmap_ptr = NULL;
157        return 0;
158    }
//拷贝栈中已有数据
159    *childregs = *current_pt_regs();
//返回值设为0
160    childregs->ax = 0;
161    if (sp)
162        childregs->sp = sp;
163
//设置子进程的ip为ret_from_fork这个函数的入口地址
164    p->thread.ip = (unsigned long) ret_from_fork;
165    task_user_gs(p) = get_user_gs(current_pt_regs());
166
167    p->thread.io_bitmap_ptr = NULL;
168    tsk = current;
169    err = -ENOMEM;
170
171    if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
172        p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
173                        IO_BITMAP_BYTES, GFP_KERNEL);
174        if (!p->thread.io_bitmap_ptr) {
175            p->thread.io_bitmap_max = 0;
176            return -ENOMEM;
177        }
178        set_tsk_thread_flag(p, TIF_IO_BITMAP);
179    }
180
181    err = 0;
182
183    /*
184     * Set a new TLS for the child thread?
185     */
186    if (clone_flags & CLONE_SETTLS)
187        err = do_set_thread_area(p, -1,
188            (struct user_desc __user *)childregs->si, 0);
189
190    if (err && p->thread.io_bitmap_ptr) {
191        kfree(p->thread.io_bitmap_ptr);
192        p->thread.io_bitmap_max = 0;
193    }
194    return err;
195}
196

接下来我们使用gdb调试内核,跟踪fork的执行过程,来观察验证前面所述的内容:

我们从代码和跟踪中都可以看到,fork创建的新进程从ret_from_fork开始执行,在这之前,新进程把父进程的寄存器保存的值拷贝到自己的堆栈中,我们可以查看struct pt_regs这个结构,这里面保存的是SAVE_ALL保存的寄存器的值,子进程复制了这些值。

11struct pt_regs {
12    unsigned long bx;
13    unsigned long cx;
14    unsigned long dx;
15    unsigned long si;
16    unsigned long di;
17    unsigned long bp;
18    unsigned long ax;
19    unsigned long ds;
20    unsigned long es;
21    unsigned long fs;
22    unsigned long gs;
23    unsigned long orig_ax;
24    unsigned long ip;
25    unsigned long cs;
26    unsigned long flags;
27    unsigned long sp;
28    unsigned long ss;
29};

子进程创建完成后,从ret_from_fork开始执行:

290ENTRY(ret_from_fork)
291    CFI_STARTPROC
292    pushl_cfi %eax
293    call schedule_tail
294    GET_THREAD_INFO(%ebp)
295    popl_cfi %eax
296    pushl_cfi $0x0202        # Reset kernel eflags
297    popfl_cfi
298    jmp syscall_exit
299    CFI_ENDPROC
300END(ret_from_fork)

最后执行这一句开始跳转:

jmp syscall_exit

跳转到system_call中的syscall_exit处:

505syscall_exit:
506    LOCKDEP_SYS_EXIT
507    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
508                    # setting need_resched or sigpending
509                    # between sampling and the iret

之后的执行过程和系统调用执行完返回的过程一样,这就是创建一个子进程的全部过程。

总结:

Linux调用fork创建进程,开始是复制进程描述符(PCB):task_struct,接着对子进程的PCB进行初始化,然后把当前CPU的状态复制给子进程,接着把保存在父进程中的用户态进程上下文环境拷贝到自己的堆栈里面,接下来就像普通系统调用一样,子进程可以返回到用户态继续执行中断前的进程上下文。

 

Allen 原创作品转载请注明出处《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

 

posted on 2015-04-12 19:52  lingzshen  阅读(604)  评论(0编辑  收藏  举报