从库函数fork()起步,探究新进程的诞生
本周在线学习了孟宁老师的《Linux内核分析》,本周的主要内容是进程的描述和创建,针对本次课程的实验现记录于本博文。
我们学习过操作系统这门课程,知道PCB是进程在内核中的唯一标识,PCB结构中包括本进程的全部信息。具体到Linux操作系统,这个PCB结构就是Linux内核中的task_struct结构体,该结构体非常庞大,包含了进程的很多基本信息。当我们使用fork()函数创建新进程的时候,理所当然地会涉及到task_struct结构体,我们下面就从本结构体开始分析:
1235struct task_struct { 1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1237 void *stack; 1238 atomic_t usage; 1239 unsigned int flags; /* per process flags, defined below */ 1240 unsigned int ptrace; 1241 1242#ifdef CONFIG_SMP 1243 struct llist_node wake_entry; 1244 int on_cpu; 1245 struct task_struct *last_wakee; 1246 unsigned long wakee_flips; 1247 unsigned long wakee_flip_decay_ts; 1248 1249 int wake_cpu; 1250#endif 1251 int on_rq; 1252 1253 int prio, static_prio, normal_prio; 1254 unsigned int rt_priority; 1255 const struct sched_class *sched_class; 1256 struct sched_entity se; 1257 struct sched_rt_entity rt; 1258#ifdef CONFIG_CGROUP_SCHED 1259 struct task_group *sched_task_group; 1260#endif 1261 struct sched_dl_entity dl; 1262 1263#ifdef CONFIG_PREEMPT_NOTIFIERS 1264 /* list of struct preempt_notifier: */ 1265 struct hlist_head preempt_notifiers; 1266#endif 1267 1268#ifdef CONFIG_BLK_DEV_IO_TRACE 1269 unsigned int btrace_seq; 1270#endif 1271 1272 unsigned int policy; 1273 int nr_cpus_allowed; 1274 cpumask_t cpus_allowed; 1275 1276#ifdef CONFIG_PREEMPT_RCU 1277 int rcu_read_lock_nesting; 1278 union rcu_special rcu_read_unlock_special; 1279 struct list_head rcu_node_entry; 1280#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1281#ifdef CONFIG_TREE_PREEMPT_RCU 1282 struct rcu_node *rcu_blocked_node; 1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1284#ifdef CONFIG_TASKS_RCU 1285 unsigned long rcu_tasks_nvcsw; 1286 bool rcu_tasks_holdout; 1287 struct list_head rcu_tasks_holdout_list; 1288 int rcu_tasks_idle_cpu; 1289#endif /* #ifdef CONFIG_TASKS_RCU */ 1290 1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1292 struct sched_info sched_info; 1293#endif 1294 1295 struct list_head tasks; 1296#ifdef CONFIG_SMP 1297 struct plist_node pushable_tasks; 1298 struct rb_node pushable_dl_tasks; 1299#endif 1300 1301 struct mm_struct *mm, *active_mm; 1302#ifdef CONFIG_COMPAT_BRK 1303 unsigned brk_randomized:1; 1304#endif 1305 /* per-thread vma 
caching */ 1306 u32 vmacache_seqnum; 1307 struct vm_area_struct *vmacache[VMACACHE_SIZE]; 1308#if defined(SPLIT_RSS_COUNTING) 1309 struct task_rss_stat rss_stat; 1310#endif 1311/* task state */ 1312 int exit_state; 1313 int exit_code, exit_signal; 1314 int pdeath_signal; /* The signal sent when the parent dies */ 1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1316 1317 /* Used for emulating ABI behavior of previous Linux versions */ 1318 unsigned int personality; 1319 1320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1321 * execve */ 1322 unsigned in_iowait:1; 1323 1324 /* Revert to default priority/policy when forking */ 1325 unsigned sched_reset_on_fork:1; 1326 unsigned sched_contributes_to_load:1; 1327 1328 unsigned long atomic_flags; /* Flags needing atomic access. */ 1329 1330 pid_t pid; 1331 pid_t tgid; 1332 1333#ifdef CONFIG_CC_STACKPROTECTOR 1334 /* Canary value for the -fstack-protector gcc feature */ 1335 unsigned long stack_canary; 1336#endif 1337 /* 1338 * pointers to (original) parent process, youngest child, younger sibling, 1339 * older sibling, respectively. (p->father can be replaced with 1340 * p->real_parent->pid) 1341 */ 1342 struct task_struct __rcu *real_parent; /* real parent process */ 1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1344 /* 1345 * children/sibling forms the list of my natural children 1346 */ 1347 struct list_head children; /* list of my children */ 1348 struct list_head sibling; /* linkage in my parent's children list */ 1349 struct task_struct *group_leader; /* threadgroup leader */ 1350 1351 /* 1352 * ptraced is the list of tasks this task is using ptrace on. 1353 * This includes both natural children and PTRACE_ATTACH targets. 1354 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1355 */ 1356 struct list_head ptraced; 1357 struct list_head ptrace_entry; 1358 1359 /* PID/PID hash table linkage. 
*/ 1360 struct pid_link pids[PIDTYPE_MAX]; 1361 struct list_head thread_group; 1362 struct list_head thread_node; 1363 1364 struct completion *vfork_done; /* for vfork() */ 1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1367 1368 cputime_t utime, stime, utimescaled, stimescaled; 1369 cputime_t gtime; 1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1371 struct cputime prev_cputime; 1372#endif 1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1374 seqlock_t vtime_seqlock; 1375 unsigned long long vtime_snap; 1376 enum { 1377 VTIME_SLEEPING = 0, 1378 VTIME_USER, 1379 VTIME_SYS, 1380 } vtime_snap_whence; 1381#endif 1382 unsigned long nvcsw, nivcsw; /* context switch counts */ 1383 u64 start_time; /* monotonic time in nsec */ 1384 u64 real_start_time; /* boot based time in nsec */ 1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1386 unsigned long min_flt, maj_flt; 1387 1388 struct task_cputime cputime_expires; 1389 struct list_head cpu_timers[3]; 1390 1391/* process credentials */ 1392 const struct cred __rcu *real_cred; /* objective and real subjective task 1393 * credentials (COW) */ 1394 const struct cred __rcu *cred; /* effective (overridable) subjective task 1395 * credentials (COW) */ 1396 char comm[TASK_COMM_LEN]; /* executable name excluding path 1397 - access with [gs]et_task_comm (which lock 1398 it with task_lock()) 1399 - initialized normally by setup_new_exec */ 1400/* file system info */ 1401 int link_count, total_link_count; 1402#ifdef CONFIG_SYSVIPC 1403/* ipc stuff */ 1404 struct sysv_sem sysvsem; 1405 struct sysv_shm sysvshm; 1406#endif 1407#ifdef CONFIG_DETECT_HUNG_TASK 1408/* hung task detection */ 1409 unsigned long last_switch_count; 1410#endif 1411/* CPU-specific state of this task */ 1412 struct thread_struct thread; 1413/* filesystem information */ 1414 struct fs_struct *fs; 1415/* open file information */ 1416 struct files_struct 
*files; 1417/* namespaces */ 1418 struct nsproxy *nsproxy; 1419/* signal handlers */ 1420 struct signal_struct *signal; 1421 struct sighand_struct *sighand; 1422 1423 sigset_t blocked, real_blocked; 1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1425 struct sigpending pending; 1426 1427 unsigned long sas_ss_sp; 1428 size_t sas_ss_size; 1429 int (*notifier)(void *priv); 1430 void *notifier_data; 1431 sigset_t *notifier_mask; 1432 struct callback_head *task_works; 1433 1434 struct audit_context *audit_context; 1435#ifdef CONFIG_AUDITSYSCALL 1436 kuid_t loginuid; 1437 unsigned int sessionid; 1438#endif 1439 struct seccomp seccomp; 1440 1441/* Thread group tracking */ 1442 u32 parent_exec_id; 1443 u32 self_exec_id; 1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1445 * mempolicy */ 1446 spinlock_t alloc_lock; 1447 1448 /* Protection of the PI data structures: */ 1449 raw_spinlock_t pi_lock; 1450 1451#ifdef CONFIG_RT_MUTEXES 1452 /* PI waiters blocked on a rt_mutex held by this task */ 1453 struct rb_root pi_waiters; 1454 struct rb_node *pi_waiters_leftmost; 1455 /* Deadlock detection and priority inheritance handling */ 1456 struct rt_mutex_waiter *pi_blocked_on; 1457#endif 1458 1459#ifdef CONFIG_DEBUG_MUTEXES 1460 /* mutex deadlock detection */ 1461 struct mutex_waiter *blocked_on; 1462#endif 1463#ifdef CONFIG_TRACE_IRQFLAGS 1464 unsigned int irq_events; 1465 unsigned long hardirq_enable_ip; 1466 unsigned long hardirq_disable_ip; 1467 unsigned int hardirq_enable_event; 1468 unsigned int hardirq_disable_event; 1469 int hardirqs_enabled; 1470 int hardirq_context; 1471 unsigned long softirq_disable_ip; 1472 unsigned long softirq_enable_ip; 1473 unsigned int softirq_disable_event; 1474 unsigned int softirq_enable_event; 1475 int softirqs_enabled; 1476 int softirq_context; 1477#endif 1478#ifdef CONFIG_LOCKDEP 1479# define MAX_LOCK_DEPTH 48UL 1480 u64 curr_chain_key; 1481 int lockdep_depth; 1482 unsigned int 
lockdep_recursion; 1483 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1484 gfp_t lockdep_reclaim_gfp; 1485#endif 1486 1487/* journalling filesystem info */ 1488 void *journal_info; 1489 1490/* stacked block device info */ 1491 struct bio_list *bio_list; 1492 1493#ifdef CONFIG_BLOCK 1494/* stack plugging */ 1495 struct blk_plug *plug; 1496#endif 1497 1498/* VM state */ 1499 struct reclaim_state *reclaim_state; 1500 1501 struct backing_dev_info *backing_dev_info; 1502 1503 struct io_context *io_context; 1504 1505 unsigned long ptrace_message; 1506 siginfo_t *last_siginfo; /* For ptrace use. */ 1507 struct task_io_accounting ioac; 1508#if defined(CONFIG_TASK_XACCT) 1509 u64 acct_rss_mem1; /* accumulated rss usage */ 1510 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1511 cputime_t acct_timexpd; /* stime + utime since last update */ 1512#endif 1513#ifdef CONFIG_CPUSETS 1514 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1515 seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ 1516 int cpuset_mem_spread_rotor; 1517 int cpuset_slab_spread_rotor; 1518#endif 1519#ifdef CONFIG_CGROUPS 1520 /* Control Group info protected by css_set_lock */ 1521 struct css_set __rcu *cgroups; 1522 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1523 struct list_head cg_list; 1524#endif 1525#ifdef CONFIG_FUTEX 1526 struct robust_list_head __user *robust_list; 1527#ifdef CONFIG_COMPAT 1528 struct compat_robust_list_head __user *compat_robust_list; 1529#endif 1530 struct list_head pi_state_list; 1531 struct futex_pi_state *pi_state_cache; 1532#endif 1533#ifdef CONFIG_PERF_EVENTS 1534 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1535 struct mutex perf_event_mutex; 1536 struct list_head perf_event_list; 1537#endif 1538#ifdef CONFIG_DEBUG_PREEMPT 1539 unsigned long preempt_disable_ip; 1540#endif 1541#ifdef CONFIG_NUMA 1542 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1543 short il_next; 1544 short pref_node_fork; 
1545#endif 1546#ifdef CONFIG_NUMA_BALANCING 1547 int numa_scan_seq; 1548 unsigned int numa_scan_period; 1549 unsigned int numa_scan_period_max; 1550 int numa_preferred_nid; 1551 unsigned long numa_migrate_retry; 1552 u64 node_stamp; /* migration stamp */ 1553 u64 last_task_numa_placement; 1554 u64 last_sum_exec_runtime; 1555 struct callback_head numa_work; 1556 1557 struct list_head numa_entry; 1558 struct numa_group *numa_group; 1559 1560 /* 1561 * Exponential decaying average of faults on a per-node basis. 1562 * Scheduling placement decisions are made based on the these counts. 1563 * The values remain static for the duration of a PTE scan 1564 */ 1565 unsigned long *numa_faults_memory; 1566 unsigned long total_numa_faults; 1567 1568 /* 1569 * numa_faults_buffer records faults per node during the current 1570 * scan window. When the scan completes, the counts in 1571 * numa_faults_memory decay and these values are copied. 1572 */ 1573 unsigned long *numa_faults_buffer_memory; 1574 1575 /* 1576 * Track the nodes the process was running on when a NUMA hinting 1577 * fault was incurred. 1578 */ 1579 unsigned long *numa_faults_cpu; 1580 unsigned long *numa_faults_buffer_cpu; 1581 1582 /* 1583 * numa_faults_locality tracks if faults recorded during the last 1584 * scan window were remote/local. 
The task scan period is adapted 1585 * based on the locality of the faults with different weights 1586 * depending on whether they were shared or private faults 1587 */ 1588 unsigned long numa_faults_locality[2]; 1589 1590 unsigned long numa_pages_migrated; 1591#endif /* CONFIG_NUMA_BALANCING */ 1592 1593 struct rcu_head rcu; 1594 1595 /* 1596 * cache last used pipe for splice 1597 */ 1598 struct pipe_inode_info *splice_pipe; 1599 1600 struct page_frag task_frag; 1601 1602#ifdef CONFIG_TASK_DELAY_ACCT 1603 struct task_delay_info *delays; 1604#endif 1605#ifdef CONFIG_FAULT_INJECTION 1606 int make_it_fail; 1607#endif 1608 /* 1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1610 * balance_dirty_pages() for some dirty throttling pause 1611 */ 1612 int nr_dirtied; 1613 int nr_dirtied_pause; 1614 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1615 1616#ifdef CONFIG_LATENCYTOP 1617 int latency_record_count; 1618 struct latency_record latency_record[LT_SAVECOUNT]; 1619#endif 1620 /* 1621 * time slack values; these are used to round up poll() and 1622 * select() etc timeout values. These are in nanoseconds. 1623 */ 1624 unsigned long timer_slack_ns; 1625 unsigned long default_timer_slack_ns; 1626 1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1628 /* Index of current stored address in ret_stack */ 1629 int curr_ret_stack; 1630 /* Stack of return addresses for return function tracing */ 1631 struct ftrace_ret_stack *ret_stack; 1632 /* time stamp for last schedule */ 1633 unsigned long long ftrace_timestamp; 1634 /* 1635 * Number of functions that haven't been traced 1636 * because of depth overrun. 
1637 */ 1638 atomic_t trace_overrun; 1639 /* Pause for the tracing */ 1640 atomic_t tracing_graph_pause; 1641#endif 1642#ifdef CONFIG_TRACING 1643 /* state flags for use by tracers */ 1644 unsigned long trace; 1645 /* bitmask and counter of trace recursion */ 1646 unsigned long trace_recursion; 1647#endif /* CONFIG_TRACING */ 1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1649 unsigned int memcg_kmem_skip_account; 1650 struct memcg_oom_info { 1651 struct mem_cgroup *memcg; 1652 gfp_t gfp_mask; 1653 int order; 1654 unsigned int may_oom:1; 1655 } memcg_oom; 1656#endif 1657#ifdef CONFIG_UPROBES 1658 struct uprobe_task *utask; 1659#endif 1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) 1661 unsigned int sequential_io; 1662 unsigned int sequential_io_avg; 1663#endif 1664};
1235struct task_struct {
//state标识当前进程的状态:Linux中进程的三种状态 -1表示当前进程不可运行, 0表示进程处于可运行状态(即TASK_RUNNING),>0表示当前进程处于停止运行状态 1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
//stack指向struct thread_info 1237 void *stack;
//atomic_t结构 1238 atomic_t usage; 1239 unsigned int flags; /* per process flags, defined below */
//flags对于内核十分重要,保存了内核的抢占等信息 1240 unsigned int ptrace;
其中 struct thread_info 是个非常重要的结构,保存了进程当前的CPU临时状态和各个寄存器的值。
一些相关的进程状态:
1 #define TASK_RUNNING 0 2 #define TASK_INTERRUPTIBLE 1 3 #define TASK_UNINTERRUPTIBLE 2 4 #define TASK_STOPPED 4 5 #define TASK_TRACED 8
注意到在Linux中进程的就绪和运行状态都用TASK_RUNNING表示。
接下来的代码:
1242#ifdef CONFIG_SMP 1243 struct llist_node wake_entry; 1244 int on_cpu; 1245 struct task_struct *last_wakee; 1246 unsigned long wakee_flips; 1247 unsigned long wakee_flip_decay_ts; 1248 1249 int wake_cpu; 1250#endif
其中CONFIG_SMP下面用于多处理器SMP系统结构的代码,我们这里可以先不用管,接着看下面的代码:
1251 int on_rq; //用于记录进程是否位于一个运行队列上 1252
//设定当前进程的优先级,包括静态优先级和动态优先级 1253 int prio, static_prio, normal_prio; 1254 unsigned int rt_priority;
//设定该进程采用的调度策略及相关设定 1255 const struct sched_class *sched_class; 1256 struct sched_entity se; 1257 struct sched_rt_entity rt;
下面一些预处理编译选项用来对内核进行设置,我们先不考虑,
1301 struct mm_struct *mm, *active_mm;
struct mm_struct结构为内存描述符,一个进程占用的内存资源都是由它描述。
1311/* task state */
//进程退出时的状态 1312 int exit_state;
//进程退出时的返回码和退出信号 1313 int exit_code, exit_signal; 1314 int pdeath_signal; /* The signal sent when the parent dies */ 1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1316 1317 /* Used for emulating ABI behavior of previous Linux versions */ 1318 unsigned int personality; 1319 1320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1321 * execve */ 1322 unsigned in_iowait:1; 1323 1324 /* Revert to default priority/policy when forking */ 1325 unsigned sched_reset_on_fork:1; 1326 unsigned sched_contributes_to_load:1; 1327 1328 unsigned long atomic_flags; /* Flags needing atomic access. */ 1329
//进程的唯一的编号 1330 pid_t pid; 1331 pid_t tgid; 1332
pid_t是进程编号,在Linux系统中每个进程都有一个唯一不同的进程编号,如init进程是所有用户进程的父进程,它的进程号是1。
下面这段代码用来设置进程的父进程,孩子进程等进程关系:
1337 /* 1338 * pointers to (original) parent process, youngest child, younger sibling, 1339 * older sibling, respectively. (p->father can be replaced with 1340 * p->real_parent->pid) 1341 */
//保存该进程的父进程PCB结构的指针 1342 struct task_struct __rcu *real_parent; /* real parent process */ 1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1344 /* 1345 * children/sibling forms the list of my natural children 1346 */
//链接该进程的子进程 1347 struct list_head children; /* list of my children */
//链接兄弟进程 1348 struct list_head sibling; /* linkage in my parent's children list */
//本进程所在进程组的进程组长 1349 struct task_struct *group_leader; /* threadgroup leader */ 1350 1351 /* 1352 * ptraced is the list of tasks this task is using ptrace on. 1353 * This includes both natural children and PTRACE_ATTACH targets. 1354 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1355 */ 1356 struct list_head ptraced; 1357 struct list_head ptrace_entry; 1358 1359 /* PID/PID hash table linkage. */ 1360 struct pid_link pids[PIDTYPE_MAX]; 1361 struct list_head thread_group; 1362 struct list_head thread_node; 1363
//用于同步父子进程运行顺序 1364 struct completion *vfork_done; /* for vfork() */ 1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1367 1368 cputime_t utime, stime, utimescaled, stimescaled; 1369 cputime_t gtime;
下面是一些关于CPU信息,内存信息等的数据指针:
1411/* CPU-specific state of this task */ 1412 struct thread_struct thread; 1413/* filesystem information */ 1414 struct fs_struct *fs; 1415/* open file information */ 1416 struct files_struct *files; 1417/* namespaces */ 1418 struct nsproxy *nsproxy; 1419/* signal handlers */ 1420 struct signal_struct *signal; 1421 struct sighand_struct *sighand; 1422 1423 sigset_t blocked, real_blocked; 1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1425 struct sigpending pending; 1426 1427 unsigned long sas_ss_sp; 1428 size_t sas_ss_size; 1429 int (*notifier)(void *priv); 1430 void *notifier_data; 1431 sigset_t *notifier_mask; 1432 struct callback_head *task_works; 1433 1434 struct audit_context *audit_context;
这是一些主要的数据,还有很多,更详细的内容参见:内核的总体构成
了解了task_struct结构的一些基本内容后,我们转而看看fork这个函数到底是怎么创建新进程的。fork和vfork这两个库函数最终都会通过系统调用(如sys_clone)进入内核,而这些系统调用最终都会调用do_fork这个内核函数。我们看do_fork内部是如何创建新进程的,代码如下:
1623long do_fork(unsigned long clone_flags, 1624 unsigned long stack_start, 1625 unsigned long stack_size, 1626 int __user *parent_tidptr, 1627 int __user *child_tidptr) 1628{ 1629 struct task_struct *p; 1630 int trace = 0; 1631 long nr; 1632 1633 /* 1634 * Determine whether and which event to report to ptracer. When 1635 * called from kernel_thread or CLONE_UNTRACED is explicitly 1636 * requested, no event is reported; otherwise, report if the event 1637 * for the type of forking is enabled. 1638 */ 1639 if (!(clone_flags & CLONE_UNTRACED)) { 1640 if (clone_flags & CLONE_VFORK) 1641 trace = PTRACE_EVENT_VFORK; 1642 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1643 trace = PTRACE_EVENT_CLONE; 1644 else 1645 trace = PTRACE_EVENT_FORK; 1646 1647 if (likely(!ptrace_event_enabled(current, trace))) 1648 trace = 0; 1649 } 1650
//这个函数用来拷贝进程,我们在下面讲解该函数 1651 p = copy_process(clone_flags, stack_start, stack_size, 1652 child_tidptr, NULL, trace); 1653 /* 1654 * Do this prior waking up the new thread - the thread pointer 1655 * might get invalid after that point, if the thread exits quickly. 1656 */ 1657 if (!IS_ERR(p)) { 1658 struct completion vfork; 1659 struct pid *pid; 1660 1661 trace_sched_process_fork(current, p); 1662 1663 pid = get_task_pid(p, PIDTYPE_PID); 1664 nr = pid_vnr(pid); 1665 1666 if (clone_flags & CLONE_PARENT_SETTID) 1667 put_user(nr, parent_tidptr); 1668 1669 if (clone_flags & CLONE_VFORK) { 1670 p->vfork_done = &vfork; 1671 init_completion(&vfork); 1672 get_task_struct(p); 1673 } 1674 1675 wake_up_new_task(p); 1676 1677 /* forking complete and child started to run, tell ptracer */ 1678 if (unlikely(trace)) 1679 ptrace_event_pid(trace, pid); 1680 1681 if (clone_flags & CLONE_VFORK) { 1682 if (!wait_for_vfork_done(p, &vfork)) 1683 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); 1684 } 1685 1686 put_pid(pid); 1687 } else { 1688 nr = PTR_ERR(p); 1689 } 1690 return nr; 1691}
我们看copy_process函数的内部实现,详细代码见链接
函数开始是一些错误检测处理,检测完成后出现一个重要的函数dup_task_struct。这个函数就如函数名所表达的意思:复制当前进程的PCB控制块,即task_struct结构。
//为新进程分配task_struct结构的内存空间
312 tsk = alloc_task_struct_node(node); 313 if (!tsk) 314 return NULL; 315
//为新进程创建CPU当前状态结构空间,内核堆栈 316 ti = alloc_thread_info_node(tsk, node); 317 if (!ti) 318 goto free_tsk; 319
//复制当前进程task_struct 数据到新进程中 320 err = arch_dup_task_struct(tsk, orig); 321 if (err) 322 goto free_ti; 323 324 tsk->stack = ti;
我们可以追踪看进arch_dup_task_struct函数查看:
290int __weak arch_dup_task_struct(struct task_struct *dst, 291 struct task_struct *src) 292{ 293 *dst = *src; 294 return 0; 295}
下面的代码都是对新进程结构体的初始化和参数值的修改:
1252 retval = -EAGAIN; 1253 if (atomic_read(&p->real_cred->user->processes) >= 1254 task_rlimit(p, RLIMIT_NPROC)) { 1255 if (p->real_cred->user != INIT_USER && 1256 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) 1257 goto bad_fork_free; 1258 } 1259 current->flags &= ~PF_NPROC_EXCEEDED; 1260 1261 retval = copy_creds(p, clone_flags); 1262 if (retval < 0) 1263 goto bad_fork_free; 1264 1265 /* 1266 * If multiple threads are within copy_process(), then this check 1267 * triggers too late. This doesn't hurt, the check is only there 1268 * to stop root fork bombs. 1269 */ 1270 retval = -EAGAIN; 1271 if (nr_threads >= max_threads) 1272 goto bad_fork_cleanup_count; 1273 1274 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1275 goto bad_fork_cleanup_count; 1276 1277 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1278 p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1279 p->flags |= PF_FORKNOEXEC; 1280 INIT_LIST_HEAD(&p->children); 1281 INIT_LIST_HEAD(&p->sibling); 1282 rcu_copy_process(p); 1283 p->vfork_done = NULL; 1284 spin_lock_init(&p->alloc_lock); 1285 1286 init_sigpending(&p->pending); 1287 1288 p->utime = p->stime = p->gtime = 0; 1289 p->utimescaled = p->stimescaled = 0;
以及下面这段代码对一些内存分配数据结构的复制和设置等:
1359 /* Perform scheduler related setup. Assign this task to a CPU. */ 1360 retval = sched_fork(clone_flags, p); 1361 if (retval) 1362 goto bad_fork_cleanup_policy; 1363 1364 retval = perf_event_init_task(p); 1365 if (retval) 1366 goto bad_fork_cleanup_policy; 1367 retval = audit_alloc(p); 1368 if (retval) 1369 goto bad_fork_cleanup_perf; 1370 /* copy all the process information */ 1371 shm_init_task(p); 1372 retval = copy_semundo(clone_flags, p); 1373 if (retval) 1374 goto bad_fork_cleanup_audit; 1375 retval = copy_files(clone_flags, p); 1376 if (retval) 1377 goto bad_fork_cleanup_semundo; 1378 retval = copy_fs(clone_flags, p); 1379 if (retval) 1380 goto bad_fork_cleanup_files; 1381 retval = copy_sighand(clone_flags, p); 1382 if (retval) 1383 goto bad_fork_cleanup_fs; 1384 retval = copy_signal(clone_flags, p); 1385 if (retval) 1386 goto bad_fork_cleanup_sighand; 1387 retval = copy_mm(clone_flags, p); 1388 if (retval) 1389 goto bad_fork_cleanup_signal; 1390 retval = copy_namespaces(clone_flags, p); 1391 if (retval) 1392 goto bad_fork_cleanup_mm; 1393 retval = copy_io(clone_flags, p); 1394 if (retval) 1395 goto bad_fork_cleanup_namespaces; 1396 retval = copy_thread(clone_flags, stack_start, stack_size, p); 1397 if (retval) 1398 goto bad_fork_cleanup_io; 1399 1400 if (pid != &init_struct_pid) { 1401 retval = -ENOMEM; 1402 pid = alloc_pid(p->nsproxy->pid_ns_for_children); 1403 if (!pid) 1404 goto bad_fork_cleanup_io; 1405 } 1406 1407 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1408 /* 1409 * Clear TID on mm_release()? 1410 */ 1411 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
我们看copy_thread这个函数,其中拷贝了一些内核堆栈的信息。
132int copy_thread(unsigned long clone_flags, unsigned long sp, 133 unsigned long arg, struct task_struct *p) 134{
//找到压入数据的内核堆栈的基址 135 struct pt_regs *childregs = task_pt_regs(p); 136 struct task_struct *tsk; 137 int err; 138 139 p->thread.sp = (unsigned long) childregs; 140 p->thread.sp0 = (unsigned long) (childregs+1); 141 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 142 143 if (unlikely(p->flags & PF_KTHREAD)) { 144 /* kernel thread */ 145 memset(childregs, 0, sizeof(struct pt_regs)); 146 p->thread.ip = (unsigned long) ret_from_kernel_thread; 147 task_user_gs(p) = __KERNEL_STACK_CANARY; 148 childregs->ds = __USER_DS; 149 childregs->es = __USER_DS; 150 childregs->fs = __KERNEL_PERCPU; 151 childregs->bx = sp; /* function */ 152 childregs->bp = arg; 153 childregs->orig_ax = -1; 154 childregs->cs = __KERNEL_CS | get_kernel_rpl(); 155 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; 156 p->thread.io_bitmap_ptr = NULL; 157 return 0; 158 }
//拷贝栈中已有数据 159 *childregs = *current_pt_regs();
//返回值设为0 160 childregs->ax = 0; 161 if (sp) 162 childregs->sp = sp; 163
//设置子进程的ip为ret_from_fork这个函数的入口地址 164 p->thread.ip = (unsigned long) ret_from_fork; 165 task_user_gs(p) = get_user_gs(current_pt_regs()); 166 167 p->thread.io_bitmap_ptr = NULL; 168 tsk = current; 169 err = -ENOMEM; 170 171 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 172 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 173 IO_BITMAP_BYTES, GFP_KERNEL); 174 if (!p->thread.io_bitmap_ptr) { 175 p->thread.io_bitmap_max = 0; 176 return -ENOMEM; 177 } 178 set_tsk_thread_flag(p, TIF_IO_BITMAP); 179 } 180 181 err = 0; 182 183 /* 184 * Set a new TLS for the child thread? 185 */ 186 if (clone_flags & CLONE_SETTLS) 187 err = do_set_thread_area(p, -1, 188 (struct user_desc __user *)childregs->si, 0); 189 190 if (err && p->thread.io_bitmap_ptr) { 191 kfree(p->thread.io_bitmap_ptr); 192 p->thread.io_bitmap_max = 0; 193 } 194 return err; 195} 196
接下来我们使用gdb调试内核,跟踪fork的执行过程,来观察验证前面所述的内容:
我们从代码和跟踪中都可以看到,fork创建的新进程从ret_from_fork开始执行,在这之前,新进程把父进程的寄存器保存的值拷贝到自己的堆栈中,我们可以查看struct pt_regs这个结构,这里面保存的是SAVE_ALL保存的寄存器的值,子进程复制了这些值。
11struct pt_regs { 12 unsigned long bx; 13 unsigned long cx; 14 unsigned long dx; 15 unsigned long si; 16 unsigned long di; 17 unsigned long bp; 18 unsigned long ax; 19 unsigned long ds; 20 unsigned long es; 21 unsigned long fs; 22 unsigned long gs; 23 unsigned long orig_ax; 24 unsigned long ip; 25 unsigned long cs; 26 unsigned long flags; 27 unsigned long sp; 28 unsigned long ss; 29};
子进程创建完成后从ret_from_fork开始执行:
290ENTRY(ret_from_fork) 291 CFI_STARTPROC 292 pushl_cfi %eax 293 call schedule_tail 294 GET_THREAD_INFO(%ebp) 295 popl_cfi %eax 296 pushl_cfi $0x0202 # Reset kernel eflags 297 popfl_cfi 298 jmp syscall_exit 299 CFI_ENDPROC 300END(ret_from_fork)
最后执行这一句开始跳转:
jmp syscall_exit
跳转到syscall_exit中:
505syscall_exit: 506 LOCKDEP_SYS_EXIT 507 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 508 # setting need_resched or sigpending 509 # between sampling and the iret
之后的执行过程和系统调用执行完毕后的返回过程一样,这就是创建一个子进程的全部过程。
总结:
Linux调用fork创建进程,开始是复制进程描述符(PCB):task_struct,接着对子进程的PCB进行初始化,然后把当前CPU的状态复制给子进程,接着把保存在父进程中的用户态进程上下文环境拷贝到自己的堆栈里面,接下来就像普通系统调用一样,子进程可以返回到用户态继续执行中断前的进程上下文。
Allen 原创作品转载请注明出处《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000