简析进程地址空间的形成过程
关键词:fork、wait、execve、elf、ld.so、stack、heap等等。
本文着重分析一条命令从在shell中输入开始,到进程执行起来,其maps地址空间是如何一步步形成的。
00008000-00009000 r-xp 00000000 b3:03 15 /root/data/maps------------------------进程可执行文件首地址如何确定?ELF文件中规定。 00009000-0000a000 r--p 00000000 b3:03 15 /root/data/maps 0000a000-0000b000 rw-p 00001000 b3:03 15 /root/data/maps 0000b000-0002c000 rwxp 00000000 00:00 0 [heap]---------------------------------确定heap低地址是heap的起始地址,紧跟可执行文件的bss段。 10000000-1001d000 r-xp 00000000 00:01 5918 /lib/ld-2.28.9000.so-------------------动态加载库地址如何确定?内核mmap区域的起始地址加载ld.so。 1001d000-1001e000 r--p 0001c000 00:01 5918 /lib/ld-2.28.9000.so 1001e000-1001f000 rw-p 0001d000 00:01 5918 /lib/ld-2.28.9000.so 1001f000-10020000 r-xp 00000000 00:00 0 [vdso] 10020000-1014a000 r-xp 00000000 00:01 5910 /lib/libc-2.28.9000.so-----------------由ld.so加载的库文件,地址紧挨ld.so和[vdso]。 1014a000-1014b000 ---p 0012a000 00:01 5910 /lib/libc-2.28.9000.so 1014b000-1014d000 r--p 0012a000 00:01 5910 /lib/libc-2.28.9000.so 1014d000-1014e000 rw-p 0012c000 00:01 5910 /lib/libc-2.28.9000.so 1014e000-10153000 rw-p 00000000 00:00 0 7f9f2000-7fa13000 rwxp 00000000 00:00 0 [stack]--------------------------------确定栈高地址是栈的起始地址。
1. shell执行进程
Busybox的shell入口为ash_main()函数,最终进入cmdloop()循环读取解析命令,然后执行。
static int cmdloop(int top) { ...
for (;;) {
... n = parsecmd(inter);------------------------------------解析命令,解析结果放在全局变量中,返回命令类型。 #if DEBUG if (DEBUG > 2 && debug && (n != NODE_EOF)) showtree(n); #endif if (n == NODE_EOF) { ... } else if (nflag == 0) { int i; /* job_warning can only be 2,1,0. Here 2->1, 1/0->0 */ job_warning >>= 1; numeof = 0; i = evaltree(n, 0);---------------------------------评估解析的命令。 if (n) status = i; } ... } return status; } static int evaltree(union node *n, int flags) { int checkexit = 0; int (*evalfn)(union node *, int); int status = 0; if (n == NULL) { TRACE(("evaltree(NULL) called\n")); goto out; } TRACE(("evaltree(%p: %d, %d) called\n", n, n->type, flags)); dotrap(); switch (n->type) { default: ... case NCMD: evalfn = evalcommand;-------------------------如果是命令,则使用evalcommand()进行处理。 checkexit: if (eflag && !(flags & EV_TESTED)) checkexit = ~0; goto calleval; case NFOR: ... evalfn = evaltree; calleval: status = evalfn(n, flags); goto setstatus; } case NIF: ... setstatus: exitstatus = status; break; } out: ... return exitstatus; }
evalcommand()创建一个子进程用于执行命令,父进程进行wait等处理。
子进程调用shellexec()执行命令,最终调用execve()系统调用。
static int evalcommand(union node *cmd, int flags) { ... /* Execute the command. */ switch (cmdentry.cmdtype) { default: { ... if (!(flags & EV_EXIT) || may_have_traps) { /* No, forking off a child is necessary */ INT_OFF; get_tty_state(); jp = makejob(/*cmd,*/ 1); if (forkshell(jp, cmd, FORK_FG) != 0) {--------------fork()子进程。 /* parent */ status = waitforjob(jp);-------------------------如果是父进程则在此wait()。 INT_ON; TRACE(("forked child exited with %d\n", status)); break; } /* child */ FORCE_INT_ON; /* fall through to exec'ing external program */ } listsetvar(varlist.list, VEXPORT|VSTACK); shellexec(argv[0], argv, path, cmdentry.u.index);-----------执行argv[0]命令。 /* NOTREACHED */ } /* default */ case CMDBUILTIN: ... } /* switch */ ... return status; } static void shellexec(char *prog, char **argv, const char *path, int idx) NORETURN; static void shellexec(char *prog, char **argv, const char *path, int idx) { ... envp = listvars(VEXPORT, VUNSET, /*end:*/ NULL); if (strchr(prog, '/') != NULL #if ENABLE_FEATURE_SH_STANDALONE || (applet_no = find_applet_by_name(prog)) >= 0 #endif ) { tryexec(IF_FEATURE_SH_STANDALONE(applet_no,) prog, argv, envp); if (applet_no >= 0) { goto try_PATH; } e = errno; } else { try_PATH: e = ENOENT; while ((cmdname = path_advance(&path, prog)) != NULL) { if (--idx < 0 && pathopt == NULL) { tryexec(IF_FEATURE_SH_STANDALONE(-1,) cmdname, argv, envp); if (errno != ENOENT && errno != ENOTDIR) e = errno; } stunalloc(cmdname); } } .. } static void tryexec(IF_FEATURE_SH_STANDALONE(int applet_no,) char *cmd, char **argv, char **envp) { ... repeat: #ifdef SYSV do { execve(cmd, argv, envp); } while (errno == EINTR); #else execve(cmd, argv, envp); #endif ... }
所以执行命令最终核心是系统调用execve()。
2. execve()内核中运行可执行程序
execve()系统调用在内核的入口是do_execve()。
/*
 * execve() system call entry point.
 *
 * Turns the user-supplied path into a struct filename with getname() and
 * forwards it, together with the untouched argv/envp user pointers, to
 * do_execve().
 */
SYSCALL_DEFINE3(execve,
		const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp)
{
	struct filename *name = getname(filename);

	return do_execve(name, argv, envp);
}

/*
 * do_execve() - wrap the raw argv/envp user pointers and start the exec.
 *
 * @filename: executable name, passed through unchanged
 * @__argv:   user-space argument vector
 * @__envp:   user-space environment vector
 *
 * Each vector is stored in the .ptr.native member of a struct user_arg_ptr.
 * The designated initializers are kept on purpose so that every other member
 * of the structure starts out zeroed.  The request is then handed to
 * do_execveat_common() with AT_FDCWD as the directory fd and 0 as flags.
 *
 * Returns whatever do_execveat_common() returns.
 */
int do_execve(struct filename *filename,
	      const char __user *const __user *__argv,
	      const char __user *const __user *__envp)
{
	struct user_arg_ptr native_argv = { .ptr.native = __argv };
	struct user_arg_ptr native_envp = { .ptr.native = __envp };

	return do_execveat_common(AT_FDCWD, filename,
				  native_argv, native_envp, 0);
}
do_execveat_common()在运行可执行二进制文件前,为即将执行的新程序准备内存地址空间、选择合适的CPU,并准备程序名、参数、环境变量等信息。
核心是exec_binprm(),它用于加载可执行文件、并跳转到ld.so进行动态库加载工作。
不同格式的可执行程序通过search_binary_handler()找到合适的load_binary()进行处理。
static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); if ((current->flags & PF_NPROC_EXCEEDED) && atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; retval = unshare_files(&displaced);-------------------------------为进程复制一份文件表。 if (retval) goto out_ret; retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);------------------------分配一个struct linux_binprm结构体。 if (!bprm) goto out_files; retval = prepare_bprm_creds(bprm); if (retval) goto out_free; check_unsafe_exec(bprm); current->in_execve = 1; file = do_open_execat(fd, filename, flags);-----------------------打开可执行文件。 retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec();-----------------------------------------------------找到合适的CPU来执行任务。 bprm->file = file;------------------------------------------------填充struct linux_binprm结构体file、filename、interp成员。 if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); else pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", fd, filename->name); if (!pathbuf) { retval = -ENOMEM; goto out_unmark; } if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = pathbuf; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm);--------------------------------------创建进程的内存地址空间。 if (retval) goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out; bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) <
0) goto out; retval = prepare_binprm(bprm);-------------------------------------读取可执行文件elf头128字节。 if (retval < 0) goto out; retval = copy_strings_kernel(1, &bprm->filename, bprm);------------获取可执行文件名称。 if (retval < 0) goto out; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; would_dump(bprm, bprm->file); retval = exec_binprm(bprm);----------------------------------------运行可执行文件。 if (retval < 0) goto out; /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); kfree(pathbuf); putname(filename); if (displaced) put_files_struct(displaced); return retval; ... } static int exec_binprm(struct linux_binprm *bprm)
{ ... ret = search_binary_handler(bprm);-------------------遍历所有可执行二进制文件处理列表formats,找到合适的load_binary()并进行处理。 ... } int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) {------------------------遍历formats列表,使用成员的load_binary()加载可执行二进制程序。 if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); bprm->recursion_depth++; retval = fmt->load_binary(bprm);----------------------------调用load_binary()加载可执行二进制文件并运行。 read_lock(&binfmt_lock); put_binfmt(fmt); bprm->recursion_depth--; if (retval < 0 && !bprm->mm) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV, current); return retval; } if (retval != -ENOEXEC || !bprm->file) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); ... return retval; }
Linux通过register_binfmt()/unregister_binfmt()向formats链表注册/注销可执行二进制文件的处理函数。
下面以elf_format进行分析:
- load_elf_binary()将elf静态文件中指令和数据加载到内存中。
- elf_core_dump在进程coredump的时候将各个vma段导出以便离线分析。
- load_shlib用于动态把一个共享库捆绑到一个已经在运行的进程。
static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, }; static int load_elf_binary(struct linux_binprm *bprm) { ... /* Get the exec-header */ loc->elf_ex = *((struct elfhdr *)bprm->buf); retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)--------------------文件头魔数检查。 goto out; if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) goto out; if (!elf_check_arch(&loc->elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0; start_code = ~0UL; end_code = 0; start_data = 0; end_data = 0; for (i = 0; i < loc->elf_ex.e_phnum; i++) { if (elf_ppnt->p_type == PT_INTERP) { retval = -ENOEXEC; if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) goto out_free_ph; retval = -ENOMEM; elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) goto out_free_ph; retval = kernel_read(bprm->file, elf_ppnt->p_offset, elf_interpreter, elf_ppnt->p_filesz); if (retval != elf_ppnt->p_filesz) { if (retval >= 0) retval = -EIO; goto out_free_interp; } /* make sure path is NULL terminated */ retval = -ENOEXEC; if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) goto out_free_interp; /* * If the binary is not readable then enforce * mm->dumpable = 0 regardless of the interpreter's * permissions. 
*/ would_dump(bprm, interpreter); /* Get the exec headers */ retval = kernel_read(interpreter, 0, (void *)&loc->interp_elf_ex, sizeof(loc->interp_elf_ex)); if (retval != sizeof(loc->interp_elf_ex)) { if (retval >= 0) retval = -EIO; goto out_free_dentry; } break; } elf_ppnt++; } elf_ppnt = elf_phdata; for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) executable_stack = EXSTACK_ENABLE_X; else executable_stack = EXSTACK_DISABLE_X; break; case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) goto out_free_dentry; break; } /* Some simple consistency checks for the interpreter */ if (elf_interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_ppnt = interp_elf_phdata; for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(&loc->interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) goto out_free_dentry; break; } }... setup_new_exec(bprm);----------------------------设置进程的comm、task_size、mmap_base等信息。 install_exec_creds(bprm); retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) goto out_free_dentry; current->mm->start_stack = bprm->p; /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. 
*/ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; if (elf_ppnt->p_type != PT_LOAD) continue; if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; retval = set_brk(elf_bss + load_bias, elf_brk + load_bias); ... } ... vaddr = elf_ppnt->p_vaddr;------------从elf文件中解析出可执行文件加载的虚拟地址,也是可执行文件在maps中首地址。 ... error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } ... if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || elf_ppnt->p_memsz > TASK_SIZE || TASK_SIZE - elf_ppnt->p_memsz < k) { /* set_brk can never work. Avoid overflows. */ retval = -EINVAL; goto out_free_dentry; } k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; ... } loc->elf_ex.e_entry += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; retval = set_brk(elf_bss, elf_brk);--------------------------确定进程地址空间的brk和start_brk,也即heap的低地址。 if (retval) goto out_free_dentry; if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { retval = -EFAULT; /* Nobody gets to see this, but.. */ goto out_free_dentry; } if (elf_interpreter) { unsigned long interp_map_addr = 0; elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, &interp_map_addr, load_bias, interp_elf_phdata);-----------加载elf解释器,这里一般指ld.so文件。返回值elf_entry是下面启动线程的首地址,也即执行权限交给ld.so处理其他动态库的加载。 ... kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = arch_setup_additional_pages(bprm, !!elf_interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); if (retval < 0) goto out; /* N.B. passed_fileno might not be initialized? 
*/ current->mm->end_code = end_code;-------------------------------更新进程内存空间代码段、数据段、栈等信息。 current->mm->start_code = start_code; current->mm->start_data = start_data; current->mm->end_data = end_data; current->mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif }... start_thread(regs, elf_entry, bprm->p);------------------------将CPU指向ld.so运行,到用户空间处理动态库查找加载等工作。 retval = 0; out: kfree(loc); out_ret: return retval; /* error cleanup */ out_free_dentry: kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) fput(interpreter); out_free_interp: kfree(elf_interpreter); out_free_ph: kfree(elf_phdata); goto out; } void setup_new_exec(struct linux_binprm * bprm) { arch_pick_mmap_layout(current->mm);-----------------获取进程的mmap起始地址,也是ld.so加载的首地址。 ... __set_task_comm(current, kbasename(bprm->filename), true); current->mm->task_size = TASK_SIZE;-----------------获取进程地址空间的上限。... } static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long *interp_map_addr, unsigned long no_base, struct elf_phdr *interp_elf_phdata) { ... total_size = total_mapping_size(interp_elf_phdata, interp_elf_ex->e_phnum); if (!total_size) { error = -EINVAL; goto out; } eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; int elf_prot = 0; unsigned long vaddr = 0; unsigned long k, map_addr; if (eppnt->p_flags & PF_R) elf_prot = PROT_READ; if (eppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size);---------------------将ld.so的指令和数据段加载到内存中,由于不是MAP_FIXED,内核从mmap_base分配地址空间。 ... 
} } if (padzero(elf_bss)) { error = -EFAULT; goto out; } elf_bss = ELF_PAGEALIGN(elf_bss); last_bss = ELF_PAGEALIGN(last_bss); /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { error = vm_brk(elf_bss, last_bss - elf_bss); if (error) goto out; } error = load_addr; out: return error; } static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long map_addr; unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); addr = ELF_PAGESTART(addr); size = ELF_PAGEALIGN(size); /* mmap() will return -EINVAL if given a zero size, but a * segment with zero filesize is perfectly valid */ if (!size) return addr; if (total_size) { total_size = ELF_PAGEALIGN(total_size); map_addr = vm_mmap(filep, addr, total_size, prot, type, off);-----------将文件映射到当前进程的地址空间中。 if (!BAD_ADDR(map_addr)) vm_munmap(map_addr+size, total_size-size); } else map_addr = vm_mmap(filep, addr, size, prot, type, off); return(map_addr); }
/*
 * set_brk() - map the page-aligned range [start, end) with vm_brk() and
 * record its end as the initial program break.
 *
 * @start: low bound of the range; rounded up with ELF_PAGEALIGN()
 * @end:   high bound of the range; rounded up with ELF_PAGEALIGN()
 *
 * When the aligned range is non-empty it is handed to vm_brk().  If vm_brk()
 * reports an error, that error is returned unchanged and mm->start_brk /
 * mm->brk are left untouched.
 *
 * Returns 0 on success, otherwise the error from vm_brk().
 */
static int set_brk(unsigned long start, unsigned long end)
{
	start = ELF_PAGEALIGN(start);
	end = ELF_PAGEALIGN(end);
	if (end > start) {
		int error = vm_brk(start, end - start);
		if (error)
			return error;
	}
	/*
	 * start_brk and brk both begin at the aligned end address.  This is
	 * the low (starting) address of the heap -- not of the stack.
	 */
	current->mm->start_brk = current->mm->brk = end;
	return 0;
}
参考文档: