module加载过程初步分析[更新中]【转】
转自:http://blog.chinaunix.net/uid-1817735-id-2837068.html
分析模块的加载过程,有助于我们在加载模块出错时判断问题大致出在哪个环节。
下面直接从 sys_init_module() 系统调用开始分析。
该函数的实现位于 kernel/module.c 中:
/* This is where the real work happens */
/*
 * sys_init_module - load a module image and run its init routine.
 * @umod:  user-space pointer to the module image (handed to load_module())
 * @len:   length of that image (handed to load_module())
 * @uargs: user-space argument string (handed to load_module())
 *
 * Returns 0 on success. Returns -EPERM when the caller lacks
 * CAP_SYS_MODULE, -EINTR when module_mutex could not be taken, the error
 * encoded in load_module()'s result when loading fails, or the negative
 * value returned by the module's init routine (in which case the module
 * is freed again before returning).
 */
asmlinkage long
sys_init_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
struct module *mod;
int ret = 0;
/* Must have permission */
if (!capable(CAP_SYS_MODULE))
return -EPERM; // caller lacks CAP_SYS_MODULE
/* Only one module load at a time, please */
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR; // module_mutex was not acquired; loads are serialized by this mutex
/* Do all the hard work */
mod = load_module(umod, len, uargs); // yields a struct module * or an ERR_PTR-encoded error
if (IS_ERR(mod)) {
mutex_unlock(&module_mutex);
return PTR_ERR(mod);
}
/* Drop lock so they can recurse */
mutex_unlock(&module_mutex);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod); // notify module_notify_list that a module is coming
/* Start the module */
if (mod->init != NULL)
ret = do_one_initcall(mod->init); // run the module's init routine
if (ret < 0) {
/* Init routine failed: abort. Try to protect us from
buggy refcounters. */
mod->state = MODULE_STATE_GOING; // mark the module as going away
synchronize_sched();
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod); // notify module_notify_list that the module is going away
/* free_module() is called with module_mutex held */
mutex_lock(&module_mutex);
free_module(mod);
mutex_unlock(&module_mutex);
wake_up(&module_wq);
return ret;
}
/* A positive return from init is only warned about; loading continues */
if (ret > 0) {
printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
"it should follow 0/-E convention\n"
KERN_WARNING "%s: loading module anyway...\n",
__func__, mod->name, ret,
__func__);
dump_stack();
}
/* Now it's a first class citizen! Wake up anyone waiting for it. */
mod->state = MODULE_STATE_LIVE; // the module may be used from here on
wake_up(&module_wq);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod); // NOTE(annotator): init has run and is not needed again; open question in the original notes: how does a driver's .probe get invoked?
unwind_remove_table(mod->unwind_info, 1);
module_free(mod, mod->module_init); // release the init region pointed to by mod->module_init
/* Clear the init bookkeeping now that the region has been freed */
mod->module_init = NULL;
mod->init_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
return 0;
}
从上面的代码和注释可以看出:首先检查调用者是否具有 CAP_SYS_MODULE 权限,没有则返回 -EPERM;权限检查通过后获取 module_mutex,以保证同一时刻只有一个模块在加载,如果没能拿到该锁(例如等待过程被信号打断),则返回 -EINTR。
接下来是一个非常关键的调用 load_module():该函数将模块映像从用户空间拷贝到内核空间,解析这个 ELF 文件并完成重定位,以便内核使用。该函数同样定义在 kernel/module.c 文件中:
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
/*
 * load_module - copy a module image in from user space and lay it out.
 * @umod:  user-space pointer to the module image (an ELF relocatable file)
 * @len:   length of the image in bytes
 * @uargs: user-space pointer to the module argument string
 *
 * Visible steps: copy the image into a temporary vmalloc buffer, validate
 * the ELF header and section bounds, locate the well-known sections,
 * check version/vermagic information, allocate the core and init regions,
 * copy the SHF_ALLOC sections into them, resolve symbols, apply
 * relocations, link the module into the lists, parse its arguments and
 * set up sysfs entries.
 *
 * Returns the struct module pointer inside the newly allocated image on
 * success (the temporary copy is freed), or ERR_PTR(err) on failure after
 * unwinding through the labels at the end of the function.
 */
static noinline struct module *load_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
Elf_Ehdr *hdr;
Elf_Shdr *sechdrs;
char *secstrings, *args, *modmagic, *strtab = NULL;
unsigned int i;
unsigned int symindex = 0;
unsigned int strindex = 0;
unsigned int setupindex;
unsigned int exindex;
unsigned int exportindex;
unsigned int modindex;
unsigned int obsparmindex;
unsigned int infoindex;
unsigned int gplindex;
unsigned int crcindex;
unsigned int gplcrcindex;
unsigned int versindex;
unsigned int pcpuindex;
unsigned int gplfutureindex;
unsigned int gplfuturecrcindex;
unsigned int unwindex = 0;
#ifdef CONFIG_UNUSED_SYMBOLS
unsigned int unusedindex;
unsigned int unusedcrcindex;
unsigned int unusedgplindex;
unsigned int unusedgplcrcindex;
#endif
unsigned int markersindex;
unsigned int markersstringsindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
struct exception_table_entry *extable;
mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);
/* The image must be at least large enough to hold an ELF header */
if (len < sizeof(*hdr))
return ERR_PTR(-ENOEXEC);
/* Suck in entire file: we'll want most of it. */
/* vmalloc barfs on "unusual" numbers. Check here */
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return ERR_PTR(-ENOMEM);
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
}
/* Sanity checks against insmoding binaries or wrong arch,
weird elf version */
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 // the image must start with the ELF magic
/*
Elf_Ehdr = ELF header
unsigned char e_ident[EI_NIDENT]; // EI_NIDENT = 16
EI_MAG0 0 file identification
EI_MAG1 1 file identification
EI_MAG2 2 file identification
EI_MAG3 3 file identification
EI_CLASS 4 file class
EI_DATA 5 data encoding
EI_VERSION 6 file version
EI_PAD 7 start of padding bytes
EI_NIDENT 16 size of e_ident[]
*/
|| hdr->e_type != ET_REL // e_type: object file type; must be relocatable
/*
ET_NONE = 0 no file type
ET_REL = 1 relocatable file
ET_EXEC = 2 executable file
ET_DYN = 3 shared object file
ET_CORE = 4 core file (dump format)
ET_LOPROC = 0xff00 processor-specific
ET_HIPROC = 0xffff processor-specific
Values between ET_LOPROC and ET_HIPROC identify processor-specific
file formats
*/
|| !elf_check_arch(hdr)
|| hdr->e_shentsize != sizeof(*sechdrs)) {// e_shentsize: size of one section header table entry
err = -ENOEXEC;
goto free_hdr;
}
/* NOTE(review): the sum below is not checked for wrap-around here. */
if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) // sh = section header
// e_shoff: byte offset of the section header table; may be 0 if the file has none
// e_shnum: number of entries in the section header table; may be 0
goto truncated;
/* Convenience variables */
sechdrs = (void *)hdr + hdr->e_shoff; // start of the section header table
/* NOTE(review): e_shstrndx is used as an index without a range check in this function. */
secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
/*
e_shstrndx: section header table index of the entry associated with the
section name string table; SHN_UNDEF if the file has no such table.
sh_offset: byte offset from the beginning of the file to the first byte
of the section.
Together they locate the section name strings.
*/
sechdrs[0].sh_addr = 0;
for (i = 1; i < hdr->e_shnum; i++) { // walk every section header except index 0
if (sechdrs[i].sh_type != SHT_NOBITS // sh_type categorizes the section's contents and semantics
/*
SHT_NULL = 0
SHT_PROGBITS = 1 information defined by the program; format and meaning
are determined solely by the program
SHT_SYMTAB = 2 a symbol table
SHT_STRTAB = 3 a string table; a file may have several
SHT_RELA = 4 relocation entries with explicit addends
SHT_HASH = 5 a symbol hash table
SHT_DYNAMIC = 6
.....
SHT_NOBITS = 8 occupies no space in the file
SHT_REL = 9 relocation entries without explicit addends
.....
*/
&& len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
goto truncated;
/* Mark all sections sh_addr with their address in the
temporary image. */
sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
// sh_addr: normally the address at which the section's first byte
// appears in the memory image; here it is set to the section's address
// inside the temporary copy.
/* Internal symbols and strings. */
if (sechdrs[i].sh_type == SHT_SYMTAB) {
symindex = i;
strindex = sechdrs[i].sh_link; // sh_link: a section header table index link whose
// interpretation depends on the section type
/* NOTE(review): strindex (sh_link) is not range-checked before this use. */
strtab = (char *)hdr + sechdrs[strindex].sh_offset;
}
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; // clearing SHF_ALLOC makes the copy loop below skip this section
/*
sh_flags describes section attributes such as whether the contents are
writable or executable. A set bit has value 1; undefined bits are 0.
SHF_WRITE 0x1 data that is writable during process execution
SHF_ALLOC 0x2 occupies memory during process execution; some
control sections do not reside in the memory image
and have this bit cleared
SHF_EXECINSTR 0x4 executable machine instructions
SHF_MASKPROC 0xf0000000 bits in this mask are reserved for
processor-specific semantics
*/
#endif
}
modindex = find_sec(hdr, sechdrs, secstrings,
".gnu.linkonce.this_module"); // the section can be seen with: objdump -x <module>.ko
if (!modindex) {
printk(KERN_WARNING "No module found in object\n");
err = -ENOEXEC; // exec format error
goto free_hdr;
}
/* For now mod points into the temporary image */
mod = (void *)sechdrs[modindex].sh_addr;
if (symindex == 0) {
printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
mod->name);
err = -ENOEXEC;
goto free_hdr;
}
/* Optional sections */
exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
#ifdef CONFIG_UNUSED_SYMBOLS
unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
#endif
setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); // NOTE(annotator): returns 0 when !CONFIG_SMP -- confirm against find_pcpusec()
#ifdef ARCH_UNWIND_SECTION_NAME
unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
#endif
/* Don't keep modinfo and version sections. */
sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
#ifdef CONFIG_KALLSYMS
/* Keep symbol and string tables for decoding later. */
sechdrs[symindex].sh_flags |= SHF_ALLOC;
sechdrs[strindex].sh_flags |= SHF_ALLOC;
#endif
if (unwindex)
sechdrs[unwindex].sh_flags |= SHF_ALLOC;
/* Check module struct version now, before we try to use module. */
if (!check_modstruct_version(sechdrs, versindex, mod)) {// verify the module struct version
err = -ENOEXEC;
goto free_hdr;
}
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "magic");
if (err)
goto free_hdr;
} else if (!same_magic(modmagic, vermagic, versindex)) {
printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
err = -ENOEXEC;
goto free_hdr;
}
/* Now copy in args */
args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(args)) {
err = PTR_ERR(args);
goto free_hdr;
}
if (find_module(mod->name)) {
err = -EEXIST; // find_module() already has an entry for this name
goto free_mod;
}
mod->state = MODULE_STATE_COMING; // mark the module as being loaded
/* Allow arches to frob section contents and sizes. */
err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); // NOTE(annotator): nothing to do on MIPS -- confirm per architecture
if (err < 0)
goto free_mod;
if (pcpuindex) { // NOTE(annotator): nothing to do on !SMP builds
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
sechdrs[pcpuindex].sh_addralign,
mod->name);
if (!percpu) {
err = -ENOMEM;
goto free_mod;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
}
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
special cases for the architectures. */
layout_sections(mod, hdr, sechdrs, secstrings); // presumably fills mod->core_size and mod->init_size, which are read just below
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);// allocate the core region
if (!ptr) {
err = -ENOMEM;
goto free_percpu;
}
memset(ptr, 0, mod->core_size);
mod->module_core = ptr; // address of the core region
ptr = module_alloc_update_bounds(mod->init_size);
/* A NULL result is only treated as an error when init_size is non-zero */
if (!ptr && mod->init_size) {
err = -ENOMEM;
goto free_core;
}
memset(ptr, 0, mod->init_size);
mod->module_init = ptr; // address of the init region
/* Transfer each section which specifies SHF_ALLOC */
DEBUGP("final section addresses:\n");
for (i = 0; i < hdr->e_shnum; i++) {
void *dest;
if (!(sechdrs[i].sh_flags & SHF_ALLOC))
continue;
/* sh_entsize holds the section's offset; INIT_OFFSET_MASK selects the init region */
if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
dest = mod->module_init
+ (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
else
dest = mod->module_core + sechdrs[i].sh_entsize;
if (sechdrs[i].sh_type != SHT_NOBITS)
memcpy(dest, (void *)sechdrs[i].sh_addr,
sechdrs[i].sh_size); // copy the section contents to their final location
/* Update sh_addr to point to copy in image. */
sechdrs[i].sh_addr = (unsigned long)dest; // sh_addr now refers to the final location
DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
}
/* Module has been moved. */
mod = (void *)sechdrs[modindex].sh_addr; // re-point mod at the copied module struct
/* Now we've moved module, initialize linked lists, etc. */
module_unload_init(mod);
/* add kobject, so we can reference it. */
err = mod_sysfs_init(mod);
if (err)
goto free_unload;
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
* using GPL-only symbols it needs.
*/
if (strcmp(mod->name, "ndiswrapper") == 0)
add_taint(TAINT_PROPRIETARY_MODULE);
/* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, sechdrs, infoindex);
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
mod);
if (err < 0)
goto cleanup;
/* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */
mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms);
mod->syms = (void *)sechdrs[exportindex].sh_addr;
if (crcindex)
mod->crcs = (void *)sechdrs[crcindex].sh_addr;
mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms);
mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
if (gplcrcindex)
mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
sizeof(*mod->gpl_future_syms);
mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
if (gplfuturecrcindex)
mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
#ifdef CONFIG_UNUSED_SYMBOLS
mod->num_unused_syms = sechdrs[unusedindex].sh_size /
sizeof(*mod->unused_syms);
mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
sizeof(*mod->unused_gpl_syms);
mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
if (unusedcrcindex)
mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
if (unusedgplcrcindex)
mod->unused_gpl_crcs
= (void *)sechdrs[unusedgplcrcindex].sh_addr;
#endif
#ifdef CONFIG_MODVERSIONS
/* Exported symbols without a matching CRC section: load only if forced */
if ((mod->num_syms && !crcindex)
|| (mod->num_gpl_syms && !gplcrcindex)
|| (mod->num_gpl_future_syms && !gplfuturecrcindex)
#ifdef CONFIG_UNUSED_SYMBOLS
|| (mod->num_unused_syms && !unusedcrcindex)
|| (mod->num_unused_gpl_syms && !unusedgplcrcindex)
#endif
) {
printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
err = try_to_force_load(mod, "nocrc");
if (err)
goto cleanup;
}
#endif
markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
markersstringsindex = find_sec(hdr, sechdrs, secstrings,
"__markers_strings");
/* Now do relocations. */
for (i = 1; i < hdr->e_shnum; i++) {
const char *strtab = (char *)sechdrs[strindex].sh_addr;
unsigned int info = sechdrs[i].sh_info;
/* Not a valid relocation section? */
if (info >= hdr->e_shnum)
continue;
/* Don't bother with non-allocated sections */
if (!(sechdrs[info].sh_flags & SHF_ALLOC))
continue;
if (sechdrs[i].sh_type == SHT_REL) // relocation section without explicit addends
err = apply_relocate(sechdrs, strtab, symindex, i,mod);
else if (sechdrs[i].sh_type == SHT_RELA) // relocation section with explicit addends
err = apply_relocate_add(sechdrs, strtab, symindex, i,
mod);
if (err < 0)
goto cleanup;
}
#ifdef CONFIG_MARKERS
mod->markers = (void *)sechdrs[markersindex].sh_addr;
mod->num_markers =
sechdrs[markersindex].sh_size / sizeof(*mod->markers);
#endif
/* Find duplicate symbols */
err = verify_export_symbols(mod);
if (err < 0)
goto cleanup;
/* Set up and sort exception table */ // NOTE(annotator): presumably the table used by the kernel's fault/exception fix-up handling -- confirm
mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable);
mod->extable = extable = (void *)sechdrs[exindex].sh_addr;
sort_extable(extable, extable + mod->num_exentries);
/* Finally, copy percpu area over. */
percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
sechdrs[pcpuindex].sh_size);
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
#ifdef CONFIG_MARKERS
if (!mod->taints)
marker_update_probe_range(mod->markers,
mod->markers + mod->num_markers);
#endif
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
/* flush the icache in correct context */
old_fs = get_fs(); // save the current value so it can be restored by set_fs(old_fs) below
set_fs(KERNEL_DS);
/*
* Flush the instruction cache, since we've played with text.
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
if (mod->module_init)
flush_icache_range((unsigned long)mod->module_init,
(unsigned long)mod->module_init
+ mod->init_size); // flush the icache over the init region
flush_icache_range((unsigned long)mod->module_core,
(unsigned long)mod->module_core + mod->core_size);
set_fs(old_fs);
mod->args = args;
if (obsparmindex)
printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
mod->name);
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. Noone should access us, since
* strong_try_module_get() will fail. */
stop_machine(__link_module, mod, NULL); // runs __link_module on mod via stop_machine(); per the comment above this links the module into the lists
/* Size of section 0 is 0, so this works well if no params */
err = parse_args(mod->name, mod->args,
(struct kernel_param *)
sechdrs[setupindex].sh_addr,
sechdrs[setupindex].sh_size
/ sizeof(struct kernel_param),
NULL);
if (err < 0)
goto unlink;
err = mod_sysfs_setup(mod,
(struct kernel_param *)
sechdrs[setupindex].sh_addr,
sechdrs[setupindex].sh_size
/ sizeof(struct kernel_param));
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
/* Size of section 0 is 0, so this works well if no unwind info. */
mod->unwind_info = unwind_add_table(mod,
(void *)sechdrs[unwindex].sh_addr,
sechdrs[unwindex].sh_size);
/* Get rid of temporary copy */
vfree(hdr); // free the temporary copy of the image
/* Done! */
return mod;
/*
 * Error unwinding: each label undoes one stage and falls through to the
 * labels below it, so a failure releases everything acquired so far.
 */
unlink:
stop_machine(__unlink_module, mod, NULL);
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
kobject_put(&mod->mkobj.kobj);
free_unload:
module_unload_free(mod);
module_free(mod, mod->module_init);
free_core:
module_free(mod, mod->module_core);
free_percpu:
if (percpu)
percpu_modfree(percpu);
free_mod:
kfree(args);
free_hdr:
vfree(hdr);
return ERR_PTR(err);
/* Reached when a header or section extends past the end of the image */
truncated:
printk(KERN_ERR "Module len %lu truncated\n", len);
err = -ENOEXEC;
goto free_hdr;
}