ko module加载flow

ko module加载flow

insmod ko都是在user space发起的,通过系统调用finit_module或者init_module来加载ko

其中finit_module系统调用是user space只传ko file的fd,kernel里根据这个fd直接读ko,将ko读到内存上来,此内存是使用vmalloc来分配的,分配的大小对应ko file完整大小;

而init_module系统调用是ko file已经读到了user space,在kernel里使用copy_from_user将ko file copy到kernel,同样,kernel里用来存放此ko file的内存是在vmalloc上,大小也是ko file完整大小。

在android系统上,看起来都是用的finit_module

读完后,info.hdr就是指向kernel里ko file的头,ko file开头是elf file header,里面包含了关于elf file的诸多信息:

/*
 * finit_module syscall: load a module from an already-open file descriptor.
 * The file behind @fd is read into a kernel buffer and handed to
 * load_module() together with the user-supplied argument string @uargs
 * and @flags.  Returns the result of load_module() or a negative errno.
 */
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
    struct load_info info = { };
    loff_t size;
    void *hdr;
    int err;

    /* Propagate any error from may_init_module() before doing any work. */
    err = may_init_module();
    if (err)
        return err;

    pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);

    /* Only the two MODULE_INIT_IGNORE_* bits are accepted in @flags. */
    if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
              |MODULE_INIT_IGNORE_VERMAGIC))
        return -EINVAL;

    /*
     * Read the whole file into a kernel buffer: hdr receives the buffer,
     * size its length.  INT_MAX presumably caps the accepted file size --
     * confirm against kernel_read_file_from_fd().
     */
    err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
                       READING_MODULE);
    if (err)
        return err;
    /* The start of the buffer is treated as the module image header. */
    info.hdr = hdr;
    info.len = size;

    return load_module(&info, uargs, flags);
}

 

 

确定每个section相对基地址的offset,为将ko里的section copy到module区做准备

如下代码借用了section header里的sh_entsize字段来暂存某个section相对layout基地址的offset(属于init_layout的section还会额外或上INIT_OFFSET_MASK作为标记)

这里有两个layout,一个是core_layout,另外一个是init_layout。core_layout包含ko里所有需要一直保留在module区的section,即带SHF_ALLOC flag且section name不是以.init开头的section;section name以.init开头的section则归入init_layout。init_layout里的内容只需要使用一次,比如module init函数前面带了__init attribute,即表示它将被放在.init.text section,module init函数执行一次就不用再执行了,所以init_layout对应的内存在init执行完后将会被free。

kernel/module.c

/*
 * Assign every SHF_ALLOC section an offset inside either mod->core_layout
 * or mod->init_layout and accumulate the total size of each layout.
 * The offset is stashed in the section header's sh_entsize field; offsets
 * belonging to init_layout are additionally tagged with INIT_OFFSET_MASK.
 * Sections are placed in the order given by masks[] so that the running
 * size after each pass yields text_size, ro_size and ro_after_init_size.
 */
static void layout_sections(struct module *mod, struct load_info *info)
{
    /*
     * One row per placement pass: masks[m][0] holds the flags a section
     * must have, masks[m][1] the flags it must not have.
     */
    static unsigned long const masks[][2] = {
        /*
         * NOTE: all executable code must be the first section
         * in this array; otherwise modify the text_size
         * finder in the two loops below
         */
        { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
        { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
        { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
        { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
        { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
    };
    unsigned int m, i;

    /* ~0UL in sh_entsize marks a section as "not placed yet". */
    for (i = 0; i < info->hdr->e_shnum; i++)
        info->sechdrs[i].sh_entsize = ~0UL;

    /* First sweep: sections that do NOT belong to the init layout. */
    pr_debug("Core section allocation order:\n");
    for (m = 0; m < ARRAY_SIZE(masks); ++m) {
        for (i = 0; i < info->hdr->e_shnum; ++i) {
            Elf_Shdr *s = &info->sechdrs[i];
            const char *sname = info->secstrings + s->sh_name;

            /*
             * Skip sections that lack a required flag, carry a
             * forbidden flag, were already placed by an earlier
             * pass, or belong to the init layout.
             */
            if ((s->sh_flags & masks[m][0]) != masks[m][0]
                || (s->sh_flags & masks[m][1])
                || s->sh_entsize != ~0UL
                || module_init_layout_section(sname))
                continue;
            s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
            pr_debug("\t%s\n", sname);
        }
        /* Record the running size as the boundary of the region just laid out. */
        switch (m) {
        case 0: /* executable */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.text_size = mod->core_layout.size;
            break;
        case 1: /* RO: text and ro-data */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.ro_size = mod->core_layout.size;
            break;
        case 2: /* RO after init */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.ro_after_init_size = mod->core_layout.size;
            break;
        case 4: /* whole core */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            break;
        }
    }

    /* Second sweep: same passes, but only for init-layout sections. */
    pr_debug("Init section allocation order:\n");
    for (m = 0; m < ARRAY_SIZE(masks); ++m) {
        for (i = 0; i < info->hdr->e_shnum; ++i) {
            Elf_Shdr *s = &info->sechdrs[i];
            const char *sname = info->secstrings + s->sh_name;

            if ((s->sh_flags & masks[m][0]) != masks[m][0]
                || (s->sh_flags & masks[m][1])
                || s->sh_entsize != ~0UL
                || !module_init_layout_section(sname))
                continue;
            /* Tag the offset so later code can tell it is init-relative. */
            s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
                     | INIT_OFFSET_MASK);
            pr_debug("\t%s\n", sname);
        }
        switch (m) {
        case 0: /* executable */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            mod->init_layout.text_size = mod->init_layout.size;
            break;
        case 1: /* RO: text and ro-data */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            mod->init_layout.ro_size = mod->init_layout.size;
            break;
        case 2:
            /*
             * RO after init doesn't apply to init_layout (only
             * core_layout), so it just takes the value of ro_size.
             */
            mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
            break;
        case 4: /* whole init */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            break;
        }
    }
}

 

 

copy sections

在move_module()里执行copy section的动作,mod->core_layout.size、mod->init_layout.size是在layout_sections等函数里累加确定的

在move_module()里执行module_alloc()将在module区分配虚拟地址空间,确保copy section到module区。

在上述module_alloc分配的base地址的基础上加上section对应的offset(shdr->sh_entsize)就得到了这个section copy dest address,之后执行memcpy将这个section copy到module区。

所以对于kernel module,其.text,.data,.rodata等section都是位于module区间,它们所占用的内存是一起申请的。对于所有的module都是一样的,所以在module区间内,是按照一个个module layout的,一个module内部包含.text/.data/.rodata等section,如下示意图。

同时init_layout和core_layout都是在module区里申请的内存,只是init_layout里包含的section会在执行完一次后被free以释放空间;而core_layout则是一直在module区,直到rmmod才会从module区释放。

 

	-------------------------------------------------
			|     .text      |       |
	    moduleA     |     .data      |       | 
			|     .rodata    |       |
			|     ...        |       
	----------------|----------------|  module region
			|     .text      |	     
	    moduleB     |     .data      |       |
			|     .rodata    |       |
			|     ...        |       |
	-------------------------------------------------

 

kernel/module.c

/*
 * Allocate the core (and, if non-empty, init) layout with module_alloc(),
 * zero them, and copy every SHF_ALLOC section to base + offset, where the
 * offset was previously stored in sh_entsize.  Afterwards each copied
 * section's sh_addr points at its final location.
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int move_module(struct module *mod, struct load_info *info)
{
    int i;
    void *ptr;

    /* Do the allocs. */
    ptr = module_alloc(mod->core_layout.size);
    /*
     * The pointer to this block is stored in the module structure
     * which is inside the block. Just mark it as not being a
     * leak.
     */
    /* NOTE(review): called before the NULL check; presumably kmemleak tolerates NULL -- confirm. */
    kmemleak_not_leak(ptr);
    if (!ptr)
        return -ENOMEM;

    memset(ptr, 0, mod->core_layout.size);
    mod->core_layout.base = ptr;

    /* The init layout is optional: only allocate it when it has content. */
    if (mod->init_layout.size) {
        ptr = module_alloc(mod->init_layout.size);
        /*
         * The pointer to this block is stored in the module structure
         * which is inside the block. This block doesn't need to be
         * scanned as it contains data and code that will be freed
         * after the module is initialized.
         */
        kmemleak_ignore(ptr);
        if (!ptr) {
            /* Undo the core allocation so nothing is leaked on failure. */
            module_memfree(mod->core_layout.base);
            return -ENOMEM;
        }
        memset(ptr, 0, mod->init_layout.size);
        mod->init_layout.base = ptr;
    } else
        mod->init_layout.base = NULL;

    /* Transfer each section which specifies SHF_ALLOC */
    pr_debug("final section addresses:\n");
    for (i = 0; i < info->hdr->e_shnum; i++) {
        void *dest;
        Elf_Shdr *shdr = &info->sechdrs[i];

        if (!(shdr->sh_flags & SHF_ALLOC))
            continue;

        /* INIT_OFFSET_MASK in sh_entsize selects the init layout; strip it to get the offset. */
        if (shdr->sh_entsize & INIT_OFFSET_MASK)
            dest = mod->init_layout.base
                + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
        else
            dest = mod->core_layout.base + shdr->sh_entsize;

        /*
         * SHT_NOBITS sections have nothing to copy; the destination was
         * already zeroed by the memset() above.  sh_addr is used as the
         * copy source -- presumably it still points into the temporary
         * image at this stage; confirm against the caller.
         */
        if (shdr->sh_type != SHT_NOBITS)
            memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
        /* Update sh_addr to point to copy in image. */
        shdr->sh_addr = (unsigned long)dest;
        pr_debug("\t0x%lx %s\n",
             (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
    }

    return 0;
}

 

来看下上述module_alloc是如何确保是在module虚拟地址空间里alloc的。

arch/arm64/kernel/module.c

/*
 * Allocate @size bytes of module memory.  The first attempt is confined to
 * the range [module_alloc_base, module_alloc_end); when that fails and
 * CONFIG_ARM64_MODULE_PLTS is enabled (subject to the KASAN conditions
 * below) a second attempt is made in a 2 GiB window starting at
 * module_alloc_base.  Returns the allocation or NULL on failure.
 */
void *module_alloc(unsigned long size)
{
    u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
    gfp_t gfp_mask = GFP_KERNEL;
    void *p;

    /* Silence the initial allocation */
    if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
        gfp_mask |= __GFP_NOWARN;

    if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
        IS_ENABLED(CONFIG_KASAN_SW_TAGS))
        /* don't exceed the static module region - see below */
        module_alloc_end = MODULES_END;

    /* First attempt: restricted to the module region computed above. */
    p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
                module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
                NUMA_NO_NODE, __builtin_return_address(0));

    /* Fallback: widen the window to 2 GiB, this time without __GFP_NOWARN. */
    if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
        (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
         (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
          !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
        /*
         * KASAN without KASAN_VMALLOC can only deal with module
         * allocations being served from the reserved module region,
         * since the remainder of the vmalloc region is already
         * backed by zero shadow pages, and punching holes into it
         * is non-trivial. Since the module region is not randomized
         * when KASAN is enabled without KASAN_VMALLOC, it is even
         * less likely that the module region gets exhausted, so we
         * can simply omit this fallback in that case.
         */
        p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
                module_alloc_base + SZ_2G, GFP_KERNEL,
                PAGE_KERNEL, 0, NUMA_NO_NODE,
                __builtin_return_address(0));

    /* If kasan_module_alloc() fails, release the allocation and report failure. */
    if (p && (kasan_module_alloc(p, size) < 0)) {
        vfree(p);
        return NULL;
    }

    return p;
}

 

在这个函数里调用__vmalloc_node_range传的range的起始地址是module_alloc_base,看下这个变量是怎么赋值的,在没有开启kaslr的情况下它的值为_etext - MODULES_VSIZE,MODULES_VSIZE在ARM64平台上定义为128M

/*
 * Start of the range module_alloc() allocates from.
 */
#ifdef CONFIG_RANDOMIZE_BASE
/* With CONFIG_RANDOMIZE_BASE it is a variable defined elsewhere. */
extern u64 module_alloc_base;
#else
/* Otherwise it is the constant MODULES_VSIZE bytes below _etext. */
#define module_alloc_base    ((u64)_etext - MODULES_VSIZE)
#endif

 

 

init_layout什么时候被free

在load_module()的最后,会调用do_init_module(),这个函数将会调用module的init函数。在do_init_module()的后面将init_layout给free掉:

static noinline int do_init_module(struct module *mod)
{
    int ret = 0;
    struct mod_initfree *freeinit;

    freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
    if (!freeinit) {
        ret = -ENOMEM;
        goto fail;
    }
    freeinit->module_init = mod->init_layout.base;

    /*
     * We want to find out whether @mod uses async during init.  Clear
     * PF_USED_ASYNC.  async_schedule*() will set it.
     */
    current->flags &= ~PF_USED_ASYNC;

    do_mod_ctors(mod);
    /* Start the module */
    if (mod->init != NULL)
        ret = do_one_initcall(mod->init);
    if (ret < 0) {
        goto fail_free_freeinit;
    }
        ...
    mod->init_layout.base = NULL;
    mod->init_layout.size = 0;
    mod->init_layout.ro_size = 0;
    mod->init_layout.ro_after_init_size = 0;
    mod->init_layout.text_size = 0;
        ...
    call_rcu_sched(&freeinit->rcu, do_free_init);

 

/*
 * RCU callback: @head is the rcu member embedded in a struct mod_initfree.
 * Frees the memory recorded in its module_init field, then frees the
 * mod_initfree record itself.
 */
static void do_free_init(struct rcu_head *head)
{
    /* Recover the enclosing mod_initfree from its embedded rcu_head. */
    struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
    module_memfree(m->module_init);
    kfree(m);
}

 

 module_init macro

这个macro的定义有两种情况,取决于编译时MODULE有没有被define:build进kernel(built-in)时,MODULE没有define;build成ko时,MODULE为defined

1.当为built-in kernel时

module_init(fn)对应:

include/linux/init.h

#define ___define_initcall(fn, id, __sec) \
    static initcall_t __initcall_##fn##id __used \
        __attribute__((__section__(#__sec ".init"))) = fn;

#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
#define device_initcall(fn)        __define_initcall(fn, 6)

 

 

对于module_init(hello_init)的展开如下:

    /*
     * Expansion of ___define_initcall(hello_init, 6, .initcall6):
     * #__sec ".init" concatenates ".initcall6" and ".init", so the
     * section name keeps its leading dot.
     */
    static initcall_t __initcall_hello_init6 __used
        __attribute__((__section__(".initcall6.init"))) = hello_init;

 

initcall_t为一个函数指针类型:

/* Pointer to an init function that takes no arguments and returns int. */
typedef int (*initcall_t)(void);

 

所以是定义了一个类型为initcall_t的函数指针,并赋值为module init function hello_init()

并且这个函数指针变量是放在.initcall6.init section

2.当build成ko时

/*
 * module_init() as shown for the module (ko) build:
 *  - __inittest() returns initfn as an initcall_t, so a mismatching
 *    function type is diagnosed at compile time;
 *  - init_module() is declared as an alias of initfn, carrying initfn's
 *    attributes via __copy().
 */
#define module_init(initfn)                    \
    static inline initcall_t __maybe_unused __inittest(void)        \
    { return initfn; }                    \
    int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));

 

此时在这个macro里define了一个__inittest()函数,这个函数只是用来在编译时提示你写的module init函数类型是否为initcall_t,如果不同,则会报build warning,比如你写的module init函数是void (*)(void)或者int (*)(int)都会报build warning

下面的init_module,这里是给你的module init函数起了一个别名,方便在insmod ko时kernel调用。每一个module,在编译的时候会生成一个*.mod.c的文件,在这个文件里会define一个struct module结构体,在这个结构体里给init成员赋值的就是init_module,而init指针就是kernel会call的,其类型与initcall_t一样。

__copy是copy initfn的attribute,后面的__attribute__((alias(#initfn)))即是把init_module声明为initfn的别名,也就是说init_module这个symbol指向的就是module init函数:

/*
 * The module's struct module instance, placed in the
 * .gnu.linkonce.this_module section.  .init is set to init_module;
 * .exit is set to cleanup_module only when CONFIG_MODULE_UNLOAD is enabled.
 */
__visible struct module __this_module
__attribute__((section(".gnu.linkonce.this_module"))) = {
        .name = KBUILD_MODNAME,
        .init = init_module,
#ifdef CONFIG_MODULE_UNLOAD
        .exit = cleanup_module,
#endif
        .arch = MODULE_ARCH_INIT,
};

 

上述__this_module结构体是位于.gnu.linkonce.this_module section,这个用readelf -SW可以查看ko里是有这个section的。

 

总结

加载ko的flow有两个copy,一个copy是将ko file完整地copy到kernel vmalloc区;然后解析此elf file,确定各个section相对于core_layout/init_layout base address的offset;然后根据这个offset将elf file里的section copy到module区;最后调用do_init_module()执行module init函数,之后将init_layout free。

 

posted @ 2021-11-07 22:35  aspirs  阅读(267)  评论(0编辑  收藏  举报