ko module加载flow

insmod ko都是在user space发起的，通过系统调用finit_module或者init_module来加载ko

其中finit_module系统调用是user space只传ko file的fd，kernel里根据这个fd直接读ko，将ko读到内存上来，此内存是使用vmalloc来分配的，分配的大小对应ko file完整大小；

而init_module系统调用是ko file已经读到了user space，在kernel里使用copy_from_user将ko file copy到kernel，同样，kernel里用来存放此ko file的内存是在vmalloc上，大小也是ko file完整大小。

在android系统上，看起来都是用的finit_module

读完后，info.hdr就是指向kernel里ko file的头，ko file开头是elf file header，里面包含了关于elf file的诸多信息：

/*
 * finit_module() - load a module from an already-open file descriptor.
 * @fd:    descriptor of the ko file to read
 * @uargs: user-space pointer, passed through unchanged to load_module()
 * @flags: only MODULE_INIT_IGNORE_MODVERSIONS and
 *         MODULE_INIT_IGNORE_VERMAGIC are accepted
 *
 * Returns the error from may_init_module() or kernel_read_file_from_fd()
 * if either fails, -EINVAL if any other flag bit is set, otherwise
 * whatever load_module() returns.
 */
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
    struct load_info info = { };
    loff_t size;
    void *hdr;
    int err;

    /* Bail out early if the caller is not allowed to load modules. */
    err = may_init_module();
    if (err)
        return err;

    pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);

    /* Reject any flag bit outside the two supported ones. */
    if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
              |MODULE_INIT_IGNORE_VERMAGIC))
        return -EINVAL;

    /*
     * Read the file behind fd into a kernel buffer: hdr receives the
     * buffer and size its length.
     * NOTE(review): INT_MAX is presumably the maximum file size accepted -
     * confirm against kernel_read_file_from_fd().
     */
    err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
                       READING_MODULE);
    if (err)
        return err;
    /* info.hdr now points at the start of the in-kernel copy of the file. */
    info.hdr = hdr;
    info.len = size;

    return load_module(&info, uargs, flags);
}

确定每个section相对基地址的offset，为将ko里的section copy到module区做准备

如下sh_entsize即表示某个section相对基地址的offset

这里有两个layout，一个是core_layout，另外一个是init_layout。core_layout包含ko里所有需要常驻module区的section，这些section从名字上就可以看出来，即section name不是以.init开头的section；如果section name是以.init开头，这样的section就对应init_layout。init_layout里的内容只需要执行一次，比如module init函数前面带了__init attribute，即表示它将被放在.init.text section，module init函数执行一次就不用再执行了，它在执行完后对应的内存将会被free。

kernel/module.c

/*
 * layout_sections() - assign every section its offset inside the module's
 * core_layout or init_layout.
 *
 * The sh_entsize field of each section header is reused to hold that
 * offset; ~0UL means "not placed yet". Sections for which
 * module_init_layout_section() returns true are placed in init_layout and
 * get INIT_OFFSET_MASK OR-ed into the stored offset; all other sections are
 * placed in core_layout. The sections are walked once per masks[] row, and
 * after rows 0, 1 and 2 the running layout size is recorded as text_size,
 * ro_size and ro_after_init_size respectively.
 */
static void layout_sections(struct module *mod, struct load_info *info)
{
    /*
     * Per row: masks[m][0] holds the flags a section must have all set,
     * masks[m][1] the flags it must have all clear, to be placed in pass m.
     */
    static unsigned long const masks[][2] = {
        /*
         * NOTE: all executable code must be the first section
         * in this array; otherwise modify the text_size
         * finder in the two loops below
         */
        { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
        { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
        { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
        { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
        { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
    };
    unsigned int m, i;

    /* Mark every section as not yet placed. */
    for (i = 0; i < info->hdr->e_shnum; i++)
        info->sechdrs[i].sh_entsize = ~0UL;

    pr_debug("Core section allocation order:\n");
    for (m = 0; m < ARRAY_SIZE(masks); ++m) {
        for (i = 0; i < info->hdr->e_shnum; ++i) {
            Elf_Shdr *s = &info->sechdrs[i];
            const char *sname = info->secstrings + s->sh_name;

            /*
             * Skip sections that do not match this row, were already
             * placed by an earlier row, or belong to the init layout.
             */
            if ((s->sh_flags & masks[m][0]) != masks[m][0]
                || (s->sh_flags & masks[m][1])
                || s->sh_entsize != ~0UL
                || module_init_layout_section(sname))
                continue;
            /* Store the section's offset within core_layout. */
            s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
            pr_debug("\t%s\n", sname);
        }
        /* Record the region boundary reached after this pass (row 3 records none). */
        switch (m) {
        case 0: /* executable */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.text_size = mod->core_layout.size;
            break;
        case 1: /* RO: text and ro-data */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.ro_size = mod->core_layout.size;
            break;
        case 2: /* RO after init */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            mod->core_layout.ro_after_init_size = mod->core_layout.size;
            break;
        case 4: /* whole core */
            mod->core_layout.size = debug_align(mod->core_layout.size);
            break;
        }
    }

    /* Same walk again, this time placing only the init-layout sections. */
    pr_debug("Init section allocation order:\n");
    for (m = 0; m < ARRAY_SIZE(masks); ++m) {
        for (i = 0; i < info->hdr->e_shnum; ++i) {
            Elf_Shdr *s = &info->sechdrs[i];
            const char *sname = info->secstrings + s->sh_name;

            if ((s->sh_flags & masks[m][0]) != masks[m][0]
                || (s->sh_flags & masks[m][1])
                || s->sh_entsize != ~0UL
                || !module_init_layout_section(sname))
                continue;
            /*
             * Offset within init_layout, tagged with INIT_OFFSET_MASK so
             * that move_module() can tell it apart from a core offset.
             */
            s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
                     | INIT_OFFSET_MASK);
            pr_debug("\t%s\n", sname);
        }
        switch (m) {
        case 0: /* executable */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            mod->init_layout.text_size = mod->init_layout.size;
            break;
        case 1: /* RO: text and ro-data */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            mod->init_layout.ro_size = mod->init_layout.size;
            break;
        case 2:
            /*
             * RO after init doesn't apply to init_layout (only
             * core_layout), so it just takes the value of ro_size.
             */
            mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
            break;
        case 4: /* whole init */
            mod->init_layout.size = debug_align(mod->init_layout.size);
            break;
        }
    }
}

copy sections

在move_module()里执行copy section的动作，mod->core_layout.size、mod->init_layout.size是在layout_sections等函数里累加确定的

在move_module()里调用module_alloc()，在module区分配虚拟地址空间，这样后面section就会被copy到module区。

在上述module_alloc分配的base地址的基础上加上section对应的offset（shdr->sh_entsize）就得到了这个section copy dest address，之后执行memcpy将这个section copy到module区。

所以对于kernel module，其.text，.data，.rodata等section都是位于module区间，它们所占用的内存是一起申请的。对于所有的module都是一样的，所以在module区间内，是按照一个个module layout的，一个module内部包含.text/.data/.rodata等section，如下示意图。

同时init_layout和core_layout都是在module区里申请的内存，只是init_layout里包含的section会在执行完一次后被free以释放空间；而core_layout则是一直在module区，直到rmmod才会从module区释放。

        -------------------------------------------------
                        |     .text      |       |
            moduleA     |     .data      |       |
                        |     .rodata    |       |
                        |     ...        |       |
        ----------------|----------------|  module region
                        |     .text      |       |
            moduleB     |     .data      |       |
                        |     .rodata    |       |
                        |     ...        |       |
        -------------------------------------------------

kernel/module.c

/*
 * move_module() - allocate the module's final memory and copy its
 * sections there.
 *
 * Allocates mod->core_layout.size bytes (and mod->init_layout.size bytes,
 * if non-zero) with module_alloc(), zeroes them, then copies every
 * SHF_ALLOC section to base + the offset stored in its sh_entsize, and
 * rewrites sh_addr to the new location.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; on failure
 * of the init allocation the core allocation is released first.
 */
static int move_module(struct module *mod, struct load_info *info)
{
    int i;
    void *ptr;

    /* Do the allocs. */
    ptr = module_alloc(mod->core_layout.size);
    /*
     * The pointer to this block is stored in the module structure
     * which is inside the block. Just mark it as not being a
     * leak.
     */
    /*
     * NOTE(review): this runs before the NULL check below; presumably
     * kmemleak_not_leak() tolerates a NULL pointer - confirm.
     */
    kmemleak_not_leak(ptr);
    if (!ptr)
        return -ENOMEM;

    memset(ptr, 0, mod->core_layout.size);
    mod->core_layout.base = ptr;

    /* The init region is only allocated when it has a non-zero size. */
    if (mod->init_layout.size) {
        ptr = module_alloc(mod->init_layout.size);
        /*
         * The pointer to this block is stored in the module structure
         * which is inside the block. This block doesn't need to be
         * scanned as it contains data and code that will be freed
         * after the module is initialized.
         */
        kmemleak_ignore(ptr);
        if (!ptr) {
            /* Undo the core allocation made above. */
            module_memfree(mod->core_layout.base);
            return -ENOMEM;
        }
        memset(ptr, 0, mod->init_layout.size);
        mod->init_layout.base = ptr;
    } else
        mod->init_layout.base = NULL;

    /* Transfer each section which specifies SHF_ALLOC */
    pr_debug("final section addresses:\n");
    for (i = 0; i < info->hdr->e_shnum; i++) {
        void *dest;
        Elf_Shdr *shdr = &info->sechdrs[i];

        if (!(shdr->sh_flags & SHF_ALLOC))
            continue;

        /*
         * sh_entsize holds the offset chosen by layout_sections();
         * INIT_OFFSET_MASK marks an offset into the init region.
         */
        if (shdr->sh_entsize & INIT_OFFSET_MASK)
            dest = mod->init_layout.base
                + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
        else
            dest = mod->core_layout.base + shdr->sh_entsize;

        /*
         * SHT_NOBITS sections are not copied; their destination keeps
         * the zeros written by the memset() above. For the others,
         * sh_addr is the copy source, so it must already hold the
         * section's address in the loaded file image (set elsewhere,
         * not shown here).
         */
        if (shdr->sh_type != SHT_NOBITS)
            memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
        /* Update sh_addr to point to copy in image. */
        shdr->sh_addr = (unsigned long)dest;
        pr_debug("\t0x%lx %s\n",
             (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
    }

    return 0;
}

来看下上述module_alloc是如何确保是在module虚拟地址空间里alloc的。

arch/arm64/kernel/module.c

/*
 * module_alloc() - allocate @size bytes of memory for a module (arm64).
 *
 * First tries the range [module_alloc_base, module_alloc_end), where
 * module_alloc_end is module_alloc_base + MODULES_VSIZE, or MODULES_END
 * when CONFIG_KASAN_GENERIC or CONFIG_KASAN_SW_TAGS is enabled. If that
 * fails and CONFIG_ARM64_MODULE_PLTS is enabled (and the KASAN condition
 * described in the comment below holds), retries in the wider range
 * [module_alloc_base, module_alloc_base + SZ_2G).
 *
 * Returns the allocated address, or NULL if allocation fails or
 * kasan_module_alloc() reports an error (the memory is vfree()d in that
 * case).
 */
void *module_alloc(unsigned long size)
{
    u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
    gfp_t gfp_mask = GFP_KERNEL;
    void *p;

    /* Silence the initial allocation */
    if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
        gfp_mask |= __GFP_NOWARN;

    if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
        IS_ENABLED(CONFIG_KASAN_SW_TAGS))
        /* don't exceed the static module region - see below */
        module_alloc_end = MODULES_END;

    /* First attempt: stay inside the module region computed above. */
    p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
                module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
                NUMA_NO_NODE, __builtin_return_address(0));

    /* Fallback attempt over a 2G window, without __GFP_NOWARN this time. */
    if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
        (IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
         (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
          !IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
        /*
         * KASAN without KASAN_VMALLOC can only deal with module
         * allocations being served from the reserved module region,
         * since the remainder of the vmalloc region is already
         * backed by zero shadow pages, and punching holes into it
         * is non-trivial. Since the module region is not randomized
         * when KASAN is enabled without KASAN_VMALLOC, it is even
         * less likely that the module region gets exhausted, so we
         * can simply omit this fallback in that case.
         */
        p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
                module_alloc_base + SZ_2G, GFP_KERNEL,
                PAGE_KERNEL, 0, NUMA_NO_NODE,
                __builtin_return_address(0));

    /* A negative return from kasan_module_alloc() fails the whole allocation. */
    if (p && (kasan_module_alloc(p, size) < 0)) {
        vfree(p);
        return NULL;
    }

    return p;
}

在这个函数里调用__vmalloc_node_range传的range的起始地址是module_alloc_base，看下这个变量是怎么赋值的，在没有开启kaslr的情况下它的值为_etext - MODULES_VSIZE，MODULES_VSIZE在ARM64平台上定义为128M

/*
 * Start of the range module_alloc() allocates from.
 * With CONFIG_RANDOMIZE_BASE it is a variable defined elsewhere; without
 * it, it is the fixed expression _etext - MODULES_VSIZE.
 */
#ifdef CONFIG_RANDOMIZE_BASE
extern u64 module_alloc_base;
#else
#define module_alloc_base    ((u64)_etext - MODULES_VSIZE)
#endif

init_layout什么时候被free

在load_module()的最后，会调用do_init_module()，这个函数将会调用module的init函数。在do_init_module()的后面将init_layout给free掉：

static noinline int do_init_module(struct module *mod)
{
    int ret = 0;
    struct mod_initfree *freeinit;

    freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
    if (!freeinit) {
        ret = -ENOMEM;
        goto fail;
    }
    freeinit->module_init = mod->init_layout.base;

    /*
     * We want to find out whether @mod uses async during init.  Clear
     * PF_USED_ASYNC.  async_schedule*() will set it.
     */
    current->flags &= ~PF_USED_ASYNC;

    do_mod_ctors(mod);
    /* Start the module */
    if (mod->init != NULL)
        ret = do_one_initcall(mod->init);
    if (ret < 0) {
        goto fail_free_freeinit;
    }
        ...
    mod->init_layout.base = NULL;
    mod->init_layout.size = 0;
    mod->init_layout.ro_size = 0;
    mod->init_layout.ro_after_init_size = 0;
    mod->init_layout.text_size = 0;
        ...
    call_rcu_sched(&freeinit->rcu, do_free_init);

/*
 * do_free_init() - RCU callback queued by do_init_module() via
 * call_rcu_sched().
 * @head: the rcu member embedded in a struct mod_initfree
 *
 * Frees the init-layout memory whose address was saved in
 * mod_initfree.module_init, then frees the mod_initfree itself.
 */
static void do_free_init(struct rcu_head *head)
{
    /* Recover the enclosing mod_initfree from its embedded rcu_head. */
    struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
    module_memfree(m->module_init);
    kfree(m);
}

module_init macro

这个macro的定义有两种情况，取决于MODULE这个宏有没有被define：编译进kernel（built-in）时，MODULE没有被define；build成ko时，MODULE是被define的。

1.当为built-in kernel时

module_init(fn)对应：

include/linux/init.h

/*
 * Define a static initcall_t variable named __initcall_<fn><id>, initialised
 * to fn and placed in the section whose name is the stringified __sec
 * immediately followed by ".init".
 */
#define ___define_initcall(fn, id, __sec) \
    static initcall_t __initcall_##fn##id __used \
        __attribute__((__section__(#__sec ".init"))) = fn;

/* Same, with .initcall<id> as the section prefix. */
#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id)
/* device_initcall() registers fn with id 6. */
#define device_initcall(fn)        __define_initcall(fn, 6)

对于module_init(hello_init)的展开如下：

    /*
     * Expansion of module_init(hello_init) for a built-in build.
     * The section name is #__sec ".init" with __sec = .initcall6,
     * i.e. ".initcall6.init" (note the leading dot).
     */
    static initcall_t __initcall_hello_init6 __used
        __attribute__((__section__(".initcall6.init"))) = hello_init;

initcall_t为一个函数指针类型：

/* Pointer to an init function: takes no arguments and returns int. */
typedef int (*initcall_t)(void);

所以是定义了一个类型为initcall_t的函数指针，并赋值为module init function hello_init()

并且这个函数指针变量是放在.initcall6.init section（section名由#__sec ".init"拼接而来，__sec为.initcall6，所以前面带有一个点）

2.当build成ko时

/*
 * module_init() as used when the code is built as a loadable module (ko).
 *
 * __inittest() returns initfn as an initcall_t, so an initfn of a different
 * type is reported at build time.
 * init_module() is declared as an alias of initfn, so both names refer to
 * the same function; __copy(initfn) copies initfn's attributes onto the
 * declaration.
 */
#define module_init(initfn)                    \
    static inline initcall_t __maybe_unused __inittest(void)        \
    { return initfn; }                    \
    int init_module(void) __copy(initfn) __attribute__((alias(#initfn)));

此时在这个macro里define了一个__inittest()函数，这个函数只是用来在编译时提示你写的module init函数类型是否为initcall_t，如果不同，则会报build warning，比如你写的module init函数是void (*)(void)或者int (*)(int)都会报build warning

下面的init_module，这里是给你的module init函数起了一个别名，方便在insmod ko时kernel调用。每一个module，在编译的时候会生成一个*.mod.c的文件，在这个文件里会define一个struct module结构体，在这个结构体里给init成员赋值的就是init_module，而init指针就是kernel会call的，其类型与initcall_t一样。

__copy是copy initfn的attribute，后面的__attribute__((alias(#initfn)))即是把init_module声明为initfn的别名，也就是说init_module和module init函数是同一个函数：

/*
 * The module's struct module instance, placed in the
 * .gnu.linkonce.this_module section.
 * .init is set to init_module; .exit is set to cleanup_module only when
 * CONFIG_MODULE_UNLOAD is enabled.
 */
__visible struct module __this_module
__attribute__((section(".gnu.linkonce.this_module"))) = {
        .name = KBUILD_MODNAME,
        .init = init_module,
#ifdef CONFIG_MODULE_UNLOAD
        .exit = cleanup_module,
#endif
        .arch = MODULE_ARCH_INIT,
};

上述__this_module结构体是位于.gnu.linkonce.this_module section，这个用readelf -SW可以查看ko里是有这个section的。

总结

加载ko的flow有两个copy，一个copy是将ko file完整地copy到kernel vmalloc区；然后解析此elf file，确定各个section相对于core_layout/init_layout base address的offset；然后根据这个offset将elf file里的section copy到module区；最后调用do_init_module()执行module init函数，之后将init_layout free。

posted @ 2021-11-07 22:35 aspirs 阅读(267) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

ko module加载flow