linux内存管理(二)- vmalloc

个人笔记,谨慎观看.

先看看vmalloc是怎么实现的。它能在非连续的物理内存之上建立连续的虚拟内存映射。可以参考这篇博客:《Linux内存管理 (6)vmalloc》- ArnoldLu - 博客园 (cnblogs.com)

调用链:vmalloc -> __vmalloc_node -> __vmalloc_node_range

/*
 * Thin wrapper around __vmalloc_node_range(): forwards the request using
 * VMALLOC_START / VMALLOC_END as the start/end bounds, PAGE_KERNEL as the
 * page protection and no additional vm flags.
 *
 * Returns whatever __vmalloc_node_range() returns.
 */
void *__vmalloc_node(unsigned long size, unsigned long align,
                gfp_t gfp_mask, int node, const void *caller)
{
    /* The wrapper passes no extra VM_* flags down to the range allocator. */
    const unsigned long no_vm_flags = 0;
    void *addr;

    addr = __vmalloc_node_range(size, align,
                VMALLOC_START, VMALLOC_END,
                gfp_mask, PAGE_KERNEL, no_vm_flags,
                node, caller);
    return addr;
}

/*
 * ... (beginning of the kernel-doc comment omitted in this excerpt)
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * Return: the address of the area or %NULL on failure
 */
/*
 * Abridged excerpt: local declarations (area, ret, real_size, shift,
 * size_per_node) and the "fail" path are not shown.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
            unsigned long start, unsigned long end, gfp_t gfp_mask,
            pgprot_t prot, unsigned long vm_flags, int node,
            const void *caller)
{
    /* Refuse requests that need more pages than totalram_pages() reports. */
    if ((size >> PAGE_SHIFT) > totalram_pages()) {
        warn_alloc(gfp_mask, NULL,
            "vmalloc error: size %lu, exceeds total pages",
            real_size);
        return NULL;
    }

        /*
         * NOTE(review): the extra indentation suggests these lines were
         * copied from inside an enclosing block that the excerpt omits;
         * confirm against mm/vmalloc.c.
         * With no specific node requested, the size is split evenly across
         * the online nodes.
         */
        size_per_node = size;
        if (node == NUMA_NO_NODE)
            size_per_node /= num_online_nodes();

    /* Allocate and initialise the vm_struct that describes the area. */
    area = __get_vm_area_node(real_size, align, shift,
                  VM_ALLOC | VM_UNINITIALIZED | vm_flags,
                  start, end, node, gfp_mask, caller);

    /* Allocate physical pages and map them into vmalloc space. */
    ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
    if (!ret)
        goto fail;

    return area->addr;
    ...
}

totalram_pages()返回系统总的可用内存页数。__get_vm_area_node分配一个vm_struct并初始化,这个结构描述了要分配的vmalloc区域。

/*
 * Allocate a vm_struct, reserve a vmap_area for it between start and end,
 * bind the two together with setup_vmalloc_vm() and return the vm_struct.
 *
 * Abridged excerpt: the declarations of area and va are not shown, and the
 * results of kzalloc_node() and alloc_vmap_area() are not checked in the
 * lines quoted here.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long shift, unsigned long flags,
        unsigned long start, unsigned long end, int node,
        gfp_t gfp_mask, const void *caller)
{
    /* Must not be called from interrupt context. */
    BUG_ON(in_interrupt());
    size = ALIGN(size, 1ul << shift);// round size up to a multiple of (1 << shift)
        // allocate a zeroed vm_struct on the requested node
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    /* Grow the range by one page unless VM_NO_GUARD is set (presumably a guard page). */
    if (!(flags & VM_NO_GUARD))
        size += PAGE_SIZE;
        // reserve a vmap_area covering size bytes within [start, end]
    va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
        // attach the vmap_area to the vm_struct
    setup_vmalloc_vm(area, va, flags, caller);
    return area;
}

这里涉及到两个结构体:vm_struct和vmap_area。

/*
 * Descriptor of one vmalloc area. In this article it is allocated by
 * __get_vm_area_node() and filled in by __vmalloc_area_node().
 */
struct vm_struct {
    struct vm_struct    *next;
    void            *addr;          /* address handed back to the vmalloc caller */
    unsigned long        size;
    unsigned long        flags;
    struct page        **pages;     /* array of page pointers backing the area */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
    unsigned int        page_order; /* only present with huge-vmalloc support */
#endif
    unsigned int        nr_pages;   /* count returned by vm_area_alloc_pages() */
    phys_addr_t        phys_addr;
    const void        *caller;      /* caller cookie, passed on to nested allocations */
};

描述vmalloc区域。

/*
 * Describes the virtual address range of one area: va_start / va_end bound
 * the range, and the node is linked into an address-sorted rbtree and list.
 */
struct vmap_area {
    unsigned long va_start;
    unsigned long va_end;

    struct rb_node rb_node;         /* address sorted rbtree */
    struct list_head list;          /* address sorted list */

    /*
     * The following two variables can be packed, because
     * a vmap_area object can be either:
     *    1) in "free" tree (root is free_vmap_area_root)
     *    2) or "busy" tree (root is vmap_area_root)
     */
    union {
        unsigned long subtree_max_size; /* in "free" tree */
        struct vm_struct *vm;           /* in "busy" tree */
    };
    unsigned long flags; /* mark type of vm_map_ram area */
};

也用来描述vmalloc的那个区域,主要描述区域的范围,并且链接到一个全局rbtree上。alloc_vmap_area会找到当前地址最低的一个空闲区域。

__vmalloc_area_node是核心函数,分配物理内存,建立映射。

/*
 * Core of the allocation: allocate the page-pointer array, allocate the
 * backing pages, then map them into the virtual range starting at
 * area->addr. Returns area->addr.
 *
 * Abridged excerpt: failure handling for the individual steps is not shown
 * in the lines quoted here.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, unsigned int page_shift,
                 int node)
{
    const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
    bool nofail = gfp_mask & __GFP_NOFAIL;
    unsigned long addr = (unsigned long)area->addr;
    unsigned long size = get_vm_area_size(area);
    unsigned long array_size;
    unsigned int nr_small_pages = size >> PAGE_SHIFT;
    unsigned int page_order;
    unsigned int flags;
    int ret;

    // Bytes needed to hold one struct page pointer per PAGE_SIZE page.
    array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

    // area->pages holds the page-pointer array; when the array itself is
    // larger than a page it is obtained through a nested __vmalloc_node() call.
    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {
        area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
                    area->caller);
    } else {
        area->pages = kmalloc_node(array_size, nested_gfp, node);
    }

    // Article author's note: without CONFIG_HAVE_ARCH_HUGE_VMALLOC the order is 0.
    set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
    page_order = vm_area_page_order(area);

    // Allocate the backing pages into area->pages and record how many were
    // obtained. Article author's note: physically contiguous memory is not
    // required, so in most cases pages are taken one at a time.
    area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
        node, page_order, nr_small_pages, area->pages);

    // Presumably nr_vmalloc_pages is the global count of pages allocated by vmalloc.
    atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

    do {
        // Pages are allocated; now map them. With __GFP_NOFAIL the mapping is
        // retried until it succeeds, calling
        // schedule_timeout_uninterruptible(1) between attempts.
        ret = vmap_pages_range(addr, addr + size, prot, area->pages,
            page_shift);
        if (nofail && (ret < 0))
            schedule_timeout_uninterruptible(1);
    } while (nofail && (ret < 0));

    return area->addr;
}
/*
 * Map the given pages into the virtual range addr..end and then flush the
 * cache for that range.
 *
 * @addr, @end:  bounds of the virtual range to populate
 * @prot:        page protection passed to the mapping helper
 * @pages:       array of pages to map
 * @page_shift:  mapping granularity, forwarded unchanged
 *
 * Returns the result of vmap_pages_range_noflush(). The flush is issued
 * whether or not the mapping succeeded.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    int err;

    /* Map the scattered physical pages into the contiguous virtual range. */
    err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
    /* Mapping attempted; flush the cache for the range. */
    flush_cache_vmap(addr, end);
    return err;
}
/*
 * Map pages into addr..end without flushing the cache.
 *
 * The KMSAN hook runs first; a non-zero result from it is returned
 * immediately and the real mapping is skipped. Otherwise the result of
 * __vmap_pages_range_noflush() is returned.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    int err;

    err = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
    if (err != 0)
        return err;

    err = __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
    return err;
}

忽略kmsan相关的操作,直接看看__vmap_pages_range_noflush.

/*
 * Map the pages array into addr..end in steps of (1 << page_shift) bytes.
 * Returns 0 on success or the first non-zero error from vmap_range_noflush().
 *
 * Abridged excerpt: the "..." line marks code omitted by the article.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
        pgprot_t prot, struct page **pages, unsigned int page_shift)
{
    /* nr counts PAGE_SIZE pages in the range, i.e. entries in pages[]. */
    unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
...
    /* Each step consumes 1 << (page_shift - PAGE_SHIFT) entries of pages[]. */
    for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
        int err;

        // Map one (1 << page_shift)-byte chunk per iteration, using the
        // physical address of pages[i].
        err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                    page_to_phys(pages[i]), prot,
                    page_shift);
        if (err)
            return err;

        addr += 1UL << page_shift;
    }

    return 0;
}
/*
 * Map the physical range starting at phys_addr onto the virtual range
 * addr..end by walking the kernel page tables from the PGD level, handing
 * each step to vmap_p4d_range(). Stops at the first error and returns it.
 *
 * Abridged excerpt: the "。。。" lines mark code omitted by the article,
 * including the declarations of the locals used below.
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
            phys_addr_t phys_addr, pgprot_t prot,
            unsigned int max_page_shift)
{
。。。
    start = addr;
    pgd = pgd_offset_k(addr);
    do {
        next = pgd_addr_end(addr, end);
        // The familiar page-table construction logic starts here.
        err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
                    max_page_shift, &mask);
        if (err)
            break;
        /* Advance the PGD entry, the physical address and the virtual address together. */
    } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
。。。

    return err;
}

从代码可以看到,vmalloc分配的内存虚拟地址连续,而物理页面不要求连续。它的分配逻辑比较复杂,并且只能按page分配。相对于可以分配连续物理页面和小块内存的kmalloc,vmalloc比较耗时,因此只适用于较大内存的分配。

posted @ 2024-06-11 10:45  半山随笔  阅读(14)  评论(0编辑  收藏  举报