linux内存管理(二)- vmalloc
个人笔记,谨慎观看.
先看看vmalloc是怎么实现的。它能在非连续物理内存之上建立连续的虚拟内存映射。这里有一篇参考博客:Linux内存管理 (6)vmalloc - ArnoldLu - 博客园 (cnblogs.com)。
调用链:vmalloc -> __vmalloc_node -> __vmalloc_node_range
/*
 * Excerpt: node-aware vmalloc entry point. Forwards to
 * __vmalloc_node_range() with VMALLOC_START/VMALLOC_END as the address
 * range, PAGE_KERNEL as the protection and 0 as the extra vm_flags.
 */
void *__vmalloc_node(unsigned long size, unsigned long align,
		     gfp_t gfp_mask, int node, const void *caller)
{
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				    gfp_mask, PAGE_KERNEL, 0, node, caller);
}

/*
 * (Tail of the original kernel-doc comment; its beginning is not quoted
 * in this excerpt.)
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * Return: the address of the area or %NULL on failure
 *
 * NOTE: this is an abridged excerpt. The declarations of real_size,
 * size_per_node, shift, area and ret, as well as the "fail" label, are
 * elided.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			   unsigned long start, unsigned long end,
			   gfp_t gfp_mask, pgprot_t prot,
			   unsigned long vm_flags, int node,
			   const void *caller)
{
	/* Reject requests whose page count exceeds totalram_pages(). */
	if ((size >> PAGE_SHIFT) > totalram_pages()) {
		warn_alloc(gfp_mask, NULL,
			   "vmalloc error: size %lu, exceeds total pages",
			   real_size);
		return NULL;
	}

	/* With no specific node requested, divide the size by the online node count. */
	size_per_node = size;
	if (node == NUMA_NO_NODE)
		size_per_node /= num_online_nodes();

	/* Allocate and initialize a vm_struct. */
	area = __get_vm_area_node(real_size, align, shift,
				  VM_ALLOC | VM_UNINITIALIZED | vm_flags,
				  start, end, node, gfp_mask, caller);

	/* Allocate physical pages and map them into vmalloc space. */
	ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
	if (!ret)
		goto fail;

	return area->addr;
	...
}
totalram_pages()返回系统总可用内存页数(注意代码中它是以函数形式调用的,而不是直接访问全局变量)。__get_vm_area_node分配一个vm_struct并初始化,这个结构描述了要分配的vmalloc区域。
/*
 * Excerpt: allocate a vm_struct, obtain a vmap_area for the requested
 * size via alloc_vmap_area(), and tie the two together with
 * setup_vmalloc_vm(). Returns the vm_struct.
 *
 * NOTE: abridged excerpt. The declarations of area and va are elided,
 * and no allocation-failure handling is shown.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long shift, unsigned long flags,
		unsigned long start, unsigned long end, int node,
		gfp_t gfp_mask, const void *caller)
{
	BUG_ON(in_interrupt());

	/* Align size to the mapping granule (1 << shift). */
	size = ALIGN(size, 1ul << shift);

	/* Allocate a vm_struct. */
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);

	/* Unless VM_NO_GUARD is set, grow the request by one extra page. */
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	/* Allocate a vmap_area. */
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);

	/* Attach the vmap_area to the vm_struct. */
	setup_vmalloc_vm(area, va, flags, caller);

	return area;
}
这里涉及到俩结构体。vm_struct, vmap_area.
/*
 * Descriptor of one vmalloc area.
 */
struct vm_struct {
	struct vm_struct	*next;		/* presumably links to another vm_struct; usage not shown here */
	void			*addr;		/* start address of the area; returned to the vmalloc caller */
	unsigned long		size;
	unsigned long		flags;
	struct page		**pages;	/* array of page pointers backing the area */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	unsigned int		page_order;
#endif
	unsigned int		nr_pages;	/* number of pages stored in pages[] */
	phys_addr_t		phys_addr;
	const void		*caller;
};
描述vmalloc区域。
/*
 * Describes the virtual address range [va_start, va_end) style bounds of
 * an area (exact inclusivity of va_end is not shown here - TODO confirm)
 * and links it into an address-sorted rbtree and an address-sorted list.
 */
struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;

	struct rb_node rb_node;         /* address sorted rbtree */
	struct list_head list;          /* address sorted list */

	/*
	 * The following two variables can be packed, because
	 * a vmap_area object can be either:
	 *    1) in "free" tree (root is free_vmap_area_root)
	 *    2) or "busy" tree (root is vmap_area_root)
	 */
	union {
		unsigned long subtree_max_size; /* in "free" tree */
		struct vm_struct *vm;           /* in "busy" tree */
	};
	unsigned long flags; /* mark type of vm_map_ram area */
};
也用来描述vmalloc的那个区域,主要描述区域的范围,并且链接到一个全局rbtree上。alloc_vmap_area会找到当前地址最低的一个空闲区域。
__vmalloc_area_node是核心函数,分配物理内存,建立映射。
/*
 * Excerpt: core of vmalloc. Allocates the page-pointer array for @area,
 * allocates the backing pages, and maps them at area->addr.
 *
 * NOTE(review): abridged excerpt. As quoted, neither area->pages nor the
 * result of vm_area_alloc_pages() is checked, and area->addr is returned
 * regardless of the final value of ret; the failure paths are not shown.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, unsigned int page_shift,
				 int node)
{
	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
	bool nofail = gfp_mask & __GFP_NOFAIL;
	unsigned long addr = (unsigned long)area->addr;
	unsigned long size = get_vm_area_size(area);
	unsigned long array_size;
	unsigned int nr_small_pages = size >> PAGE_SHIFT;
	unsigned int page_order;
	unsigned int flags;
	int ret;

	/* Bytes needed to hold one struct page pointer per small page. */
	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

	/*
	 * The memory for area->pages (the array of page pointers) may itself
	 * be obtained through a recursive vmalloc call.
	 */
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
					     area->caller);
	} else {
		area->pages = kmalloc_node(array_size, nested_gfp, node);
	}

	/*
	 * Note author's remark: when CONFIG_HAVE_ARCH_HUGE_VMALLOC is not
	 * enabled the order is 0.
	 */
	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
	page_order = vm_area_page_order(area);

	/*
	 * Allocate the backing pages. Because physically discontiguous pages
	 * are wanted, in most cases pages are obtained one at a time, which
	 * yields nr_pages pages that need not be contiguous.
	 */
	area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
		node, page_order, nr_small_pages, area->pages);

	/*
	 * nr_vmalloc_pages is presumably a global counter of the total number
	 * of pages allocated through vmalloc - TODO confirm.
	 */
	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);

	do {
		/*
		 * Pages are allocated; now establish the mapping. When
		 * __GFP_NOFAIL is set, keep retrying until it succeeds.
		 */
		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
				       page_shift);
		if (nofail && (ret < 0))
			schedule_timeout_uninterruptible(1);
	} while (nofail && (ret < 0));

	return area->addr;
}
/*
 * Map @pages into the virtual range [addr, end) and then call
 * flush_cache_vmap() on that range. Returns the result of the mapping
 * step.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
			    pgprot_t prot, struct page **pages,
			    unsigned int page_shift)
{
	int err;

	/* Map the scattered physical pages into the contiguous virtual range. */
	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);

	/* Mapping done; flush the cache for the range (done even if err is set). */
	flush_cache_vmap(addr, end);
	return err;
}
/*
 * Run the KMSAN hook for the range first; if it returns non-zero, return
 * that value. Otherwise delegate to __vmap_pages_range_noflush() and
 * return its result.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
			     pgprot_t prot, struct page **pages,
			     unsigned int page_shift)
{
	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
						 page_shift);

	if (ret)
		return ret;
	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}
忽略kmsan相关的操作,直接看看__vmap_pages_range_noflush.
/*
 * Excerpt: walk the page array and map each entry into the virtual range
 * starting at @addr. Returns 0 on success or the first non-zero error
 * from vmap_range_noflush(). Part of the body is elided ("...").
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
			       pgprot_t prot, struct page **pages,
			       unsigned int page_shift)
{
	/* nr counts PAGE_SIZE pages in the range. */
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
	...
	/* Step through pages[] in strides of one mapping unit (1 << page_shift bytes). */
	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		/* Map only one page (of size 1 << page_shift) per iteration. */
		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					 page_to_phys(pages[i]), prot,
					 page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}
/*
 * Excerpt: map the physical range starting at @phys_addr into the virtual
 * range [addr, end) by walking the kernel page-table top level and calling
 * vmap_p4d_range() for each pgd-sized chunk. Returns the last err value.
 *
 * NOTE: abridged excerpt. Local declarations (pgd, start, next, err, mask)
 * and the code after the loop are elided.
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
			      phys_addr_t phys_addr, pgprot_t prot,
			      unsigned int max_page_shift)
{
	。。。
	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		/* Finally, the familiar page-table construction logic. */
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
				     max_page_shift, &mask);
		if (err)
			break;
		/* Advance the pgd entry, the physical address and addr together. */
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
	。。。
	return err;
}
从代码可以看到,vmalloc分配的页面是虚拟地址连续而物理页面不连续的,分配逻辑复杂,只能按page分配;因此相对于kmalloc(可以分配物理连续的页面和小块内存),vmalloc比较耗时,只适合较大内存的分配。