linux内存管理(四)- 用户空间的内存分配在kernel中的实现
malloc是常用的用户态分配内存的接口,它会调用brk系统调用来向内核申请内存(较大的分配请求则会改用mmap,后文会分析)。下面看看该系统调用的实现。
插一句,每次调用malloc的时候未必都会调用brk去从kernel分配实际的内存,因为每次系统调用都是有开销的,为了避免频繁的陷入内核,malloc会多申请一部分内存当作内存池,之后要申请内存会首先在这个自己维护的内存池中获取,这样会大大减少系统调用的开销。
下面看看brk的实现。
SYSCALL_DEFINE1(brk, unsigned long, brk) { origbrk = mm->brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (current->brk_randomized) min_brk = mm->start_brk; else min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ //可以减小堆 if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vma_munmap() will drop the lock on success, so update it * before calling do_vma_munmap(). */ mm->brk = brk; if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true)) goto out; goto success_unlocked; } /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; }
在mm_struct结构中与brk相关的成员有start_brk和brk:start_brk表示堆的起始地址,brk代表堆当前的边界。brk系统调用的入参是请求更改后的brk边界。如果请求的brk按页对齐后与当前brk按页对齐后的值相同(也就是落在同一个page内),那就只更新mm->brk然后直接返回。如果请求的brk小于等于当前的brk,说明需要收缩堆空间:查找已有的对应的vma,使用do_vma_munmap释放newbrk到oldbrk之间的这部分堆空间。最后一种情况是请求的brk大于当前的brk,需要扩大堆:先检查扩展区域加上stack_guard_gap之后不会与后面的vma冲突,然后调用do_brk_flags增大已有的vma或者新建一个vma。
看一下do_brk_flags的实现。
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vma_prepare vp; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { vma_iter_config(vmi, vma->vm_start, addr + len); if (vma_iter_prealloc(vmi, vma)) goto unacct_fail; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); vma->vm_end = addr + len; vm_flags_set(vma, VM_SOFTDIRTY); vma_iter_store(vmi, vma); vma_complete(&vp, vmi, mm); khugepaged_enter_vma(vma, flags); goto out; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = addr >> PAGE_SHIFT; vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; }
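总觉得这个实现有点奇怪。我的理解是它的入参vma是数据段的vma,那肯定跟要增长的区间不挨着,那一定要新建一个vma,而且也没将新的vma和旧的堆的vma合并。这不就是说每次堆增长都是新建一个vma吗?而且设置flag也没设置到pte这一层,那访问刚分配的内存难道不会data abort吗?(补充:从brk的代码看,传入的brkvma是用vma_prev_limit(&vmi, mm->start_brk)在oldbrk之前查到的vma,应该就是已有的堆vma而不是数据段的vma;只要vma->vm_end == addr且can_vma_merge_after通过,do_brk_flags的第一个分支就会直接把vm_end扩大到addr + len,只有无法扩展时才新建vma。至于pte,这里确实没有建立,首次访问会触发缺页异常,由内核在page fault处理中分配物理页并建立映射,后文分析mm_populate时会看到这条路径。)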
分析一下mmap系统调用。
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
mmap是常用的系统调用,常常用来分配内存,读写文件,进程间通信。本文只分析分配内存有关的部分。
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); }
系统调用会直接调用ksys_mmap_pgoff。
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { ... } else if (flags & MAP_HUGETLB) { ... } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); ... return retval; }
忽略文件映射和巨页,匿名页就是调用vm_mmap_pgoff.
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; }
核心函数是do_mmap。
unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; addr = get_unmapped_area(file, addr, len, pgoff, flags); if (file) { ... } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } ... addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; }
分配匿名页最主要的就是找到一块满足要求的区域,由get_unmapped_area完成。
unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { pgoff = 0; get_area = shmem_get_unmapped_area; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { /* Ensures that larger anonymous mappings are THP aligned. */ get_area = thp_get_unmapped_area; } addr = get_area(file, addr, len, pgoff, flags); return error ? error : addr; }
get_unmapped_area把任务交给对应的回调函数来完成。这里只关注默认的get_unmapped_area,它是mm_struct结构中的一个成员,在进程初始化的时候设置。
这是一个回调函数,通过ftrace查看调用链得到实际调用函数。
<...>-4069 [002] ...1. 4361.648639: do_mmap <-vm_mmap_pgoff <...>-4069 [002] ...1. 4361.648639: get_unmapped_area <-do_mmap <...>-4069 [002] ...1. 4361.648639: arch_get_unmapped_area_topdown <-get_unmapped_area <...>-4069 [002] ...1. 4361.648639: get_mmap_base <-arch_get_unmapped_area_topdown <...>-4069 [002] ...1. 4361.648639: get_align_mask <-arch_get_unmapped_area_topdown <...>-4069 [002] ...1. 4361.648639: get_align_mask <-arch_get_unmapped_area_topdown <...>-4069 [002] ...1. 4361.648639: vm_unmapped_area <-arch_get_unmapped_area_topdown <...>-4069 [002] ...1. 4361.648640: security_mmap_addr <-get_unmapped_area <...>-4069 [002] ...1. 4361.648640: cap_mmap_addr <-security_mmap_addr <...>-4069 [002] ...1. 4361.648640: path_noexec <-do_mmap <...>-4069 [002] ...1. 4361.648640: mmap_region <-do_mmap <...>-4069 [002] ...1. 4361.648640: may_expand_vm <-mmap_region <...>-4069 [002] ...1. 4361.648641: do_vmi_munmap <-mmap_region <...>-4069 [002] ...1. 4361.648641: is_file_shm_hugepages <-mmap_region <...>-4069 [002] ...1. 4361.648641: can_vma_merge_before <-mmap_region <...>-4069 [002] ...1. 4361.648641: can_vma_merge_after <-mmap_region <...>-4069 [002] ...1. 4361.648641: vm_area_alloc <-mmap_region <...>-4069 [002] ...1. 4361.648641: kmem_cache_alloc <-vm_area_alloc
get_unmapped_area默认回调函数是arch_get_unmapped_area_topdown。
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); } /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { ... } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; }
设置info,带着这些信息让vm_unmapped_area去寻找合适的addr。这里的信息比较重要的有设置VM_UNMAPPED_AREA_TOPDOWN,这个参数表明要以自上而下的方向寻找空闲的空间,low_limit和high_limit设置一个查找区间。这里low_limit设置成page size,high_limit一般是mm->mmap_base。
/** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{ ... retry: if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = mas.last; ... return gap; }
最终mas_empty_area_rev会找出这个范围的last address,通过+1 - len得到这个范围的起始地址。
回到do_mmap,得到这个尚未map的区域后还要把它加入到vma的管理中,这由mmap_region完成。
unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct vm_area_struct *next, *prev, *merge; pgoff_t pglen = len >> PAGE_SHIFT; unsigned long charged = 0; unsigned long end = addr + len; unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } /* Unmap any existing mapping in the area */ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; } next = vma_next(&vmi); prev = vma_prev(&vmi); if (vm_flags & VM_SPECIAL) { if (prev) vma_iter_next_range(&vmi); goto cannot_expand; } /* Attempt to expand an old mapping */ /* Check next */ if (next && next->vm_start == end && !vma_policy(next) && can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, NULL_VM_UFFD_CTX, NULL)) { merge_end = next->vm_end; vma = next; vm_pgoff = next->vm_pgoff - pglen; } /* Check prev */ if (prev && prev->vm_end == addr && !vma_policy(prev) && (vma ? 
can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, pgoff, vma->vm_userfaultfd_ctx, NULL) : can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, NULL_VM_UFFD_CTX, NULL))) { merge_start = prev->vm_start; vma = prev; vm_pgoff = prev->vm_pgoff; } else if (prev) { vma_iter_next_range(&vmi); } /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } if (vma == prev) vma_iter_set(&vmi, addr); cannot_expand: /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } vma_iter_config(&vmi, addr, end); vma->vm_start = addr; vma->vm_end = end; vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; if (file) { vma->vm_file = get_file(file); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; if (vma_is_shared_maywrite(vma)) { error = mapping_map_writable(file->f_mapping); if (error) goto close_and_free_vma; writable_file_mapping = true; } /* * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. */ error = -EINVAL; if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; vma_iter_config(&vmi, addr, end); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge_new_vma(&vmi, prev, vma, vma->vm_start, vma->vm_end, vma->vm_pgoff); if (merge) { /* * ->mmap() can change vma->vm_file and fput * the original file. So fput the vma->vm_file * here or we would add an extra fput for file * and cause general protection fault * ultimately. */ fput(vma->vm_file); vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. 
*/ vm_flags = vma->vm_flags; goto unmap_writable; } } vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } else { vma_set_anonymous(vma); } if (map_deny_write_exec(vma, vma->vm_flags)) { error = -EACCES; goto close_and_free_vma; } /* Allow architectures to sanity-check the vm_flags */ error = -EINVAL; if (!arch_validate_flags(vma->vm_flags)) goto close_and_free_vma; error = -ENOMEM; if (vma_iter_prealloc(&vmi, vma)) goto close_and_free_vma; /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); if (vma_is_shared_maywrite(vma)) mapping_allow_writable(vma->vm_file->f_mapping); flush_dcache_mmap_lock(vma->vm_file->f_mapping); vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); flush_dcache_mmap_unlock(vma->vm_file->f_mapping); i_mmap_unlock_write(vma->vm_file->f_mapping); } /* * vma_merge() calls khugepaged_enter_vma() either, the below * call covers the non-merge case. */ khugepaged_enter_vma(vma, vma->vm_flags); /* Once vma denies write, undo our temporary denial count */ unmap_writable: if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); expanded:
//将mmap事件记录到perf perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } if (file)
//如果这个文件需要被uprobe hook,替换文件hook点的指令为breakpoint指令 uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); validate_mm(mm); return addr; close_and_free_vma: if (file && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (file || vma->vm_file) { unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); validate_mm(mm); return error; }
mmap_region函数很长,做的事情不是太复杂。先尝试找到相邻的vma,看看能不能通过expand把这块新的区域合并进去,如果不能合并就新建一个vma。
至此mmap分析完了。等等,还有mm_populate需要分析一下。
static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ }
一般内核分配内存的时候只是把vma建好,其实物理内存还没有分配,但是如果使用了mlock()或者在分配内存时使用MAP_POPULATE或MAP_LOCKED标志,就会调用mm_populate主动分配物理页。populate意思是填充。mm_populate的入参是要填充页表的虚拟地址范围。它直接调用__mm_populate。后者会以入参为范围查找vma,找到后调用populate_vma_page_range。
/** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { ... /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); ... return ret; }
populate_vma_page_range最终调用__get_user_pages。
static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; ... do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; ... retry: ... cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); ... } else if (PTR_ERR(page) == -EEXIST) { .. } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { ... } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; }
调用follow_page_mask查找页表,如果页表没有完全建好,证明还没有分配物理页,此时会返回NULL,之后调用faultin_page手动触发page fault来分配内存。
static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; ... pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
// 如果没有页表不存在返回NULL return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); }
第一次分配内存肯定没建好页表,必然在遍历页表的某个环节返回NULL,下面看看faultin_page。
static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; ... ret = handle_mm_fault(vma, address, fault_flags, NULL); .. return 0; }
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; ... lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); lru_gen_exit_fault(); ... return ret; }
__handle_mm_fault是处理page fault的核心函数。
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); //分配p4d页表 p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; //分配pud页表 vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; retry_pud: ... // 分配pmd vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; ... return handle_pte_fault(&vmf); }
分配p4d,pud,pmd的页表并填充,最后调用handle_pte_fault解决pte fault。
static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { ... } else { /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. */
//从pmd中找到对应的pte的entry vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
//即便pte条目存在pte也只是个空的 if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte)
//pte还没有建,去分配内存然后建好pte return do_pte_missing(vmf); if (!pte_present(vmf->orig_pte))
//pte已经建好了,只是现在物理页不在内存中,肯定是被swap出去了,找回来 return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!pte_write(entry)) return do_wp_page(vmf); else if (likely(vmf->flags & FAULT_FLAG_WRITE)) entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); ... unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; }
handle_pte_fault会有多种情况要去处理。这些情况都要从pte中去获取,先获取pte,如果pte是0那说明从来没有给这段地址空间分配过内存,现在要做的就是去分配内存。我们现在就属于这种情况,使用do_pte_missing去获取内存。如果pte非0,但是不在内存中,也就是!pte_present,那说明页面已经分配过,但是现在被swap出去了,使用do_swap_page解决。本次我们只关心第一次分配内存的情况。
static vm_fault_t do_pte_missing(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); }
如果是匿名页do_pte_missing会调用do_anonymous_page。我们只看这种情况。
static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; int i; ...
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
//只读?分配个零页吧,这是个特殊映射,mkspecial
entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (!vmf->pte)
goto unlock;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
/* Allocate our own private page. */
//准备一下反向映射 if (unlikely(anon_vma_prepare(vma))) goto oom; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ //分配内存 folio = alloc_anon_folio(vmf); nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); folio_throttle_swaprate(folio, GFP_KERNEL); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); //制作一个pte entry entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { for (i = 0; i < nr_pages; i++) update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); goto release; } ...
//曾加page _refcount folio_ref_add(folio, nr_pages - 1);
//刚刚分配了内存,加入mm->rss_stat统计 add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
//对新分配的页面加入反向映射 folio_add_new_anon_rmap(folio, vma, addr);
//将新分配的folio加入lru链表 folio_add_lru_vma(folio, vma); setpte: if (uffd_wp) entry = pte_mkuffd_wp(entry);
//设置pte set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; ... }
alloc_anon_folio会去分配内存,返回一个folio结构,这个是page的封装,之后制作一个pte并设置到页表中。
至此mmap系统调用分配匿名页这一块简单的过了一遍,非常简略。