linux内存管理(五)- 缺页处理
分析一下缺页的处理。缺页的意思是在访问内存的时候该地址还没有建好页表,页面尚未分配,或者页面被swap出去或者没有权限。缺页是同步异常,用户态发生缺页异常会等待内核解决,当然这一切对于用户态都是透明的。缺页处理的核心函数是do_page_fault,这个函数是架构相关的所以这个函数分布在各个架构相关的代码中。我们以arm64为例。
static int __kprobes do_page_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf; struct mm_struct *mm = current->mm; vm_fault_t fault; unsigned long vm_flags; unsigned int mm_flags = FAULT_FLAG_DEFAULT; unsigned long addr = untagged_addr(far); struct vm_area_struct *vma;
//kprobe还会处理page fault? if (kprobe_page_fault(regs, esr)) return 0; /* * If we're in an interrupt or have no user context, we must not take * the fault. */
//task_struct里面有一个pagefault_disabled成员用来作禁止pagefault的标志 if (faulthandler_disabled() || !mm) goto no_context; //查看pstate错误是否用户态 if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; /* * vm_flags tells us what bits we must have in vma->vm_flags * for the fault to be benign, __do_page_fault() would check * vma->vm_flags & vm_flags and returns an error if the * intersection is empty */
//判断错误类型 if (is_el0_instruction_abort(esr)) { /* It was exec fault */ vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; } else if (is_write_abort(esr)) { /* It was write fault */ vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } else { /* It was read fault */ vm_flags = VM_READ; /* Write implies read */ vm_flags |= VM_WRITE; /* If EPAN is absent then exec implies read */ if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN)) vm_flags |= VM_EXEC; } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
//在kernel态执行用户态指令出错 if (is_el1_instruction_abort(esr)) die_kernel_fault("execution of user memory", addr, esr, regs); if (!search_exception_tables(regs->pc)) die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs); }
//perf软件事件?有空看看 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; //获取vma vma = lock_vma_under_rcu(mm, addr); if (!vma) goto lock_mmap; //出错类型跟vma的权限一定得对的上,不然就有问题 if (!(vma->vm_flags & vm_flags)) { vma_end_read(vma); goto lock_mmap; }
//上次刚分析了一下,分配页面,如果一切顺利,返回0 fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); //不需要重试就goto done if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; return 0; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, addr, regs); if (unlikely(!vma)) { fault = VM_FAULT_BADMAP; goto done; } //还是调用handle_mm_fault fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; return 0; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return 0; if (fault & VM_FAULT_RETRY) { mm_flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: /* * Handle the "normal" (no error) case first. */
//正常情况这里就返回了 if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; /* * If we are in kernel mode at this point, we have no context to * handle this fault with. */ if (!user_mode(regs)) goto no_context; if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to * userspace (which will retry the fault, or kill us if we got * oom-killed). */
//没内存了,咋整 pagefault_out_of_memory(); return 0; } inf = esr_to_fault_info(esr);
//task_struct里面有一个thread成员保存进程上下文,里面有标志出错的成员,fault_address和fault_code,设置他们 set_thread_esr(addr, esr); if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to successfully fix up * this page fault. */ arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; lsb = PAGE_SHIFT; if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { /* * Something tried to access memory that isn't in our memory * map. */ arm64_force_sig_fault(SIGSEGV, fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR, far, inf->name); } return 0; no_context:
//kernel一般是不会有page fault的,大概率是bug,发一个Oops然后杀掉进程算了 __do_kernel_fault(addr, esr, regs); return 0; }
kernel一般只处理用户进程发生的page fault,如果发生在kernel态可能是个bug。do_page_fault会调用handle_mm_fault去处理。这个函数在之前已经分析过了,这里就省略,不过为了完整性,之后会补上。