linux内存管理（五）- 缺页处理

本文分析缺页（page fault）的处理。缺页是指访问某个虚拟地址时，该地址的页表尚未建立、物理页面尚未分配、页面已被swap出去，或者访问权限不足。缺页是同步异常：用户态发生缺页异常后会等待内核处理完成，而这一切对用户态是透明的。缺页处理的核心函数是do_page_fault，它与架构相关，因此各个架构的代码中都有自己的实现。下面以arm64为例。

/*
 * do_page_fault() - arm64 page fault handler.
 *
 * @far:  faulting address as reported to the handler. Lookups use
 *        untagged_addr(far); the raw value is what gets passed to the
 *        signal helpers.
 * @esr:  syndrome value, used to classify the fault as exec / write / read
 *        and to select the fault_info entry for signal delivery.
 * @regs: saved register state at the time of the fault.
 *
 * Overall flow:
 *   1. Bail out early if kprobes claims the fault, or if faults must not be
 *      taken in the current context (goto no_context).
 *   2. Derive the required VMA permissions (vm_flags) and the fault flags
 *      (mm_flags) from esr.
 *   3. For user-mode faults, first try to handle the fault under a per-VMA
 *      lock (lock_vma_under_rcu() + FAULT_FLAG_VMA_LOCK).
 *   4. Otherwise, or if that attempt must be retried, fall back to the
 *      lock_mm_and_find_vma() path and loop while VM_FAULT_RETRY is set.
 *   5. On failure: kernel-mode faults go to __do_kernel_fault(); user-mode
 *      faults trigger OOM handling or a SIGBUS / SIGSEGV style signal.
 *
 * Return: 0 on every return path visible in this function.
 */
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
                   struct pt_regs *regs)
{
    const struct fault_info *inf;
    struct mm_struct *mm = current->mm;
    vm_fault_t fault;
    unsigned long vm_flags;
    unsigned int mm_flags = FAULT_FLAG_DEFAULT;
    unsigned long addr = untagged_addr(far);
    struct vm_area_struct *vma;

    /*
     * Let kprobes look at the fault first; a non-zero result ends the
     * handling here. NOTE(review): presumably this covers faults raised
     * while a kprobe handler is running - confirm against
     * kprobe_page_fault().
     */
    if (kprobe_page_fault(regs, esr))
        return 0;

    /*
     * If we're in an interrupt or have no user context, we must not take
     * the fault.
     */
    /*
     * NOTE(review): faulthandler_disabled() presumably consults the per-task
     * pagefault_disabled state in task_struct - not visible here, confirm.
     * A NULL mm means there is no user address space to fault into.
     */
    if (faulthandler_disabled() || !mm)
        goto no_context;
    /* Record in mm_flags whether the fault was taken from user mode. */
    if (user_mode(regs))
        mm_flags |= FAULT_FLAG_USER;

    /*
     * vm_flags tells us what bits we must have in vma->vm_flags
     * for the fault to be benign, __do_page_fault() would check
     * vma->vm_flags & vm_flags and returns an error if the
     * intersection is empty
     */
    /* Classify the fault from esr: instruction fetch, write, or read. */
    if (is_el0_instruction_abort(esr)) {
        /* It was exec fault */
        vm_flags = VM_EXEC;
        mm_flags |= FAULT_FLAG_INSTRUCTION;
    } else if (is_write_abort(esr)) {
        /* It was write fault */
        vm_flags = VM_WRITE;
        mm_flags |= FAULT_FLAG_WRITE;
    } else {
        /* It was read fault */
        vm_flags = VM_READ;
        /* Write implies read */
        vm_flags |= VM_WRITE;
        /* If EPAN is absent then exec implies read */
        if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
            vm_flags |= VM_EXEC;
    }

    /*
     * Permission fault taken at EL1 on a TTBR0 (user-range) address:
     * either of the two cases below is treated as a fatal kernel fault.
     */
    if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
        /* The kernel tried to execute from a user address. */
        if (is_el1_instruction_abort(esr))
            die_kernel_fault("execution of user memory",
                     addr, esr, regs);

        /* The faulting PC has no exception-table entry. */
        if (!search_exception_tables(regs->pc))
            die_kernel_fault("access to user memory outside uaccess routines",
                     addr, esr, regs);
    }

    /* Account this fault as a perf software event. */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

    /* The per-VMA-lock fast path below is only attempted for user-mode faults. */
    if (!(mm_flags & FAULT_FLAG_USER))
        goto lock_mmap;
    /* Look up the VMA for addr; fall back to the mmap-lock path if none is returned. */
    vma = lock_vma_under_rcu(mm, addr);
    if (!vma)
        goto lock_mmap;
    /*
     * The VMA must allow at least one of the access types in vm_flags.
     * If not, release the VMA read lock and let the mmap-lock path decide.
     */
    if (!(vma->vm_flags & vm_flags)) {
        vma_end_read(vma);
        goto lock_mmap;
    }
    /* Core mm fault handling, flagged as running under the VMA lock. */
    fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
    /*
     * Release the VMA read lock ourselves unless RETRY or COMPLETED is set
     * (presumably handle_mm_fault() has already dropped it in those cases -
     * confirm).
     */
    if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
        vma_end_read(vma);
    /* No retry requested: the fast path produced the final result. */
    if (!(fault & VM_FAULT_RETRY)) {
        count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
        goto done;
    }
    count_vm_vma_lock_event(VMA_LOCK_RETRY);
    /* A major fault on the first attempt marks the retry as already tried. */
    if (fault & VM_FAULT_MAJOR)
        mm_flags |= FAULT_FLAG_TRIED;

    /* Quick path to respond to signals */
    if (fault_signal_pending(fault, regs)) {
        if (!user_mode(regs))
            goto no_context;
        return 0;
    }
lock_mmap:

retry:
    /* Slow path: find the VMA via lock_mm_and_find_vma(); no VMA means a bad mapping. */
    vma = lock_mm_and_find_vma(mm, addr, regs);
    if (unlikely(!vma)) {
        fault = VM_FAULT_BADMAP;
        goto done;
    }
    /*
     * Per the comment above, __do_page_fault() checks vma->vm_flags against
     * vm_flags. NOTE(review): it presumably ends up in handle_mm_fault() as
     * well - its body is not visible here.
     */
    fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);

    /* Quick path to respond to signals */
    if (fault_signal_pending(fault, regs)) {
        if (!user_mode(regs))
            goto no_context;
        return 0;
    }

    /* The fault is fully completed (including releasing mmap lock) */
    if (fault & VM_FAULT_COMPLETED)
        return 0;

    /* Retry requested: mark the next attempt as a retry and start over. */
    if (fault & VM_FAULT_RETRY) {
        mm_flags |= FAULT_FLAG_TRIED;
        goto retry;
    }
    /* Final result reached on the slow path; drop the mmap read lock. */
    mmap_read_unlock(mm);

done:
    /*
     * Handle the "normal" (no error) case first.
     */
    /* In the common case none of the error bits are set and we return here. */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
                  VM_FAULT_BADACCESS))))
        return 0;

    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (!user_mode(regs))
        goto no_context;

    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we got
         * oom-killed).
         */
        pagefault_out_of_memory();
        return 0;
    }

    inf = esr_to_fault_info(esr);
    /*
     * Record the fault address and syndrome for the current task.
     * NOTE(review): presumably stored in the thread state inside
     * task_struct (fault_address / fault_code) - confirm against
     * set_thread_esr().
     */
    set_thread_esr(addr, esr);
    /* The signal helpers below are given the raw far, not the untagged addr. */
    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to successfully fix up
         * this page fault.
         */
        arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
    } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
        unsigned int lsb;

        /* Default to PAGE_SHIFT; for the _LARGE case derive the shift from the hstate index. */
        lsb = PAGE_SHIFT;
        if (fault & VM_FAULT_HWPOISON_LARGE)
            lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

        arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
    } else {
        /*
         * Something tried to access memory that isn't in our memory
         * map.
         */
        /* SEGV_ACCERR only when fault is exactly VM_FAULT_BADACCESS, else SEGV_MAPERR. */
        arm64_force_sig_fault(SIGSEGV,
                      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
                      far, inf->name);
    }

    return 0;

no_context:
    /*
     * Reached for faults that cannot be handled in the current context
     * (kernel mode, faults disabled, or no mm). Hand over to the
     * kernel-fault path. NOTE(review): this usually indicates a kernel bug
     * and presumably ends in an Oops unless a fixup applies - confirm
     * against __do_kernel_fault().
     */
    __do_kernel_fault(addr, esr, regs);
    return 0;
}

kernel一般只处理用户进程发生的page fault；如果缺页发生在kernel态（例如上面代码中提到的在uaccess例程之外访问用户内存），多半是个bug。do_page_fault会调用handle_mm_fault去处理缺页。这个函数之前已经分析过，这里先省略，为了完整性之后会补上。

posted on 2024-06-11 13:47 半山随笔 阅读(173) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

linux内存管理（五）- 缺页处理

导航

公告