linux内存管理（七）- 写时复制

在fork进程的时候子进程会复制父进程的页表，并与父进程共享同一批物理页，此时并不会为子进程分配新页。这些页对应的pte是只读的，如果父进程或者子进程写内存就会触发page fault，内核会重新分配内存并更改页表，从此分道扬镳。因此写时复制包含两部分内容，第一是fork进程时复制页表并设置pte为只读，第二是写内存发生page fault。

先来看看第一部分。

复制页表的操作发生在dup_mmap中，调用链是kernel_clone->copy_process->copy_mm->dup_mm->dup_mmap

/*
 * dup_mmap - excerpt; lines consisting of "..." mark code the article elided.
 *
 * The visible part iterates over VMAs with for_each_vma() and, for every
 * VMA whose flags do not include VM_WIPEONFORK, copies the page tables with
 * copy_page_range(tmp, mpnt). Given copy_page_range()'s (dst_vma, src_vma)
 * parameter order, tmp is the destination VMA and mpnt the source VMA.
 * NOTE(review): the declarations of vmi/tmp/mpnt/retval and the handling of
 * retval are in the elided parts.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                    struct mm_struct *oldmm)
{
...
    for_each_vma(vmi, mpnt) {
...
        /* VM_WIPEONFORK VMAs skip the page-table copy entirely. */
        if (!(tmp->vm_flags & VM_WIPEONFORK))
            //copy the page tables
            retval = copy_page_range(tmp, mpnt);
...
    }
...
}

/*
 * copy_page_range - excerpt; lines consisting of "..." mark elided code.
 *
 * The visible loop walks the top-level (pgd) entries covering [addr, end)
 * in lock-step for source and destination and delegates each populated
 * entry to copy_p4d_range().
 *
 * Returns ret: set to -ENOMEM when copy_p4d_range() reports failure.
 * NOTE(review): ret's initial value and the setup of addr/end/src_pgd/dst_pgd
 * are in the elided parts.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
...
    do {
        /* End of the range covered by the current pgd entry (per pgd_addr_end). */
        next = pgd_addr_end(addr, end);
        /*
         * Skip this source entry when pgd_none_or_clear_bad() is true.
         * `continue` in a do-while still evaluates the loop condition, so
         * the pointers and addr are advanced as usual.
         */
        if (pgd_none_or_clear_bad(src_pgd))
            continue;
        if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                        addr, next))) {
            /* Lower-level copy failed: clean up via untrack_pfn_clear() and stop. */
            untrack_pfn_clear(dst_vma);
            ret = -ENOMEM;
            break;
        }
    } while (dst_pgd++, src_pgd++, addr = next, addr != end);
...
    return ret;
}

copy_page_range循环复制各级页表，copy_p4d_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_present_pte

/*
 * copy_present_pte - copy one pte from src_vma into dst_vma at addr.
 *
 * @dst_vma/@src_vma: destination and source VMAs.
 * @dst_pte/@src_pte: destination and source pte slots.
 * @addr:             virtual address the pte maps.
 * @rss:              counter array indexed by MM_ANONPAGES or
 *                    mm_counter_file(page); the matching slot is incremented
 *                    when a page is shared here.
 * @prealloc:         passed through unchanged to copy_present_page().
 *
 * Returns 0 after installing the pte in the destination, or whatever
 * copy_present_page() returns when folio_try_dup_anon_rmap_pte() fails.
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
         pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
         struct folio **prealloc)
{
    struct mm_struct *src_mm = src_vma->vm_mm;
    unsigned long vm_flags = src_vma->vm_flags;
    pte_t pte = ptep_get(src_pte);
    struct page *page;
    struct folio *folio;

    page = vm_normal_page(src_vma, addr, pte);
    /* folio is only assigned when page is non-NULL; every use below is guarded by `page`. */
    if (page)
        folio = page_folio(page);
    if (page && folio_test_anon(folio)) {
        /*
         * If this page may have been pinned by the parent process,
         * copy the page immediately for the child so that we'll always
         * guarantee the pinned page won't be randomly replaced in the
         * future.
         */
        folio_get(folio);
        if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
            /* Page may be pinned, we have to copy. */
            /* Drop the reference taken just above before handing off to the copy path. */
            folio_put(folio);
            return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                         addr, rss, prealloc, page);
        }
        rss[MM_ANONPAGES]++;
    } else if (page) {
        /* Non-anonymous page: take a reference, duplicate the file rmap, count it. */
        folio_get(folio);
        folio_dup_file_rmap_pte(folio, page);
        rss[mm_counter_file(page)]++;
    }

    /*
     * If it's a COW mapping, write protect it both
     * in the parent and the child
     */
    //for a COW mapping whose pte is writable, write-protect the source pte
    //in place and the local copy that will be installed in the destination
    if (is_cow_mapping(vm_flags) && pte_write(pte)) {
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);
    }
    VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

    /*
     * If it's a shared mapping, mark it clean in
     * the child
     */
    if (vm_flags & VM_SHARED)
        pte = pte_mkclean(pte);
    /* The destination copy is always marked old. */
    pte = pte_mkold(pte);

    /* Clear the uffd-wp bit unless userfaultfd_wp() is true for the destination VMA. */
    if (!userfaultfd_wp(dst_vma))
        pte = pte_clear_uffd_wp(pte);

    set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
    return 0;
}

如果vma是cow mapping，pte将会被设置为写保护。如果写对应的内存就会触发page fault，最终由handle_pte_fault处理。

在分析handle_pte_fault时我们只关注了分配匿名页的情况，下面我们看看写时复制。

/*
 * handle_pte_fault - excerpt; "..." marks elided code and the quote stops
 * before the end of the function.
 *
 * Visible part: for a fault flagged FAULT_FLAG_WRITE or FAULT_FLAG_UNSHARE,
 * a pte without write permission is handed to do_wp_page(); otherwise, on a
 * write fault, the local entry is marked dirty.
 * NOTE(review): how `entry` is loaded is in the elided part.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;
...
    if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
        /* Not writable: let do_wp_page() resolve the write-protect fault. */
        if (!pte_write(entry))
            return do_wp_page(vmf);
        else if (likely(vmf->flags & FAULT_FLAG_WRITE))
            entry = pte_mkdirty(entry);

什么情况会走到处理写时复制的地方呢？如果pte不为0，页面在内存中，错误是写错误，页面非共享且当前pte没有可写属性，我们就认为此时发生了写时复制，do_wp_page会处理此错误。

/*
 * do_wp_page - excerpt; "..." marks elided code.
 *
 * Annotated __releases(vmf->ptl). The visible part looks up the page behind
 * the faulting pte with vm_normal_page(), stores it in vmf->page, and ends by
 * returning wp_page_copy(vmf).
 * NOTE(review): the elided code may return earlier on other paths — confirm
 * against the full source before relying on this excerpt.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
    __releases(vmf->ptl)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct folio *folio = NULL;
    pte_t pte;
    ...
    /* wp_page_copy() later reads vmf->page to find the old folio. */
    vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

...
    return wp_page_copy(vmf);
}

忽略special映射，vm_normal_page会返回普通映射页面。wp_page_copy会去处理写时复制的情况。

/*
 * wp_page_copy - give the faulting address a freshly allocated page.
 *
 * Steps visible below:
 *   1. vmf_anon_prepare() prepares the reverse-mapping side;
 *   2. folio_prealloc() allocates the new folio;
 *   3. unless the old pfn is the zero pfn, __wp_page_copy_user() copies the
 *      old page into the new one;
 *   4. under the pte lock, if the pte still equals vmf->orig_pte, the old
 *      entry is cleared and flushed, the new folio is added to the anon rmap
 *      and the LRU, and the new entry is installed;
 *   5. the old folio's rmap is removed and references are dropped.
 *
 * Returns:
 *   - the non-zero result of vmf_anon_prepare() if it fails;
 *   - VM_FAULT_OOM if folio_prealloc() returns NULL;
 *   - VM_FAULT_HWPOISON if the copy fails with -EHWPOISON, 0 for any other
 *     copy failure;
 *   - 0 otherwise (including when the pte changed and nothing was installed).
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct mm_struct *mm = vma->vm_mm;
    struct folio *old_folio = NULL;
    struct folio *new_folio = NULL;
    pte_t entry;
    int page_copied = 0;
    struct mmu_notifier_range range;
    vm_fault_t ret;
    bool pfn_is_zero;

    delayacct_wpcopy_start();

    /* vmf->page may be NULL, in which case old_folio stays NULL. */
    if (vmf->page)
        old_folio = page_folio(vmf->page);
    //prepare for reverse mapping
    ret = vmf_anon_prepare(vmf);
    if (unlikely(ret))
        goto out;
       
    pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
    //allocate the new folio
    new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
    if (!new_folio)
        goto oom;

    if (!pfn_is_zero) {
        int err;
        //copy the old page into the newly allocated page
        err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
        if (err) {
            /*
             * COW failed, if the fault was solved by other,
             * it's fine. If not, userspace would re-fault on
             * the same address and we will handle the fault
             * from the second attempt.
             * The -EHWPOISON case will not be retried.
             */
            folio_put(new_folio);
            if (old_folio)
                folio_put(old_folio);

            delayacct_wpcopy_end();
            return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
        }
        kmsan_copy_page_meta(&new_folio->page, vmf->page);
    }

    __folio_mark_uptodate(new_folio);

    /* The notifier range covers exactly the page containing the faulting address. */
    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                vmf->address & PAGE_MASK,
                (vmf->address & PAGE_MASK) + PAGE_SIZE);
    mmu_notifier_invalidate_range_start(&range);

    /*
     * Re-check the pte - we dropped the lock
     */
    vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
    if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
        if (old_folio) {
            /* A non-anon old page is replaced by an anon one: move the count from the file counter to MM_ANONPAGES. */
            if (!folio_test_anon(old_folio)) {
                dec_mm_counter(mm, mm_counter_file(&old_folio->page));
                inc_mm_counter(mm, MM_ANONPAGES);
            }
        } else {
            ksm_might_unmap_zero_page(mm, vmf->orig_pte);
            inc_mm_counter(mm, MM_ANONPAGES);
        }
        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        entry = mk_pte(&new_folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (unlikely(unshare)) {
            /* Unshare fault: carry over soft-dirty and uffd-wp from the old pte; the entry is not made writable (see BUG_ON below). */
            if (pte_soft_dirty(vmf->orig_pte))
                entry = pte_mksoft_dirty(entry);
            if (pte_uffd_wp(vmf->orig_pte))
                entry = pte_mkuffd_wp(entry);
        } else {
            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry, to keep TLBs on different CPUs in
         * sync. This code used to set the new PTE then flush TLBs, but
         * that left a window where the new PTE could be loaded into
         * some TLBs while the old PTE remains in others.
         */
        ptep_clear_flush(vma, vmf->address, vmf->pte);
        //article author's note: records the vma's anon_vma in the new folio
        //(the note names the field "i_mapping"; presumably folio->mapping - TODO confirm)
        folio_add_new_anon_rmap(new_folio, vma, vmf->address);
        //add the folio to the LRU
        folio_add_lru_vma(new_folio, vma);
        /*
         * We call the notify macro here because, when using secondary
         * mmu page tables (such as kvm shadow page tables), we want the
         * new page to be mapped directly into the secondary page table.
         */
        BUG_ON(unshare && pte_write(entry));
        //install the new entry, which points at the new page, into the pte
        set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        if (old_folio) {
            /*
             * Only after switching the pte to the new page may
             * we remove the mapcount here. Otherwise another
             * process may come and find the rmap count decremented
             * before the pte is switched to the new page, and
             * "reuse" the old page writing into it while our pte
             * here still points into it and can be read by other
             * threads.
             *
             * The critical issue is to order this
             * folio_remove_rmap_pte() with the ptep_clear_flush
             * above. Those stores are ordered by (if nothing else,)
             * the barrier present in the atomic_add_negative
             * in folio_remove_rmap_pte();
             *
             * Then the TLB flush in ptep_clear_flush ensures that
             * no process can access the old page before the
             * decremented mapcount is visible. And the old page
             * cannot be reused until after the decremented
             * mapcount is visible. So transitively, TLBs to
             * old page will be flushed before it can be reused.
             */
            //the pte now points at the new page, so remove the old folio's reverse mapping
            folio_remove_rmap_pte(old_folio, vmf->page, vma);
        }

        /* Free the old page.. */
        /*
         * Repoint new_folio at old_folio so the folio_put(new_folio) below
         * drops a reference on the old folio instead of the one just installed.
         */
        new_folio = old_folio;
        page_copied = 1;
        pte_unmap_unlock(vmf->pte, vmf->ptl);
    } else if (vmf->pte) {
        /* The pte changed while unlocked: install nothing, just refresh the TLB entry and unlock. */
        update_mmu_tlb(vma, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
    }

    mmu_notifier_invalidate_range_end(&range);

    /*
     * On success new_folio aliases old_folio (possibly NULL); otherwise it is
     * still the unused new folio, which is released here.
     */
    if (new_folio)
        folio_put(new_folio);
    if (old_folio) {
        if (page_copied)
            free_swap_cache(&old_folio->page);
        folio_put(old_folio);
    }

    delayacct_wpcopy_end();
    return 0;
oom:
    ret = VM_FAULT_OOM;
out:
    /* Shared error exit: drop the old folio reference and end delay accounting. */
    if (old_folio)
        folio_put(old_folio);

    delayacct_wpcopy_end();
    return ret;
}

wp_page_copy的流程：

1. vmf_anon_prepare准备反向映射；

2. folio_prealloc分配新页；

3. __wp_page_copy_user将旧页的内容复制到新页；

4. folio_add_new_anon_rmap将vma->anon_vma + 1设置到folio->mapping;

5. set_pte_at_notify设置新的page pfn到pte中；

6. folio_remove_rmap_pte删除旧的folio反向映射；

posted on 2024-06-11 16:29 半山随笔阅读(73) 评论(0) 编辑收藏举报

刷新页面返回顶部

linux内存管理（七）- 写时复制

导航

公告