linux内存管理(七)- 写时复制

在fork进程的时候,子进程会复制父进程的页表,与父进程共享同一批物理页,但并没有分配新页。此时页表项(pte)是只读的,如果父进程或者子进程写内存就会触发page fault,内核会重新分配内存并更改页表,父子进程从此分道扬镳。因此写时复制包含两部分内容:第一是fork进程时复制页表并设置pte为只读,第二是写内存时发生page fault并完成页面复制。

先来看看第一部分。

复制页表的操作发生在dup_mmap中,调用链是kernel_clone->copy_process->copy_mm->dup_mm->dup_mmap

/*
 * Excerpt of dup_mmap(); "..." marks code elided by the article.
 *
 * For each VMA visited by for_each_vma(), the page tables are copied with
 * copy_page_range() unless tmp->vm_flags has VM_WIPEONFORK set. tmp is passed
 * as the destination VMA and mpnt as the source VMA.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                    struct mm_struct *oldmm)
{
...
    for_each_vma(vmi, mpnt) {
...
        if (!(tmp->vm_flags & VM_WIPEONFORK))
            // Copy the page tables
            retval = copy_page_range(tmp, mpnt);
...
    }
...
}
/*
 * Excerpt of copy_page_range(); "..." marks code elided by the article.
 *
 * Walks the pgd entries from addr up to end, advancing the source and
 * destination pgd pointers in lockstep, and hands each entry that is not
 * skipped to copy_p4d_range().
 *
 * Returns ret; the only failure path visible here sets it to -ENOMEM.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
...
    do {
        next = pgd_addr_end(addr, end);
        // Skip source entries for which pgd_none_or_clear_bad() is true
        if (pgd_none_or_clear_bad(src_pgd))
            continue;
        if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                        addr, next))) {
            // Lower-level copy failed: call untrack_pfn_clear() on the
            // destination VMA and stop with -ENOMEM
            untrack_pfn_clear(dst_vma);
            ret = -ENOMEM;
            break;
        }
    } while (dst_pgd++, src_pgd++, addr = next, addr != end);
...
    return ret;
}

copy_page_range循环复制各级页表,copy_p4d_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_present_pte

/*
 * copy_present_pte - copy one present PTE from the source VMA to the
 * destination VMA.
 *
 * For a normal page, a folio reference is taken and the rmap is duplicated
 * (anon or file variant) and the matching rss[] counter is bumped. If
 * duplicating the anon rmap fails, the reference is dropped and the work is
 * delegated to copy_present_page(), whose result is returned.
 *
 * For a COW mapping with a writable PTE, the source PTE is write-protected in
 * place and the copy written to dst_pte is write-protected too.
 *
 * Returns 0 on the shared path, or the result of copy_present_page().
 */
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
         pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
         struct folio **prealloc)
{
    struct mm_struct *src_mm = src_vma->vm_mm;
    unsigned long vm_flags = src_vma->vm_flags;
    pte_t pte = ptep_get(src_pte);
    struct page *page;
    struct folio *folio;

    /* folio is only assigned, and only read, when page is non-NULL. */
    page = vm_normal_page(src_vma, addr, pte);
    if (page)
        folio = page_folio(page);
    if (page && folio_test_anon(folio)) {
        /*
         * If this page may have been pinned by the parent process,
         * copy the page immediately for the child so that we'll always
         * guarantee the pinned page won't be randomly replaced in the
         * future.
         */
        folio_get(folio);
        if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
            /* Page may be pinned, we have to copy. */
            folio_put(folio);
            return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                         addr, rss, prealloc, page);
        }
        rss[MM_ANONPAGES]++;
    } else if (page) {
        folio_get(folio);
        folio_dup_file_rmap_pte(folio, page);
        rss[mm_counter_file(page)]++;
    }

    /*
     * If it's a COW mapping and the PTE is currently writable, write
     * protect it both in the parent (src_pte, in place) and in the
     * child (the local copy that is installed below).
     */
    if (is_cow_mapping(vm_flags) && pte_write(pte)) {
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);
    }
    VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

    /*
     * If it's a shared mapping, mark it clean in
     * the child
     */
    if (vm_flags & VM_SHARED)
        pte = pte_mkclean(pte);
    pte = pte_mkold(pte);

    /* Keep the uffd-wp bit only if the destination VMA is uffd-wp. */
    if (!userfaultfd_wp(dst_vma))
        pte = pte_clear_uffd_wp(pte);

    set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
    return 0;
}

如果vma是cow mapping,pte将会被设置为写保护。如果写对应的内存就会触发page fault,最终由handle_pte_fault处理。

在分析handle_pte_fault时我们只关注了分配匿名页的情况,下面我们看看写时复制。

/*
 * Excerpt of handle_pte_fault(); "..." marks code elided by the article, and
 * the excerpt stops before the end of the function.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;
...
    // Only write faults and unshare faults take this branch
    if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
        // PTE is not writable: hand the fault to do_wp_page()
        if (!pte_write(entry))
            return do_wp_page(vmf);
        // PTE is already writable and this is a write fault: mark it dirty
        else if (likely(vmf->flags & FAULT_FLAG_WRITE))
            entry = pte_mkdirty(entry);

什么情况会走到处理写时复制的地方呢?如果pte不为0,页面在内存中,错误是写错误,页面非共享且当前pte没有可写属性,我们就认为此时发生了写时复制,do_wp_page会处理此错误。

/*
 * Excerpt of do_wp_page(); "..." marks code elided by the article.
 *
 * In the lines shown, it looks up the page behind the faulting address with
 * vm_normal_page() (using vmf->orig_pte), stores it in vmf->page, and ends by
 * returning the result of wp_page_copy(vmf). The function carries a
 * __releases(vmf->ptl) annotation.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
    __releases(vmf->ptl)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct folio *folio = NULL;
    pte_t pte;
    ...
    // Resolve the page mapped by the original PTE at the faulting address
    vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

...
    // Fall through to the copy path
    return wp_page_copy(vmf);
}

忽略special映射,vm_normal_page会返回普通映射页面。wp_page_copy会去处理写时复制的情况。

static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
    const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
    struct vm_area_struct *vma = vmf->vma;
    struct mm_struct *mm = vma->vm_mm;
    struct folio *old_folio = NULL;
    struct folio *new_folio = NULL;
    pte_t entry;
    int page_copied = 0;
    struct mmu_notifier_range range;
    vm_fault_t ret;
    bool pfn_is_zero;

    delayacct_wpcopy_start();

    if (vmf->page)
        old_folio = page_folio(vmf->page);
    //准备反向映射
    ret = vmf_anon_prepare(vmf);
    if (unlikely(ret))
        goto out;
       
    pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
    //分配内存
    new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
    if (!new_folio)
        goto oom;

    if (!pfn_is_zero) {
        int err;
        //copy 旧页到新分配的页面
        err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
        if (err) {
            /*
             * COW failed, if the fault was solved by other,
             * it's fine. If not, userspace would re-fault on
             * the same address and we will handle the fault
             * from the second attempt.
             * The -EHWPOISON case will not be retried.
             */
            folio_put(new_folio);
            if (old_folio)
                folio_put(old_folio);

            delayacct_wpcopy_end();
            return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
        }
        kmsan_copy_page_meta(&new_folio->page, vmf->page);
    }

    __folio_mark_uptodate(new_folio);

    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                vmf->address & PAGE_MASK,
                (vmf->address & PAGE_MASK) + PAGE_SIZE);
    mmu_notifier_invalidate_range_start(&range);

    /*
     * Re-check the pte - we dropped the lock
     */
    vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
    if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
        if (old_folio) {
            if (!folio_test_anon(old_folio)) {
                dec_mm_counter(mm, mm_counter_file(&old_folio->page));
                inc_mm_counter(mm, MM_ANONPAGES);
            }
        } else {
            ksm_might_unmap_zero_page(mm, vmf->orig_pte);
            inc_mm_counter(mm, MM_ANONPAGES);
        }
        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        entry = mk_pte(&new_folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (unlikely(unshare)) {
            if (pte_soft_dirty(vmf->orig_pte))
                entry = pte_mksoft_dirty(entry);
            if (pte_uffd_wp(vmf->orig_pte))
                entry = pte_mkuffd_wp(entry);
        } else {
            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry, to keep TLBs on different CPUs in
         * sync. This code used to set the new PTE then flush TLBs, but
         * that left a window where the new PTE could be loaded into
         * some TLBs while the old PTE remains in others.
         */
        ptep_clear_flush(vma, vmf->address, vmf->pte);
        //将vma的anon_vma设置到folio的i_mapping字段
        folio_add_new_anon_rmap(new_folio, vma, vmf->address);
//将folio加入lru folio_add_lru_vma(new_folio, vma);
/* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ BUG_ON(unshare && pte_write(entry)); //将新的page地址设置到pte上 set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * folio_remove_rmap_pte() with the ptp_clear_flush * above. Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in folio_remove_rmap_pte(); * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ //pte已经切换到新的page,反向映射该remove了 folio_remove_rmap_pte(old_folio, vmf->page, vma); } /* Free the old page.. */ new_folio = old_folio; page_copied = 1; pte_unmap_unlock(vmf->pte, vmf->ptl); } else if (vmf->pte) { update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); if (old_folio) { if (page_copied) free_swap_cache(&old_folio->page); folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom: ret = VM_FAULT_OOM; out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); return ret; }

wp_page_copy的流程:

1. vmf_anon_prepare准备反向映射;

2. folio_prealloc分配新页;

3. __wp_page_copy_user将旧页的内容复制到新页;

4. folio_add_new_anon_rmap将vma->anon_vma + 1(即带上PAGE_MAPPING_ANON标志)设置到folio->mapping;

5. set_pte_at_notify设置新的page pfn到pte中;

6. folio_remove_rmap_pte删除旧的folio反向映射;

posted @ 2024-06-11 16:29  半山随笔  阅读(22)  评论(0编辑  收藏  举报