Linux memory management (10) - page reclaim (2)

This article looks at how the kernel triggers page reclaim.

Memory reclaim can be triggered in two ways: synchronously and asynchronously. When alloc_pages finds memory short while satisfying an allocation, it reclaims memory itself on the spot; this is synchronous (direct) reclaim. The kernel also runs one or more kswapd kernel threads that reclaim memory in the background; this is asynchronous reclaim.
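
Before digging into the code, it helps to know that both paths leave traces in /proc/vmstat. The following is a small userspace C sketch (my own illustration, not kernel code) that prints the pgscan_*/pgsteal_* counters, assuming those counter names are exposed by the running kernel, which is the case on any recent one:

/*
 * Userspace sketch: watch direct vs. kswapd reclaim activity.
 * Assumes /proc/vmstat exposes pgscan_kswapd, pgscan_direct,
 * pgsteal_kswapd and pgsteal_direct (true on recent kernels).
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *keys[] = { "pgscan_kswapd", "pgscan_direct",
                           "pgsteal_kswapd", "pgsteal_direct" };
    char name[64];
    unsigned long long val;
    FILE *fp = fopen("/proc/vmstat", "r");

    if (!fp) {
        perror("fopen /proc/vmstat");
        return 1;
    }
    while (fscanf(fp, "%63s %llu", name, &val) == 2) {
        for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
            if (!strcmp(name, keys[i]))
                printf("%-16s %llu\n", name, val);
    }
    fclose(fp);
    return 0;
}

Rising pgscan_direct/pgsteal_direct values mean allocations are falling into synchronous (direct) reclaim; pgscan_kswapd/pgsteal_kswapd track the background kswapd path.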

Both the direct path and kswapd eventually reach shrink_node() --> shrink_node_memcgs() --> shrink_lruvec(), so let's start with shrink_lruvec:

static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
    unsigned long nr[NR_LRU_LISTS];
    unsigned long targets[NR_LRU_LISTS];
    unsigned long nr_to_scan;
    enum lru_list lru;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    bool proportional_reclaim;
    struct blk_plug plug;

    if (lru_gen_enabled() && !root_reclaim(sc)) {
        lru_gen_shrink_lruvec(lruvec, sc);
        return;
    }
    //work out how many pages each LRU list should scan
    get_scan_count(lruvec, sc, nr);

    /* Record the original scan target for proportional adjustments later */
    memcpy(targets, nr, sizeof(nr));

    /*
     * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
     * event that can occur when there is little memory pressure e.g.
     * multiple streaming readers/writers. Hence, we do not abort scanning
     * when the requested number of pages are reclaimed when scanning at
     * DEF_PRIORITY on the assumption that the fact we are direct
     * reclaiming implies that kswapd is not keeping up and it is best to
     * do a batch of work at once. For memcg reclaim one check is made to
     * abort proportional reclaim if either the file or anon lru has already
     * dropped to zero at the first pass.
     */
    proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
                sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||          //the active file list is scanned too
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;

        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;
                //shrink_list calls shrink_active_list or shrink_inactive_list
                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                lruvec, sc);
            }
        }

        cond_resched();

        if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
            continue;

        /*
         * For kswapd and memcg, reclaim at least the number of pages
         * requested. Ensure that the anon and file LRUs are scanned
         * proportionally what was requested by get_scan_count(). We
         * stop reclaiming one LRU and reduce the amount scanning
         * proportional to the original scan target.
         */
        nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
        nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

        /*
         * It's just vindictive to attack the larger once the smaller
         * has gone to zero.  And given the way we stop scanning the
         * smaller below, this makes sure that we only make one nudge
         * towards proportionality once we've got nr_to_reclaim.
         */
        if (!nr_file || !nr_anon)
            break;

        if (nr_file > nr_anon) {
            unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
                        targets[LRU_ACTIVE_ANON] + 1;
            lru = LRU_BASE;
            percentage = nr_anon * 100 / scan_target;
        } else {
            unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
                        targets[LRU_ACTIVE_FILE] + 1;
            lru = LRU_FILE;
            percentage = nr_file * 100 / scan_target;
        }

        /* Stop scanning the smaller of the LRU */
        nr[lru] = 0;
        nr[lru + LRU_ACTIVE] = 0;

        /*
         * Recalculate the other LRU scan count based on its original
         * scan target and the percentage scanning already complete
         */
        lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);

        lru += LRU_ACTIVE;
        nr_scanned = targets[lru] - nr[lru];
        nr[lru] = targets[lru] * (100 - percentage) / 100;
        nr[lru] -= min(nr[lru], nr_scanned);
    }
    blk_finish_plug(&plug);
    sc->nr_reclaimed += nr_reclaimed;

    /*
     * Even if we did not try to evict anon pages at all, we want to
     * rebalance the anon lru active/inactive ratio.
     */
    if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
        inactive_is_low(lruvec, LRU_INACTIVE_ANON))
        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                   sc, LRU_ACTIVE_ANON);
}
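
The proportional trimming at the bottom of shrink_lruvec is the least obvious part, so here is a toy userspace model of it: once nr_to_reclaim has been met, scanning of the smaller LRU type stops and the larger one is cut back so both end up scanned in roughly the ratio get_scan_count chose. All the numbers below are invented purely for illustration:

/*
 * Toy model of shrink_lruvec()'s proportional cut-off. The numbers are
 * invented; only the percentage arithmetic mirrors the kernel code above.
 */
#include <stdio.h>

int main(void)
{
    unsigned long target_anon = 400, target_file = 1600; /* original scan targets */
    unsigned long nr_anon = 100, nr_file = 1200;          /* still left to scan */
    unsigned long percentage, scanned_file, new_file;

    /* anon is the smaller side here: how much of its target remains? */
    percentage = nr_anon * 100 / (target_anon + 1);

    /* anon scanning stops; rescale file to the same remaining percentage */
    scanned_file = target_file - nr_file;
    new_file = target_file * (100 - percentage) / 100;
    new_file -= (new_file < scanned_file) ? new_file : scanned_file;

    printf("anon target remaining: %lu%%, file left to scan: %lu\n",
           percentage, new_file);
    return 0;
}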

Now look at shrink_list:

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                 struct lruvec *lruvec, struct scan_control *sc)
{
    if (is_active_lru(lru)) {
        if (sc->may_deactivate & (1 << is_file_lru(lru)))
            shrink_active_list(nr_to_scan, lruvec, sc, lru);
        else
            sc->skipped_deactivate = 1;
        return 0;
    }

    return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

If this is an active LRU and deactivating it is allowed (sc->may_deactivate), shrink_active_list() is called; inactive LRUs go to shrink_inactive_list().

/*
 * shrink_active_list() moves folios from the active LRU to the inactive LRU.
 *
 * We move them the other way if the folio is referenced by one or more
 * processes.
 *
 * If the folios are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the folios are mapped, the processing is slow (folio_referenced()), so
 * we should drop lru_lock around each folio.  It's impossible to balance
 * this, so instead we remove the folios from the LRU while processing them.
 * It is safe to rely on the active flag against the non-LRU folios in here
 * because nobody will play with that bit on a non-LRU folio.
 *
 * The downside is that we have to touch folio->_refcount against each folio.
 * But we had to alter folio->flags anyway.
 */
static void shrink_active_list(unsigned long nr_to_scan,
                   struct lruvec *lruvec,
                   struct scan_control *sc,
                   enum lru_list lru)
{
    unsigned long nr_taken;
    unsigned long nr_scanned;
    unsigned long vm_flags;
    LIST_HEAD(l_hold);    /* The folios which were snipped off */
    LIST_HEAD(l_active);
    LIST_HEAD(l_inactive);
    unsigned nr_deactivate, nr_activate;
    unsigned nr_rotated = 0;
    int file = is_file_lru(lru);
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    //drain the per-CPU LRU caches
    lru_add_drain();

    spin_lock_irq(&lruvec->lru_lock);
    //first detach the folios to be scanned from the LRU onto l_hold; I think this is done to shorten how long lru_lock is held
    nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
                     &nr_scanned, sc, lru);

    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

    if (!cgroup_reclaim(sc))
        __count_vm_events(PGREFILL, nr_scanned);
    __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

    spin_unlock_irq(&lruvec->lru_lock);
    //walk the folios on l_hold
    while (!list_empty(&l_hold)) {
        struct folio *folio;

        cond_resched();
        folio = lru_to_folio(&l_hold);
        list_del(&folio->lru);

        if (unlikely(!folio_evictable(folio))) {
            folio_putback_lru(folio);
            continue;
        }
        //to be analyzed later
        if (unlikely(buffer_heads_over_limit)) {
            if (folio_needs_release(folio) &&
                folio_trylock(folio)) {
                filemap_release_folio(folio, 0);
                folio_unlock(folio);
            }
        }

        /* Referenced or rmap lock contention: rotate */
        if (folio_referenced(folio, 0, sc->target_mem_cgroup,
                     &vm_flags) != 0) {
            /*
             * Identify referenced, file-backed active folios and
             * give them one more trip around the active list. So
             * that executable code get better chances to stay in
             * memory under moderate memory pressure.  Anon folios
             * are not likely to be evicted by use-once streaming
             * IO, plus JVM can create lots of anon VM_EXEC folios,
             * so we ignore them here.
             */
            //referenced executable file cache goes back onto the active list first
            if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
                nr_rotated += folio_nr_pages(folio);
                list_add(&folio->lru, &l_active);
                continue;
            }
        }
        //clear the folio's active flag
        folio_clear_active(folio);    /* we are de-activating */
        folio_set_workingset(folio);
        //add the folio to the temporary inactive list
        list_add(&folio->lru, &l_inactive);
    }

    /*
     * Move folios back to the lru list.
     */
    spin_lock_irq(&lruvec->lru_lock);

    nr_activate = move_folios_to_lru(lruvec, &l_active);
    nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
    /* Keep all free folios in l_active list */
    list_splice(&l_inactive, &l_active);

    __count_vm_events(PGDEACTIVATE, nr_deactivate);
    __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
    spin_unlock_irq(&lruvec->lru_lock);

    if (nr_rotated)
        lru_note_cost(lruvec, file, 0, nr_rotated);
    mem_cgroup_uncharge_list(&l_active);
    free_unref_page_list(&l_active);
    trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
            nr_deactivate, nr_rotated, sc->priority, file);
}

Now shrink_inactive_list:

static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
        struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru)
{
    LIST_HEAD(folio_list);
    unsigned long nr_scanned;
    unsigned int nr_reclaimed = 0;
    unsigned long nr_taken;
    struct reclaim_stat stat;
    bool file = is_file_lru(lru);
    enum vm_event_item item;
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    bool stalled = false;

    while (unlikely(too_many_isolated(pgdat, file, sc))) {
        if (stalled)
            return 0;

        /* wait a bit for the reclaimer. */
        stalled = true;
        reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

        /* We are about to die and free our memory. Return now. */
        if (fatal_signal_pending(current))
            return SWAP_CLUSTER_MAX;
    }

    lru_add_drain();

    spin_lock_irq(&lruvec->lru_lock);
    //isolate the LRU folios to be scanned onto folio_list
    nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
                     &nr_scanned, sc, lru);

    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
    item = PGSCAN_KSWAPD + reclaimer_offset();
    if (!cgroup_reclaim(sc))
        __count_vm_events(item, nr_scanned);
    __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
    __count_vm_events(PGSCAN_ANON + file, nr_scanned);

    spin_unlock_irq(&lruvec->lru_lock);

    if (nr_taken == 0)
        return 0;
    //reclaim the folios on folio_list and return how many pages were reclaimed
    nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);

    spin_lock_irq(&lruvec->lru_lock);
    //folios that were not reclaimed go back onto the LRU
    move_folios_to_lru(lruvec, &folio_list);

    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
    item = PGSTEAL_KSWAPD + reclaimer_offset();
    if (!cgroup_reclaim(sc))
        __count_vm_events(item, nr_reclaimed);
    __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
    __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);

    spin_unlock_irq(&lruvec->lru_lock);

    lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
    mem_cgroup_uncharge_list(&folio_list);
    //whatever is still left on folio_list is freed back to the buddy allocator
    free_unref_page_list(&folio_list);

    /*
     * If dirty folios are scanned that are not queued for IO, it
     * implies that flushers are not doing their job. This can
     * happen when memory pressure pushes dirty folios to the end of
     * the LRU before the dirty limits are breached and the dirty
     * data has expired. It can also happen when the proportion of
     * dirty folios grows not through writes but through memory
     * pressure reclaiming all the clean cache. And in some cases,
     * the flushers simply cannot keep up with the allocation
     * rate. Nudge the flusher threads in case they are asleep.
     */
    if (stat.nr_unqueued_dirty == nr_taken) {
        wakeup_flusher_threads(WB_REASON_VMSCAN);
        /*
         * For cgroupv1 dirty throttling is achieved by waking up
         * the kernel flusher here and later waiting on folios
         * which are in writeback to finish (see shrink_folio_list()).
         *
         * Flusher may not be able to issue writeback quickly
         * enough for cgroupv1 writeback throttling to work
         * on a large system.
         */
        if (!writeback_throttling_sane(sc))
            reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
    }

    sc->nr.dirty += stat.nr_dirty;
    sc->nr.congested += stat.nr_congested;
    sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
    sc->nr.writeback += stat.nr_writeback;
    sc->nr.immediate += stat.nr_immediate;
    sc->nr.taken += nr_taken;
    if (file)
        sc->nr.file_taken += nr_taken;

    trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
            nr_scanned, nr_reclaimed, &stat, sc->priority, file);
    return nr_reclaimed;
}

shrink_inactive_list calls shrink_folio_list to reclaim the pages.

A look at shrink_folio_list:

static unsigned int shrink_folio_list(struct list_head *folio_list,
        struct pglist_data *pgdat, struct scan_control *sc,
        struct reclaim_stat *stat, bool ignore_references)
{
    LIST_HEAD(ret_folios);
    LIST_HEAD(free_folios);
    LIST_HEAD(demote_folios);
    unsigned int nr_reclaimed = 0;
    unsigned int pgactivate = 0;
    bool do_demote_pass;
    struct swap_iocb *plug = NULL;

    memset(stat, 0, sizeof(*stat));
    cond_resched();
    do_demote_pass = can_demote(pgdat->node_id, sc);

retry:
//walk every folio on folio_list
while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) goto keep; VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); /* Account the number of base pages */ sc->nr_scanned += nr_pages; if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* folio_update_gen() tried to promote this page? */ if (lru_gen_enabled() && !ignore_references && folio_mapped(folio) && folio_test_referenced(folio)) goto keep_locked; /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) stat->nr_dirty += nr_pages; if (dirty && !writeback) stat->nr_unqueued_dirty += nr_pages; /* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time. */ if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs). In this case mark the folio for immediate * reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. 
*/ if (folio_test_writeback(folio)) { /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || !may_enter_fs(folio, sc->gfp_mask)) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ list_add_tail(&folio->lru, folio_list); continue; } } if (!ignore_references)
            //decide how this folio should be handled
            references = folio_check_references(folio, sc);

        switch (references) {
        case FOLIOREF_ACTIVATE:
            goto activate_locked;
        case FOLIOREF_KEEP:
            stat->nr_ref_keep += nr_pages;
            goto keep_locked;
        case FOLIOREF_RECLAIM:
        case FOLIOREF_RECLAIM_CLEAN:
            ; /* try to reclaim the folio below */
        }

        /*
         * Before reclaiming the folio, try to relocate
         * its contents to another node.
         */
        if (do_demote_pass &&
            (thp_migration_supported() || !folio_test_large(folio))) {
            //this folio can be demoted (migrated) to another node
            list_add(
&folio->lru, &demote_folios); folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (!can_split_folio(folio, NULL)) goto activate_locked; /* * Split folios without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; } } } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ if (split_folio_to_list(folio, folio_list)) goto keep_locked; } /* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then. */ if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio); if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(folio, flags); if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } /* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem. */ if (folio_maybe_dma_pinned(folio)) goto activate_locked; mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only * write folios when we've encountered many * dirty folios, and when we've already scanned * the rest of the LRU for clean folios and see * the same dirty folios again (with the reclaim * flag set). */ if (folio_is_file_lru(folio) && (!current_is_kswapd() || !folio_test_reclaim(folio) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); folio_set_reclaim(folio); goto activate_locked; } if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here. 
*/ try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio. */ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) || folio_test_writeback(folio)) goto keep_locked; mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: ; /* try to free the folio below */ } } /* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. * filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed += nr_pages; continue; } } } if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here. */ count_vm_events(PGLAZYFREED, nr_pages); count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; folio_unlock(folio); free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it. */ nr_reclaimed += nr_pages; /* * Is there need to periodically free_folio_list? It would * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) destroy_large_folio(folio); else
            //this folio is about to be freed, put it on the free_folios list
list_add(&folio->lru, &free_folios); continue; activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages. */ if (nr_pages > 1) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); folio_set_active(folio); stat->nr_activate[type] += nr_pages; count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: folio_unlock(folio); keep: list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ nr_reclaimed += demote_folio_list(&demote_folios, pgdat); /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); /* * goto retry to reclaim the undemoted folios in folio_list if * desired. * * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure. */ if (!sc->proactive) { do_demote_pass = false; goto retry; } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; mem_cgroup_uncharge_list(&free_folios); try_to_unmap_flush();
    //free the pages back to the buddy allocator -- this is the actual reclaim
    free_unref_page_list(&free_folios);

    list_splice(&ret_folios, folio_list);
    count_vm_events(PGACTIVATE, pgactivate);

    if (plug)
        swap_write_unplug(plug);
    return nr_reclaimed;
}

shrink_folio_list is a very complex function and I have not fully understood it yet; I will come back to it.

For now, the function that decides which folios actually get picked off the LRU for scanning is isolate_lru_folios:

static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
        struct lruvec *lruvec, struct list_head *dst,
        unsigned long *nr_scanned, struct scan_control *sc,
        enum lru_list lru)
{
    struct list_head *src = &lruvec->lists[lru];
    unsigned long nr_taken = 0;
    unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
    unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
    unsigned long skipped = 0;
    unsigned long scan, total_scan, nr_pages;
    LIST_HEAD(folios_skipped);

    total_scan = 0;
    scan = 0;
    while (scan < nr_to_scan && !list_empty(src)) {
        struct list_head *move_to = src;
        struct folio *folio;

        folio = lru_to_folio(src);
        prefetchw_prev_lru_folio(folio, src, flags);

        nr_pages = folio_nr_pages(folio);
        total_scan += nr_pages;

        if (folio_zonenum(folio) > sc->reclaim_idx ||
                skip_cma(folio, sc)) {
            nr_skipped[folio_zonenum(folio)] += nr_pages;
            move_to = &folios_skipped;
            goto move;
        }

        /*
         * Do not count skipped folios because that makes the function
         * return with no isolated folios if the LRU mostly contains
         * ineligible folios.  This causes the VM to not reclaim any
         * folios, triggering a premature OOM.
         * Account all pages in a folio.
         */
        scan += nr_pages;

        if (!folio_test_lru(folio))
            goto move;
        if (!sc->may_unmap && folio_mapped(folio))
            goto move;

        /*
         * Be careful not to clear the lru flag until after we're
         * sure the folio is not being freed elsewhere -- the
         * folio release code relies on it.
         */
        if (unlikely(!folio_try_get(folio)))
            goto move;

        if (!folio_test_clear_lru(folio)) {
            /* Another thread is already isolating this folio */
            folio_put(folio);
            goto move;
        }

        nr_taken += nr_pages;
        nr_zone_taken[folio_zonenum(folio)] += nr_pages;
        move_to = dst;
move:
        list_move(&folio->lru, move_to);
    }

    /*
     * Splice any skipped folios to the start of the LRU list. Note that
     * this disrupts the LRU order when reclaiming for lower zones but
     * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
     * scanning would soon rescan the same folios to skip and waste lots
     * of cpu cycles.
     */
    if (!list_empty(&folios_skipped)) {
        int zid;

        list_splice(&folios_skipped, src);
        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
            if (!nr_skipped[zid])
                continue;

            __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
            skipped += nr_skipped[zid];
        }
    }
    *nr_scanned = total_scan;
    trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
                    total_scan, skipped, nr_taken, lru);
    update_lru_sizes(lruvec, lru, nr_zone_taken);
    return nr_taken;
}

It walks the LRU list and moves folios that meet the requirements onto the destination list; later passes scan that list to pick out the pages to reclaim. The get_scan_count function computes how many pages each LRU list should scan.

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
               unsigned long *nr)
{
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);
    struct mem_cgroup *memcg = lruvec_memcg(lruvec);
    unsigned long anon_cost, file_cost, total_cost;
    int swappiness = mem_cgroup_swappiness(memcg);
    u64 fraction[ANON_AND_FILE];
    u64 denominator = 0;    /* gcc */
    enum scan_balance scan_balance;
    unsigned long ap, fp;
    enum lru_list lru;

    /* If we have no swap space, do not bother scanning anon folios. */
    if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    /*
     * Global reclaim will swap to prevent OOM even with no
     * swappiness, but memcg users want to use this knob to
     * disable swapping for individual groups completely when
     * using the memory controller's swap limit feature would be
     * too expensive.
     */
    if (cgroup_reclaim(sc) && !swappiness) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    /*
     * Do not apply any pressure balancing cleverness when the
     * system is close to OOM, scan both anon and file equally
     * (unless the swappiness setting disagrees with swapping).
     */
    if (!sc->priority && swappiness) {
        scan_balance = SCAN_EQUAL;
        goto out;
    }

    /*
     * If the system is almost out of file pages, force-scan anon.
     */
    if (sc->file_is_tiny) {
        scan_balance = SCAN_ANON;
        goto out;
    }

    /*
     * If there is enough inactive page cache, we do not reclaim
     * anything from the anonymous working right now.
     */
    if (sc->cache_trim_mode) {
        scan_balance = SCAN_FILE;
        goto out;
    }

    scan_balance = SCAN_FRACT;
    /*
     * Calculate the pressure balance between anon and file pages.
     *
     * The amount of pressure we put on each LRU is inversely
     * proportional to the cost of reclaiming each list, as
     * determined by the share of pages that are refaulting, times
     * the relative IO cost of bringing back a swapped out
     * anonymous page vs reloading a filesystem page (swappiness).
     *
     * Although we limit that influence to ensure no list gets
     * left behind completely: at least a third of the pressure is
     * applied, before swappiness.
     *
     * With swappiness at 100, anon and file have equal IO cost.
     */
    total_cost = sc->anon_cost + sc->file_cost;
    anon_cost = total_cost + sc->anon_cost;
    file_cost = total_cost + sc->file_cost;
    total_cost = anon_cost + file_cost;

    ap = swappiness * (total_cost + 1);
    ap /= anon_cost + 1;

    fp = (200 - swappiness) * (total_cost + 1);
    fp /= file_cost + 1;

    fraction[0] = ap;
    fraction[1] = fp;
    denominator = ap + fp;
out:
    for_each_evictable_lru(lru) {
        int file = is_file_lru(lru);
        unsigned long lruvec_size;
        unsigned long low, min;
        unsigned long scan;

        lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
        mem_cgroup_protection(sc->target_mem_cgroup, memcg,
                      &min, &low);

        if (min || low) {
            /*
             * Scale a cgroup's reclaim pressure by proportioning
             * its current usage to its memory.low or memory.min
             * setting.
             *
             * This is important, as otherwise scanning aggression
             * becomes extremely binary -- from nothing as we
             * approach the memory protection threshold, to totally
             * nominal as we exceed it.  This results in requiring
             * setting extremely liberal protection thresholds. It
             * also means we simply get no protection at all if we
             * set it too low, which is not ideal.
             *
             * If there is any protection in place, we reduce scan
             * pressure by how much of the total memory used is
             * within protection thresholds.
             *
             * There is one special case: in the first reclaim pass,
             * we skip over all groups that are within their low
             * protection. If that fails to reclaim enough pages to
             * satisfy the reclaim goal, we come back and override
             * the best-effort low protection. However, we still
             * ideally want to honor how well-behaved groups are in
             * that case instead of simply punishing them all
             * equally. As such, we reclaim them based on how much
             * memory they are using, reducing the scan pressure
             * again by how much of the total memory used is under
             * hard protection.
             */
            unsigned long cgroup_size = mem_cgroup_size(memcg);
            unsigned long protection;

            /* memory.low scaling, make sure we retry before OOM */
            if (!sc->memcg_low_reclaim && low > min) {
                protection = low;
                sc->memcg_low_skipped = 1;
            } else {
                protection = min;
            }

            /* Avoid TOCTOU with earlier protection check */
            cgroup_size = max(cgroup_size, protection);

            scan = lruvec_size - lruvec_size * protection /
                (cgroup_size + 1);

            /*
             * Minimally target SWAP_CLUSTER_MAX pages to keep
             * reclaim moving forwards, avoiding decrementing
             * sc->priority further than desirable.
             */
            scan = max(scan, SWAP_CLUSTER_MAX);
        } else {
            scan = lruvec_size;
        }

        scan >>= sc->priority;

        /*
         * If the cgroup's already been deleted, make sure to
         * scrape out the remaining cache.
         */
        if (!scan && !mem_cgroup_online(memcg))
            scan = min(lruvec_size, SWAP_CLUSTER_MAX);

        switch (scan_balance) {
        case SCAN_EQUAL:
            /* Scan lists relative to size */
            break;
        case SCAN_FRACT:
            /*
             * Scan types proportional to swappiness and
             * their relative recent reclaim efficiency.
             * Make sure we don't miss the last page on
             * the offlined memory cgroups because of a
             * round-off error.
             */
            scan = mem_cgroup_online(memcg) ?
                   div64_u64(scan * fraction[file], denominator) :
                   DIV64_U64_ROUND_UP(scan * fraction[file],
                          denominator);
            break;
        case SCAN_FILE:
        case SCAN_ANON:
            /* Scan one type exclusively */
            if ((scan_balance == SCAN_FILE) != file)
                scan = 0;
            break;
        default:
            /* Look ma, no brain */
            BUG();
        }

        nr[lru] = scan;
    }
}
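
One line in the loop after the out: label does a lot of work: scan >>= sc->priority. Priority starts at DEF_PRIORITY (12) and drops by one whenever a pass fails to make enough progress, so each retry scans a larger slice of the LRU. A tiny standalone sketch of that scaling (the LRU size is made up):

/*
 * Sketch of the priority scaling in get_scan_count(): scan = size >> priority.
 * DEF_PRIORITY is 12 in current kernels; the LRU size is an arbitrary example.
 */
#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
    unsigned long lruvec_size = 1UL << 20;    /* pretend 1M pages on this LRU */

    for (int priority = DEF_PRIORITY; priority >= 1; priority--)
        printf("priority %2d -> scan %lu pages\n",
               priority, lruvec_size >> priority);
    return 0;
}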

The key parameter in this calculation is swappiness. It ranges from 0 to 100 (recent kernels accept up to 200), defaults to 60, and the larger it is, the more anonymous pages get scanned; at 100, anonymous pages and file cache pages are treated as having equal IO cost. The value can be changed via /proc/sys/vm/swappiness.
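
To get a feel for the SCAN_FRACT arithmetic, here is a small userspace sketch that plugs made-up anon/file reclaim costs into the same formula as get_scan_count; only the formula itself comes from the kernel code above:

/*
 * Sketch of get_scan_count()'s SCAN_FRACT calculation. The anon/file cost
 * numbers are invented; the formula is the one from the function above.
 */
#include <stdio.h>

int main(void)
{
    unsigned long sc_anon_cost = 300, sc_file_cost = 700;    /* example costs */
    int swappiness = 60;                                     /* default value */
    unsigned long total_cost, anon_cost, file_cost, ap, fp;

    total_cost = sc_anon_cost + sc_file_cost;
    anon_cost = total_cost + sc_anon_cost;    /* guarantees >= 1/3 pressure each */
    file_cost = total_cost + sc_file_cost;
    total_cost = anon_cost + file_cost;

    ap = swappiness * (total_cost + 1) / (anon_cost + 1);
    fp = (200 - swappiness) * (total_cost + 1) / (file_cost + 1);

    printf("anon share: %lu%%, file share: %lu%%\n",
           ap * 100 / (ap + fp), fp * 100 / (ap + fp));
    return 0;
}

Changing swappiness or the cost split shifts the two shares accordingly.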

The analysis above followed the direct (synchronous) reclaim path; now let's look at the asynchronous path.

Asynchronous reclaim is done by the kswapd kernel thread; its initialization path is:

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_node_state(nid, N_MEMORY)
                //create one kswapd thread for each node
                kswapd_run(nid);
        return 0;
}

module_init(kswapd_init)

/*
 * This kswapd start function will be called by init and node-hot-add.
 */
void __meminit kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);

        pgdat_kswapd_lock(pgdat);
        if (!pgdat->kswapd) {
                //create and wake up the kswapd thread
                pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
                if (IS_ERR(pgdat->kswapd)) {
                        /* failure at boot is fatal */
                        pr_err("Failed to start kswapd on node %d,ret=%ld\n",
                                   nid, PTR_ERR(pgdat->kswapd));
                        BUG_ON(system_state < SYSTEM_RUNNING);
                        pgdat->kswapd = NULL;
                }
        }
        pgdat_kswapd_unlock(pgdat);
}

kswapd_wait is set up when the node (pgdat) is initialized.

void __init free_area_init(unsigned long *max_zone_pfn)
{
...
        for_each_node(nid) {
                pg_data_t *pgdat;

                if (!node_online(nid)) {
                     ...
                        free_area_init_node(nid);
...
}

static void __init free_area_init_node(int nid)
{
...
        free_area_init_core(pgdat);
        lru_gen_init_pgdat(pgdat);
}

static void __init free_area_init_core(struct pglist_data *pgdat)
{
        enum zone_type j;
        int nid = pgdat->node_id;

        pgdat_init_internals(pgdat);
...
}

 

static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
        int i;

        pgdat_resize_init(pgdat);
        pgdat_kswapd_lock_init(pgdat);

        pgdat_init_split_queue(pgdat);
        pgdat_init_kcompactd(pgdat);

        init_waitqueue_head(&pgdat->kswapd_wait);
        init_waitqueue_head(&pgdat->pfmemalloc_wait);

        for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
                init_waitqueue_head(&pgdat->reclaim_wait[i]);

        pgdat_page_ext_init(pgdat);
        lruvec_init(&pgdat->__lruvec);
}

So there is one kswapd_wait wait queue per node.
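
A quick userspace way to confirm the one-kswapd-per-node arrangement is to list the kernel threads whose name starts with "kswapd". This sketch is only an observation aid and assumes the usual kswapd<nid> naming:

/*
 * Userspace sketch: list kernel threads named "kswapd*" to confirm there is
 * one per NUMA node (kswapd0, kswapd1, ...).
 */
#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    DIR *proc = opendir("/proc");
    struct dirent *de;
    char path[64], comm[64];

    if (!proc) {
        perror("opendir /proc");
        return 1;
    }
    while ((de = readdir(proc))) {
        if (!isdigit((unsigned char)de->d_name[0]))
            continue;
        snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
        FILE *fp = fopen(path, "r");
        if (!fp)
            continue;
        if (fgets(comm, sizeof(comm), fp) && !strncmp(comm, "kswapd", 6))
            printf("pid %s: %s", de->d_name, comm);
        fclose(fp);
    }
    closedir(proc);
    return 0;
}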

But a wait queue head alone does nothing; the kswapd thread still has to add itself to that queue, which it does in its main loop below.

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
    unsigned int alloc_order, reclaim_order;
    unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
    pg_data_t *pgdat = (pg_data_t *)p;
    struct task_struct *tsk = current;
    const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

    if (!cpumask_empty(cpumask))
        set_cpus_allowed_ptr(tsk, cpumask);

    /*
     * Tell the memory management that we're a "memory allocator",
     * and that if we need more memory we should get access to it
     * regardless (see "__alloc_pages()"). "kswapd" should
     * never get caught in the normal page freeing logic.
     *
     * (Kswapd normally doesn't need memory anyway, but sometimes
     * you need a small amount of memory in order to be able to
     * page out something else, and this flag essentially protects
     * us from recursively trying to free more memory as we're
     * trying to free the first piece of memory in the first place).
     */
    tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
    set_freezable();

    WRITE_ONCE(pgdat->kswapd_order, 0);
    WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
    atomic_set(&pgdat->nr_writeback_throttled, 0);
    //the code above only runs once, the first time through
    for ( ; ; ) {
        //from here on kswapd stays in this loop
        bool ret;

        alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
        highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx);

kswapd_try_sleep:
        //put itself on the wait queue and schedule(), waiting to be woken up
        kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                    highest_zoneidx);

        /* Read the new order and highest_zoneidx */
        alloc_order = READ_ONCE(pgdat->kswapd_order);
        highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx);
        WRITE_ONCE(pgdat->kswapd_order, 0);
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

        ret = try_to_freeze();
        if (kthread_should_stop())
            break;

        /*
         * We can speed up thawing tasks if we don't call balance_pgdat
         * after returning from the refrigerator
         */
        if (ret)
            continue;

        /*
         * Reclaim begins at the requested order but if a high-order
         * reclaim fails then kswapd falls back to reclaiming for
         * order-0. If that happens, kswapd will consider sleeping
         * for the order it finished reclaiming at (reclaim_order)
         * but kcompactd is woken to compact for the original
         * request (alloc_order).
         */
        trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
                        alloc_order);
        //now do the real work
        reclaim_order = balance_pgdat(pgdat, alloc_order,
                        highest_zoneidx);
        if (reclaim_order < alloc_order)
            goto kswapd_try_sleep;
    }

    tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);

    return 0;
}

static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                unsigned int highest_zoneidx)
{
    long remaining = 0;
    DEFINE_WAIT(wait);

    if (freezing(current) || kthread_should_stop())
        return;
    //add itself to the kswapd_wait wait queue
    prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

    /*
     * Try to sleep for a short interval. Note that kcompactd will only be
     * woken if it is possible to sleep for a short interval. This is
     * deliberate on the assumption that if reclaim cannot keep an
     * eligible zone balanced that it's also unlikely that compaction will
     * succeed.
     */
    //sleep for a short while first
    if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
        /*
         * Compaction records what page blocks it recently failed to
         * isolate pages from and skips them in the future scanning.
         * When kswapd is going to sleep, it is reasonable to assume
         * that pages and compaction may succeed so reset the cache.
         */
        reset_isolation_suitable(pgdat);

        /*
         * We have freed the memory, now we should compact it to make
         * allocation of the requested order possible.
         */
        wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

        remaining = schedule_timeout(HZ/10);

        /*
         * If woken prematurely then reset kswapd_highest_zoneidx and
         * order. The values will either be from a wakeup request or
         * the previous request that slept prematurely.
         */
        if (remaining) {
            WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
                    kswapd_highest_zoneidx(pgdat,
                            highest_zoneidx));

            if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
                WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
        }

        finish_wait(&pgdat->kswapd_wait, &wait);
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
    }

    /*
     * After a short sleep, check if it was a premature sleep. If not, then
     * go fully to sleep until explicitly woken up.
     */
    if (!remaining &&
        prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
        trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

        /*
         * vmstat counters are not perfectly accurate and the estimated
         * value for counters such as NR_FREE_PAGES can deviate from the
         * true value by nr_online_cpus * threshold. To avoid the zone
         * watermarks being breached while under pressure, we reduce the
         * per-cpu vmstat threshold while kswapd is awake and restore
         * them before going back to sleep.
         */
        set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

        if (!kthread_should_stop())
            schedule();

        set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
    } else {
        if (remaining)
            count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
        else
            count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
    }
    finish_wait(&pgdat->kswapd_wait, &wait);
}

kswapd is normally woken up on the allocation path: alloc_pages() --> __alloc_pages() --> __alloc_pages_slowpath() --> wake_all_kswapds() --> wakeup_kswapd() (older kernels had __alloc_pages_nodemask() in this chain).

/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
                   enum zone_type highest_zoneidx)
{
        pg_data_t *pgdat;
        enum zone_type curr_idx;

        if (!managed_zone(zone))
                return;

        if (!cpuset_zone_allowed(zone, gfp_flags))
                return;
        pgdat = zone->zone_pgdat;
        curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

        if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

        if (READ_ONCE(pgdat->kswapd_order) < order)
                WRITE_ONCE(pgdat->kswapd_order, order);

        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;

        /* Hopeless node, leave it to direct reclaim if possible */
        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
            (pgdat_balanced(pgdat, order, highest_zoneidx) &&
             !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
                /*
                 * There may be plenty of free memory available, but it's too
                 * fragmented for high-order allocations.  Wake up kcompactd
                 * and rely on compaction_suitable() to determine if it's
                 * needed.  If it fails, it will defer subsequent attempts to
                 * ratelimit its work.
                 */
                if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
                        wakeup_kcompactd(pgdat, order, highest_zoneidx);
                return;
        }
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
                                      gfp_flags);
        wake_up_interruptible(&pgdat->kswapd_wait);
}
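
Whether this wakeup leads to real work depends on the per-zone watermarks that pgdat_balanced() compares free pages against. The sketch below simply dumps the min/low/high watermark lines from /proc/zoneinfo; the exact layout of that file varies between kernel versions, so treat it as illustrative only:

/*
 * Userspace sketch: print the min/low/high watermarks per zone from
 * /proc/zoneinfo. The file layout differs across kernel versions, so this
 * is illustrative only.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[256], key[32];
    unsigned long val;
    FILE *fp = fopen("/proc/zoneinfo", "r");

    if (!fp) {
        perror("fopen /proc/zoneinfo");
        return 1;
    }
    while (fgets(line, sizeof(line), fp)) {
        if (!strncmp(line, "Node", 4)) {
            printf("%s", line);            /* e.g. "Node 0, zone Normal" */
        } else if (sscanf(line, " %31s %lu", key, &val) == 2 &&
                   (!strcmp(key, "min") || !strcmp(key, "low") ||
                    !strcmp(key, "high"))) {
            printf("  %-4s %lu\n", key, val);
        }
    }
    fclose(fp);
    return 0;
}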

Next, balance_pgdat:

/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
    int i;
    unsigned long nr_soft_reclaimed;
    unsigned long nr_soft_scanned;
    unsigned long pflags;
    unsigned long nr_boost_reclaim;
    unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
    bool boosted;
    struct zone *zone;
    //the scan_control structure that drives this round of reclaim
    struct scan_control sc = {
        .gfp_mask = GFP_KERNEL,
        .order = order,
        .may_unmap = 1,
    };

    //point current->reclaim_state at sc.reclaim_state
    set_task_reclaim_state(current, &sc.reclaim_state);
    psi_memstall_enter(&pflags);
    __fs_reclaim_acquire(_THIS_IP_);

    count_vm_event(PAGEOUTRUN);

    /*
     * Account for the reclaim boost. Note that the zone boost is left in
     * place so that parallel allocations that are near the watermark will
     * stall or direct reclaim until kswapd is finished.
     */
    nr_boost_reclaim = 0;
    for (i = 0; i <= highest_zoneidx; i++) {
        zone = pgdat->node_zones + i;
        if (!managed_zone(zone))
            continue;

        nr_boost_reclaim += zone->watermark_boost;
        zone_boosts[i] = zone->watermark_boost;
    }
    boosted = nr_boost_reclaim;

restart:
    //set ZONE_RECLAIM_ACTIVE in the flags of every zone of this node up to highest_zoneidx
    set_reclaim_active(pgdat, highest_zoneidx);
    //priority controls how many pages get scanned: nr_pages = lru_pages >> priority
    sc.priority = DEF_PRIORITY;    // 12
    do {
        unsigned long nr_reclaimed = sc.nr_reclaimed;
        bool raise_priority = true;
        bool balanced;
        bool ret;

        sc.reclaim_idx = highest_zoneidx;

        /*
         * If the number of buffer_heads exceeds the maximum allowed
         * then consider reclaiming from all zones. This has a dual
         * purpose -- on 64-bit systems it is expected that
         * buffer_heads are stripped during active rotation. On 32-bit
         * systems, highmem pages can pin lowmem memory and shrinking
         * buffers can relieve lowmem pressure. Reclaim may still not
         * go ahead if all eligible zones for the original allocation
         * request are balanced to avoid excessive reclaim from kswapd.
         */
        if (buffer_heads_over_limit) {
            for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
                zone = pgdat->node_zones + i;
                if (!managed_zone(zone))
                    continue;

                sc.reclaim_idx = i;
                break;
            }
        }

        /*
         * If the pgdat is imbalanced then ignore boosting and preserve
         * the watermarks for a later time and restart. Note that the
         * zone watermarks will be still reset at the end of balancing
         * on the grounds that the normal reclaim should be enough to
         * re-evaluate if boosting is required when kswapd next wakes.
         */
        //is there a zone with enough free pages: free pages > zone->wmark[high] and a 2^order block available
        balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
        if (!balanced && nr_boost_reclaim) {
            nr_boost_reclaim = 0;
            goto restart;
        }

        /*
         * If boosting is not active then only reclaim if there are no
         * eligible zones. Note that sc.reclaim_idx is not used as
         * buffer_heads_over_limit may have adjusted it.
         */
        //no boost and already balanced, so nothing needs to be reclaimed
        if (!nr_boost_reclaim && balanced)
            goto out;

        /* Limit the priority of boosting to avoid reclaim writeback */
        if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
            raise_priority = false;

        /*
         * Do not writeback or swap pages for boosted reclaim. The
         * intent is to relieve pressure not issue sub-optimal IO
         * from reclaim context. If no pages are reclaimed, the
         * reclaim will be aborted.
         */
        sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
        sc.may_swap = !nr_boost_reclaim;

        /*
         * Do some background aging, to give pages a chance to be
         * referenced before reclaiming. All pages are rotated
         * regardless of classzone as this is about consistent aging.
         */
        //not entirely clear to me yet what this buys us
        kswapd_age_node(pgdat, &sc);

        /*
         * If we're getting trouble reclaiming, start doing writepage
         * even in laptop mode.
         */
        if (sc.priority < DEF_PRIORITY - 2)
            sc.may_writepage = 1;

        /* Call soft limit reclaim before calling shrink_node. */
        sc.nr_scanned = 0;
        nr_soft_scanned = 0;
        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
                        sc.gfp_mask, &nr_soft_scanned);
        sc.nr_reclaimed += nr_soft_reclaimed;

        /*
         * There should be no need to raise the scanning priority if
         * enough pages are already being scanned that that high
         * watermark would be met at 100% efficiency.
         */
        //enter the reclaim path
        if (kswapd_shrink_node(pgdat, &sc))
            raise_priority = false;

        /*
         * If the low watermark is met there is no need for processes
         * to be throttled on pfmemalloc_wait as they should not be
         * able to safely make forward progress. Wake them
         */
        //wake up waiters on pfmemalloc_wait (tasks throttled in direct reclaim)
        if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                allow_direct_reclaim(pgdat))
            wake_up_all(&pgdat->pfmemalloc_wait);

        /* Check if kswapd should be suspending */
        __fs_reclaim_release(_THIS_IP_);
        ret = try_to_freeze();
        __fs_reclaim_acquire(_THIS_IP_);
        if (ret || kthread_should_stop())
            break;

        /*
         * Raise priority if scanning rate is too low or there was no
         * progress in reclaiming pages
         */
        nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
        nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

        /*
         * If reclaim made no progress for a boost, stop reclaim as
         * IO cannot be queued and it could be an infinite loop in
         * extreme circumstances.
         */
        //nothing could be squeezed out, give up
        if (nr_boost_reclaim && !nr_reclaimed)
            break;

        if (raise_priority || !nr_reclaimed)
            sc.priority--;    //each drop in priority scans more pages
    } while (sc.priority >= 1);

    //getting here without reclaiming anything means this pass failed; record it in the node
    if (!sc.nr_reclaimed)
        pgdat->kswapd_failures++;

out:
    clear_reclaim_active(pgdat, highest_zoneidx);

    /* If reclaim was boosted, account for the reclaim done in this pass */
    if (boosted) {
        unsigned long flags;

        for (i = 0; i <= highest_zoneidx; i++) {
            if (!zone_boosts[i])
                continue;

            /* Increments are under the zone lock */
            zone = pgdat->node_zones + i;
            spin_lock_irqsave(&zone->lock, flags);
            zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
            spin_unlock_irqrestore(&zone->lock, flags);
        }

        /*
         * As there is now likely space, wakeup kcompact to defragment
         * pageblocks.
         */
        wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
    }

    snapshot_refaults(NULL, pgdat);
    __fs_reclaim_release(_THIS_IP_);
    psi_memstall_leave(&pflags);
    set_task_reclaim_state(current, NULL);

    /*
     * Return the order kswapd stopped reclaiming at as
     * prepare_kswapd_sleep() takes it into account. If another caller
     * entered the allocator slow path while kswapd was awake, order will
     * remain at the higher level.
     */
    return sc.order;
}

kswapd_shrink_node is a thin wrapper around shrink_node.

static bool kswapd_shrink_node(pg_data_t *pgdat,
                   struct scan_control *sc)
{
    struct zone *zone;
    int z;

    /* Reclaim a number of pages proportional to the number of zones */
    sc->nr_to_reclaim = 0;
    for (z = 0; z <= sc->reclaim_idx; z++) {
        zone = pgdat->node_zones + z;
        if (!managed_zone(zone))
            continue;
        //several different knobs govern how many pages to reclaim here; easy to confuse them
        sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
    }

    /*
     * Historically care was taken to put equal pressure on all zones but
     * now pressure is applied based on node LRU order.
     */
    //the reclaim path
    shrink_node(pgdat, sc);

    /*
     * Fragmentation may mean that the system cannot be rebalanced for
     * high-order allocations. If twice the allocation size has been
     * reclaimed then recheck watermarks only at order-0 to prevent
     * excessive reclaim. Assume that a process requested a high-order
     * can direct reclaim/compact.
     */
    if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
        sc->order = 0;

    //did we scan at least as many pages as we set out to reclaim?
    return sc->nr_scanned >= sc->nr_to_reclaim;
}

Inside shrink_node_memcgs, besides shrink_lruvec there is also shrink_slab, which we have not covered yet.

static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
    struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
    struct mem_cgroup *memcg;

    memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
    do {
...        
        shrink_lruvec(lruvec, sc);

        shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
                sc->priority);
...

    } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}

Now shrink_slab:

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
              int priority)
{
    unsigned long ret, freed = 0;
    struct shrinker *shrinker;

    /*
     * The root memcg might be allocated even though memcg is disabled
     * via "cgroup_disable=memory" boot parameter.  This could make
     * mem_cgroup_is_root() return false, then just run memcg slab
     * shrink, but skip global shrink.  This may result in premature
     * oom.
     */
    if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
        return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

    /*
     * lockless algorithm of global shrink.
     *
     * In the unregistration step, the shrinker will be freed asynchronously
     * via RCU after its refcount reaches 0. So both rcu_read_lock() and
     * shrinker_try_get() can be used to ensure the existence of the shrinker.
     *
     * So in the global shrink:
     *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
     *          and the validity of the shrinker_list walk.
     *  step 2: use shrinker_try_get() to try get the refcount, if successful,
     *          then the existence of the shrinker can also be guaranteed,
     *          so we can release the RCU lock to do do_shrink_slab() that
     *          may sleep.
     *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
     *          which ensures that neither this shrinker nor the next shrinker
     *          will be freed in the next traversal operation.
     *  step 4: do shrinker_put() paired with step 2 to put the refcount,
     *          if the refcount reaches 0, then wake up the waiter in
     *          shrinker_free() by calling complete().
     */
    rcu_read_lock();
    //walk the global shrinker_list
    list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
        struct shrink_control sc = {
            .gfp_mask = gfp_mask,
            .nid = nid,
            .memcg = memcg,
        };

        if (!shrinker_try_get(shrinker))
            continue;

        rcu_read_unlock();

        //shrink the slab cache behind this shrinker
        ret = do_shrink_slab(&sc, shrinker, priority);
        if (ret == SHRINK_EMPTY)
            ret = 0;
        freed += ret;

        rcu_read_lock();
        shrinker_put(shrinker);
    }

    rcu_read_unlock();
    cond_resched();
    return freed;
}

Many subsystems register a shrinker onto shrinker_list via shrinker_register(); shrink_slab walks shrinker_list and uses do_shrink_slab to reclaim from each cache.

/*
 * A callback you can register to apply pressure to ageable caches.
 *
 * @count_objects should return the number of freeable items in the cache. If
 * there are no objects to free, it should return SHRINK_EMPTY, while 0 is
 * returned in cases of the number of freeable items cannot be determined
 * or shrinker should skip this cache for this time (e.g., their number
 * is below shrinkable limit). No deadlock checks should be done during the
 * count callback - the shrinker relies on aggregating scan counts that couldn't
 * be executed due to potential deadlocks to be run at a later call when the
 * deadlock condition is no longer pending.
 *
 * @scan_objects will only be called if @count_objects returned a non-zero
 * value for the number of freeable objects. The callout should scan the cache
 * and attempt to free items from the cache. It should then return the number
 * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
 * due to potential deadlocks. If SHRINK_STOP is returned, then no further
 * attempts to call the @scan_objects will be made from the current reclaim
 * context.
 *
 * @flags determine the shrinker abilities, like numa awareness
 */
struct shrinker {
    unsigned long (*count_objects)(struct shrinker *,
                       struct shrink_control *sc);
    unsigned long (*scan_objects)(struct shrinker *,
                      struct shrink_control *sc);

    long batch;    /* reclaim batch size, 0 = default */
    int seeks;    /* seeks to recreate an obj */
    unsigned flags;

    /*
     * The reference count of this shrinker. Registered shrinker have an
     * initial refcount of 1, then the lookup operations are now allowed
     * to use it via shrinker_try_get(). Later in the unregistration step,
     * the initial refcount will be discarded, and will free the shrinker
     * asynchronously via RCU after its refcount reaches 0.
     */
    refcount_t refcount;
    struct completion done;    /* use to wait for refcount to reach 0 */
    struct rcu_head rcu;

    void *private_data;

    /* These are for internal use */
    struct list_head list;
#ifdef CONFIG_MEMCG
    /* ID in shrinker_idr */
    int id;
#endif
#ifdef CONFIG_SHRINKER_DEBUG
    int debugfs_id;
    const char *name;
    struct dentry *debugfs_entry;
#endif
    /* objs pending delete, per node */
    atomic_long_t *nr_deferred;
};

As you can see, the shrinker structure is a set of callbacks plus a few parameters for one cache: count_objects returns the number of freeable objects, and scan_objects actually reclaims from the cache.
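
To make the registration side concrete, here is a hedged sketch of a minimal kernel module that registers a shrinker through shrinker_alloc()/shrinker_register(), matching the refcounted struct shrinker shown above (kernels around 6.7 and later). The "demo cache" is just an atomic counter standing in for a real object pool; this is an illustration of the interface, not code from any real subsystem:

/*
 * Hedged sketch of registering a shrinker with the shrinker_alloc()/
 * shrinker_register() interface (6.7+). "demo_cached" stands in for a real
 * pool of reclaimable objects; the names and numbers are invented.
 */
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>
#include <linux/minmax.h>

static atomic_long_t demo_cached = ATOMIC_LONG_INIT(1024);  /* pretend cache */
static struct shrinker *demo_shrinker;

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
    unsigned long n = atomic_long_read(&demo_cached);

    return n ? n : SHRINK_EMPTY;    /* nothing freeable right now */
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
    unsigned long nr = min_t(unsigned long, sc->nr_to_scan,
                             atomic_long_read(&demo_cached));

    /* a real shrinker would free 'nr' objects from its cache here */
    atomic_long_sub(nr, &demo_cached);
    return nr;                      /* number of objects actually freed */
}

static int __init demo_shrinker_init(void)
{
    demo_shrinker = shrinker_alloc(0, "demo-shrinker");
    if (!demo_shrinker)
        return -ENOMEM;

    demo_shrinker->count_objects = demo_count;
    demo_shrinker->scan_objects = demo_scan;
    shrinker_register(demo_shrinker);    /* now reachable from shrinker_list */
    return 0;
}

static void __exit demo_shrinker_exit(void)
{
    shrinker_free(demo_shrinker);        /* unregister; freed via RCU */
}

module_init(demo_shrinker_init);
module_exit(demo_shrinker_exit);
MODULE_LICENSE("GPL");

When memory pressure builds, shrink_slab() ends up calling demo_count() and then demo_scan() with sc->nr_to_scan set to the batch it wants freed.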

 

That is as far as this analysis of page reclaim goes for now; it is quite brief and many details remain unclear.

 

posted on 2024-06-18 12:43  半山随笔