Linux Memory Management (10): Page Reclaim (2)


This post looks at how the kernel triggers page reclaim.

Reclaim can be triggered in two ways: synchronously and asynchronously. When alloc_pages() finds memory short while allocating, it reclaims memory itself - that is synchronous (direct) reclaim. The kernel also runs one or more kswapd kernel threads that reclaim memory in the background - that is asynchronous reclaim.
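A quick way to see both paths in action is to compare the reclaim counters the kernel exports in /proc/vmstat: pgscan_kswapd/pgsteal_kswapd count kswapd (asynchronous) reclaim, while pgscan_direct/pgsteal_direct count direct (synchronous) reclaim. Here is a small userspace reader (just an aside, not part of the kernel code discussed below):

#include <stdio.h>
#include <string.h>

/* Print the /proc/vmstat counters that distinguish kswapd (asynchronous)
 * reclaim from direct (synchronous) reclaim. */
int main(void)
{
    static const char *keys[] = {
        "pgscan_kswapd", "pgsteal_kswapd",
        "pgscan_direct", "pgsteal_direct",
    };
    char name[64];
    unsigned long long val;
    FILE *f = fopen("/proc/vmstat", "r");

    if (!f) {
        perror("/proc/vmstat");
        return 1;
    }
    while (fscanf(f, "%63s %llu", name, &val) == 2) {
        for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
            if (!strcmp(name, keys[i]))
                printf("%-16s %llu\n", name, val);
    }
    fclose(f);
    return 0;
}

If pgscan_direct keeps growing, allocations are routinely having to reclaim memory themselves, i.e. kswapd is not keeping up.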

Let's start with shrink_lruvec.

static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{unsigned long nr[NR_LRU_LISTS];unsigned long targets[NR_LRU_LISTS];unsigned long nr_to_scan;enum lru_list lru;unsigned long nr_reclaimed = 0;unsigned long nr_to_reclaim = sc->nr_to_reclaim;bool proportional_reclaim;struct blk_plug plug;if (lru_gen_enabled() && !root_reclaim(sc)) {lru_gen_shrink_lruvec(lruvec, sc);return;}//计算各个lru链表需要扫描的page个数get_scan_count(lruvec, sc, nr);/* Record the original scan target for proportional adjustments later */memcpy(targets, nr, sizeof(nr));/** Global reclaiming within direct reclaim at DEF_PRIORITY is a normal* event that can occur when there is little memory pressure e.g.* multiple streaming readers/writers. Hence, we do not abort scanning* when the requested number of pages are reclaimed when scanning at* DEF_PRIORITY on the assumption that the fact we are direct* reclaiming implies that kswapd is not keeping up and it is best to* do a batch of work at once. For memcg reclaim one check is made to* abort proportional reclaim if either the file or anon lru has already* dropped to zero at the first pass.*/proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&sc->priority == DEF_PRIORITY);blk_start_plug(&plug);while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||          //active file也要被回收nr[LRU_INACTIVE_FILE]) {unsigned long nr_anon, nr_file, percentage;unsigned long nr_scanned;for_each_evictable_lru(lru) {if (nr[lru]) {nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);nr[lru] -= nr_to_scan;//shrink_list会调用shrink active list或者shrink inactive listnr_reclaimed += shrink_list(lru, nr_to_scan,lruvec, sc);}}cond_resched();if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)continue;/** For kswapd and memcg, reclaim at least the number of pages* requested. Ensure that the anon and file LRUs are scanned* proportionally what was requested by get_scan_count(). We* stop reclaiming one LRU and reduce the amount scanning* proportional to the original scan target.*/nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];/** It's just vindictive to attack the larger once the smaller* has gone to zero.  And given the way we stop scanning the* smaller below, this makes sure that we only make one nudge* towards proportionality once we've got nr_to_reclaim.*/if (!nr_file || !nr_anon)break;if (nr_file > nr_anon) {unsigned long scan_target = targets[LRU_INACTIVE_ANON] +targets[LRU_ACTIVE_ANON] + 1;lru = LRU_BASE;percentage = nr_anon * 100 / scan_target;} else {unsigned long scan_target = targets[LRU_INACTIVE_FILE] +targets[LRU_ACTIVE_FILE] + 1;lru = LRU_FILE;percentage = nr_file * 100 / scan_target;}/* Stop scanning the smaller of the LRU */nr[lru] = 0;nr[lru + LRU_ACTIVE] = 0;/** Recalculate the other LRU scan count based on its original* scan target and the percentage scanning already complete*/lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;nr_scanned = targets[lru] - nr[lru];nr[lru] = targets[lru] * (100 - percentage) / 100;nr[lru] -= min(nr[lru], nr_scanned);lru += LRU_ACTIVE;nr_scanned = targets[lru] - nr[lru];nr[lru] = targets[lru] * (100 - percentage) / 100;nr[lru] -= min(nr[lru], nr_scanned);}blk_finish_plug(&plug);sc->nr_reclaimed += nr_reclaimed;/** Even if we did not try to evict anon pages at all, we want to* rebalance the anon lru active/inactive ratio.*/if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&inactive_is_low(lruvec, LRU_INACTIVE_ANON))shrink_active_list(SWAP_CLUSTER_MAX, lruvec,sc, LRU_ACTIVE_ANON);
}
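shrink_lruvec spreads the scan over the four evictable LRU lists (inactive/active anon and inactive/active file). Their current sizes are visible in /proc/meminfo; the following userspace snippet (an aside, not kernel code) simply prints them:

#include <stdio.h>
#include <string.h>

/* Show the sizes of the four evictable LRU lists that shrink_lruvec()
 * distributes scan pressure over, as reported by /proc/meminfo. */
int main(void)
{
    static const char *keys[] = {
        "Inactive(anon):", "Active(anon):",
        "Inactive(file):", "Active(file):",
    };
    char line[128];
    FILE *f = fopen("/proc/meminfo", "r");

    if (!f) {
        perror("/proc/meminfo");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
            if (!strncmp(line, keys[i], strlen(keys[i])))
                fputs(line, stdout);
    }
    fclose(f);
    return 0;
}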

Now look at shrink_list.

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
{
    if (is_active_lru(lru)) {
        if (sc->may_deactivate & (1 << is_file_lru(lru)))
            shrink_active_list(nr_to_scan, lruvec, sc, lru);
        else
            sc->skipped_deactivate = 1;
        return 0;
    }

    return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

If the list is an active LRU and deactivating it is allowed (sc->may_deactivate), shrink_active_list is called; inactive lists go to shrink_inactive_list.

/** shrink_active_list() moves folios from the active LRU to the inactive LRU.** We move them the other way if the folio is referenced by one or more* processes.** If the folios are mostly unmapped, the processing is fast and it is* appropriate to hold lru_lock across the whole operation.  But if* the folios are mapped, the processing is slow (folio_referenced()), so* we should drop lru_lock around each folio.  It's impossible to balance* this, so instead we remove the folios from the LRU while processing them.* It is safe to rely on the active flag against the non-LRU folios in here* because nobody will play with that bit on a non-LRU folio.** The downside is that we have to touch folio->_refcount against each folio.* But we had to alter folio->flags anyway.*/
static void shrink_active_list(unsigned long nr_to_scan,struct lruvec *lruvec,struct scan_control *sc,enum lru_list lru)
{unsigned long nr_taken;unsigned long nr_scanned;unsigned long vm_flags;LIST_HEAD(l_hold);    /* The folios which were snipped off */LIST_HEAD(l_active);LIST_HEAD(l_inactive);unsigned nr_deactivate, nr_activate;unsigned nr_rotated = 0;int file = is_file_lru(lru);struct pglist_data *pgdat = lruvec_pgdat(lruvec);//排空lru per cpu cachelru_add_drain();spin_lock_irq(&lruvec->lru_lock);//把要扫描的页面先从lru上分离到l_hold中备用,我觉得这是为了减少对lru锁的使用时长nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,&nr_scanned, sc, lru);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);if (!cgroup_reclaim(sc))__count_vm_events(PGREFILL, nr_scanned);__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);spin_unlock_irq(&lruvec->lru_lock);//遍历l_holdwhile (!list_empty(&l_hold)) {struct folio *folio;cond_resched();folio = lru_to_folio(&l_hold);list_del(&folio->lru);if (unlikely(!folio_evictable(folio))) {folio_putback_lru(folio);continue;}//容我日后分析if (unlikely(buffer_heads_over_limit)) {if (folio_needs_release(folio) &&folio_trylock(folio)) {filemap_release_folio(folio, 0);folio_unlock(folio);}}/* Referenced or rmap lock contention: rotate */if (folio_referenced(folio, 0, sc->target_mem_cgroup,&vm_flags) != 0) {/** Identify referenced, file-backed active folios and* give them one more trip around the active list. So* that executable code get better chances to stay in* memory under moderate memory pressure.  Anon folios* are not likely to be evicted by use-once streaming* IO, plus JVM can create lots of anon VM_EXEC folios,* so we ignore them here.*/
//if an executable file-backed folio has been referenced, put it back on the active list
if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {nr_rotated += folio_nr_pages(folio);list_add(&folio->lru, &l_active);continue;}}//将folio的active标志清掉folio_clear_active(folio); /* we are de-activating */folio_set_workingset(folio);
//add the folio to the temporary inactive list
list_add(
&folio->lru, &l_inactive);}/** Move folios back to the lru list.*/spin_lock_irq(&lruvec->lru_lock);nr_activate = move_folios_to_lru(lruvec, &l_active);nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);/* Keep all free folios in l_active list */list_splice(&l_inactive, &l_active);__count_vm_events(PGDEACTIVATE, nr_deactivate);__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);spin_unlock_irq(&lruvec->lru_lock);if (nr_rotated)lru_note_cost(lruvec, file, 0, nr_rotated);mem_cgroup_uncharge_list(&l_active);free_unref_page_list(&l_active);trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,nr_deactivate, nr_rotated, sc->priority, file); }

Now look at shrink_inactive_list.

static unsigned long shrink_inactive_list(unsigned long nr_to_scan,struct lruvec *lruvec, struct scan_control *sc,enum lru_list lru)
{LIST_HEAD(folio_list);unsigned long nr_scanned;unsigned int nr_reclaimed = 0;unsigned long nr_taken;struct reclaim_stat stat;bool file = is_file_lru(lru);enum vm_event_item item;struct pglist_data *pgdat = lruvec_pgdat(lruvec);bool stalled = false;while (unlikely(too_many_isolated(pgdat, file, sc))) {if (stalled)return 0;/* wait a bit for the reclaimer. */stalled = true;reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);/* We are about to die and free our memory. Return now. */if (fatal_signal_pending(current))return SWAP_CLUSTER_MAX;}lru_add_drain();spin_lock_irq(&lruvec->lru_lock);//分离要扫描的lru folio到folio_listnr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,&nr_scanned, sc, lru);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);item = PGSCAN_KSWAPD + reclaimer_offset();if (!cgroup_reclaim(sc))__count_vm_events(item, nr_scanned);__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);__count_vm_events(PGSCAN_ANON + file, nr_scanned);spin_unlock_irq(&lruvec->lru_lock);if (nr_taken == 0)return 0;//回收folio_list里面的folio,返回回收的page数量nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);spin_lock_irq(&lruvec->lru_lock);
//folios that were not reclaimed are put back on the LRU
move_folios_to_lru(lruvec,
&folio_list);__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);item = PGSTEAL_KSWAPD + reclaimer_offset();if (!cgroup_reclaim(sc))__count_vm_events(item, nr_reclaimed);__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);spin_unlock_irq(&lruvec->lru_lock);lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);mem_cgroup_uncharge_list(&folio_list);
//whatever is still left on folio_list is freed back to the buddy system
free_unref_page_list(
&folio_list);/** If dirty folios are scanned that are not queued for IO, it* implies that flushers are not doing their job. This can* happen when memory pressure pushes dirty folios to the end of* the LRU before the dirty limits are breached and the dirty* data has expired. It can also happen when the proportion of* dirty folios grows not through writes but through memory* pressure reclaiming all the clean cache. And in some cases,* the flushers simply cannot keep up with the allocation* rate. Nudge the flusher threads in case they are asleep.*/if (stat.nr_unqueued_dirty == nr_taken) {wakeup_flusher_threads(WB_REASON_VMSCAN);/** For cgroupv1 dirty throttling is achieved by waking up* the kernel flusher here and later waiting on folios* which are in writeback to finish (see shrink_folio_list()).** Flusher may not be able to issue writeback quickly* enough for cgroupv1 writeback throttling to work* on a large system.*/if (!writeback_throttling_sane(sc))reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);}sc->nr.dirty += stat.nr_dirty;sc->nr.congested += stat.nr_congested;sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;sc->nr.writeback += stat.nr_writeback;sc->nr.immediate += stat.nr_immediate;sc->nr.taken += nr_taken;if (file)sc->nr.file_taken += nr_taken;trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,nr_scanned, nr_reclaimed, &stat, sc->priority, file);return nr_reclaimed; }

shrink_inactive_list calls shrink_folio_list to do the actual page reclaim.

Let's take a look at shrink_folio_list.

static unsigned int shrink_folio_list(struct list_head *folio_list,struct pglist_data *pgdat, struct scan_control *sc,struct reclaim_stat *stat, bool ignore_references)
{LIST_HEAD(ret_folios);LIST_HEAD(free_folios);LIST_HEAD(demote_folios);unsigned int nr_reclaimed = 0;unsigned int pgactivate = 0;bool do_demote_pass;struct swap_iocb *plug = NULL;memset(stat, 0, sizeof(*stat));cond_resched();do_demote_pass = can_demote(pgdat->node_id, sc);retry:
//walk folio_list
while (!list_empty(folio_list)) {struct address_space *mapping;struct folio *folio;enum folio_references references = FOLIOREF_RECLAIM;bool dirty, writeback;unsigned int nr_pages;cond_resched();folio = lru_to_folio(folio_list);list_del(&folio->lru);if (!folio_trylock(folio))goto keep;VM_BUG_ON_FOLIO(folio_test_active(folio), folio);nr_pages = folio_nr_pages(folio);/* Account the number of base pages */sc->nr_scanned += nr_pages;if (unlikely(!folio_evictable(folio)))goto activate_locked;if (!sc->may_unmap && folio_mapped(folio))goto keep_locked;/* folio_update_gen() tried to promote this page? */if (lru_gen_enabled() && !ignore_references &&folio_mapped(folio) && folio_test_referenced(folio))goto keep_locked;/** The number of dirty pages determines if a node is marked* reclaim_congested. kswapd will stall and start writing* folios if the tail of the LRU is all dirty unqueued folios.*/folio_check_dirty_writeback(folio, &dirty, &writeback);if (dirty || writeback)stat->nr_dirty += nr_pages;if (dirty && !writeback)stat->nr_unqueued_dirty += nr_pages;/** Treat this folio as congested if folios are cycling* through the LRU so quickly that the folios marked* for immediate reclaim are making it to the end of* the LRU a second time.*/if (writeback && folio_test_reclaim(folio))stat->nr_congested += nr_pages;/** If a folio at the tail of the LRU is under writeback, there* are three cases to consider.** 1) If reclaim is encountering an excessive number* of folios under writeback and this folio has both* the writeback and reclaim flags set, then it* indicates that folios are being queued for I/O but* are being recycled through the LRU before the I/O* can complete. Waiting on the folio itself risks an* indefinite stall if it is impossible to writeback* the folio due to I/O error or disconnected storage* so instead note that the LRU is being scanned too* quickly and the caller can stall after the folio* list has been processed.** 2) Global or new memcg reclaim encounters a folio that is* not marked for immediate reclaim, or the caller does not* have __GFP_FS (or __GFP_IO if it's simply going to swap,* not to fs). In this case mark the folio for immediate* reclaim and continue scanning.** Require may_enter_fs() because we would wait on fs, which* may not have submitted I/O yet. And the loop driver might* enter reclaim, and deadlock if it waits on a folio for* which it is needed to do the write (loop masks off* __GFP_IO|__GFP_FS for this reason); but more thought* would probably show more reasons.** 3) Legacy memcg encounters a folio that already has the* reclaim flag set. memcg does not have any dirty folio* throttling so we could easily OOM just because too many* folios are in writeback and there is nothing else to* reclaim. Wait for the writeback to complete.** In cases 1) and 2) we activate the folios to get them out of* the way while we continue scanning for clean folios on the* inactive list and refilling from the active list. 
The* observation here is that waiting for disk writes is more* expensive than potentially causing reloads down the line.* Since they're marked for immediate reclaim, they won't put* memory pressure on the cache working set any longer than it* takes to write them to disk.*/if (folio_test_writeback(folio)) {/* Case 1 above */if (current_is_kswapd() &&folio_test_reclaim(folio) &&test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {stat->nr_immediate += nr_pages;goto activate_locked;/* Case 2 above */} else if (writeback_throttling_sane(sc) ||!folio_test_reclaim(folio) ||!may_enter_fs(folio, sc->gfp_mask)) {/** This is slightly racy -* folio_end_writeback() might have* just cleared the reclaim flag, then* setting the reclaim flag here ends up* interpreted as the readahead flag - but* that does not matter enough to care.* What we do want is for this folio to* have the reclaim flag set next time* memcg reclaim reaches the tests above,* so it will then wait for writeback to* avoid OOM; and it's also appropriate* in global reclaim.*/folio_set_reclaim(folio);stat->nr_writeback += nr_pages;goto activate_locked;/* Case 3 above */} else {folio_unlock(folio);folio_wait_writeback(folio);/* then go back and try same folio again */list_add_tail(&folio->lru, folio_list);continue;}}if (!ignore_references)
//decide how the current folio should be handled
references
= folio_check_references(folio, sc);switch (references) {case FOLIOREF_ACTIVATE:goto activate_locked;case FOLIOREF_KEEP:stat->nr_ref_keep += nr_pages;goto keep_locked;case FOLIOREF_RECLAIM:case FOLIOREF_RECLAIM_CLEAN:; /* try to reclaim the folio below */}/** Before reclaiming the folio, try to relocate* its contents to another node.*/if (do_demote_pass &&(thp_migration_supported() || !folio_test_large(folio))) {
//the folio can be demoted (migrated) to another node
list_add(
&folio->lru, &demote_folios);folio_unlock(folio);continue;}/** Anonymous process memory has backing store?* Try to allocate it some swap space here.* Lazyfree folio could be freed directly*/if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {if (!folio_test_swapcache(folio)) {if (!(sc->gfp_mask & __GFP_IO))goto keep_locked;if (folio_maybe_dma_pinned(folio))goto keep_locked;if (folio_test_large(folio)) {/* cannot split folio, skip it */if (!can_split_folio(folio, NULL))goto activate_locked;/** Split folios without a PMD map right* away. Chances are some or all of the* tail pages can be freed without IO.*/if (!folio_entire_mapcount(folio) &&split_folio_to_list(folio,folio_list))goto activate_locked;}if (!add_to_swap(folio)) {if (!folio_test_large(folio))goto activate_locked_split;/* Fallback to swap normal pages */if (split_folio_to_list(folio,folio_list))goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGEcount_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);count_vm_event(THP_SWPOUT_FALLBACK); #endifif (!add_to_swap(folio))goto activate_locked_split;}}} else if (folio_test_swapbacked(folio) &&folio_test_large(folio)) {/* Split shmem folio */if (split_folio_to_list(folio, folio_list))goto keep_locked;}/** If the folio was split above, the tail pages will make* their own pass through this function and be accounted* then.*/if ((nr_pages > 1) && !folio_test_large(folio)) {sc->nr_scanned -= (nr_pages - 1);nr_pages = 1;}/** The folio is mapped into the page tables of one or more* processes. Try to unmap it here.*/if (folio_mapped(folio)) {enum ttu_flags flags = TTU_BATCH_FLUSH;bool was_swapbacked = folio_test_swapbacked(folio);if (folio_test_pmd_mappable(folio))flags |= TTU_SPLIT_HUGE_PMD;try_to_unmap(folio, flags);if (folio_mapped(folio)) {stat->nr_unmap_fail += nr_pages;if (!was_swapbacked &&folio_test_swapbacked(folio))stat->nr_lazyfree_fail += nr_pages;goto activate_locked;}}/** Folio is unmapped now so it cannot be newly pinned anymore.* No point in trying to reclaim folio if it is pinned.* Furthermore we don't want to reclaim underlying fs metadata* if the folio is pinned and thus potentially modified by the* pinning process as that may upset the filesystem.*/if (folio_maybe_dma_pinned(folio))goto activate_locked;mapping = folio_mapping(folio);if (folio_test_dirty(folio)) {/** Only kswapd can writeback filesystem folios* to avoid risk of stack overflow. But avoid* injecting inefficient single-folio I/O into* flusher writeback as much as possible: only* write folios when we've encountered many* dirty folios, and when we've already scanned* the rest of the LRU for clean folios and see* the same dirty folios again (with the reclaim* flag set).*/if (folio_is_file_lru(folio) &&(!current_is_kswapd() ||!folio_test_reclaim(folio) ||!test_bit(PGDAT_DIRTY, &pgdat->flags))) {/** Immediately reclaim when written back.* Similar in principle to folio_deactivate()* except we already have the folio isolated* and know it's dirty*/node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,nr_pages);folio_set_reclaim(folio);goto activate_locked;}if (references == FOLIOREF_RECLAIM_CLEAN)goto keep_locked;if (!may_enter_fs(folio, sc->gfp_mask))goto keep_locked;if (!sc->may_writepage)goto keep_locked;/** Folio is dirty. 
Flush the TLB if a writable entry* potentially exists to avoid CPU writes after I/O* starts and then write it out here.*/try_to_unmap_flush_dirty();switch (pageout(folio, mapping, &plug)) {case PAGE_KEEP:goto keep_locked;case PAGE_ACTIVATE:goto activate_locked;case PAGE_SUCCESS:stat->nr_pageout += nr_pages;if (folio_test_writeback(folio))goto keep;if (folio_test_dirty(folio))goto keep;/** A synchronous write - probably a ramdisk. Go* ahead and try to reclaim the folio.*/if (!folio_trylock(folio))goto keep;if (folio_test_dirty(folio) ||folio_test_writeback(folio))goto keep_locked;mapping = folio_mapping(folio);fallthrough;case PAGE_CLEAN:; /* try to free the folio below */}}/** If the folio has buffers, try to free the buffer* mappings associated with this folio. If we succeed* we try to free the folio as well.** We do this even if the folio is dirty.* filemap_release_folio() does not perform I/O, but it* is possible for a folio to have the dirty flag set,* but it is actually clean (all its buffers are clean).* This happens if the buffers were written out directly,* with submit_bh(). ext3 will do this, as well as* the blockdev mapping. filemap_release_folio() will* discover that cleanness and will drop the buffers* and mark the folio clean - it can be freed.** Rarely, folios can have buffers and no ->mapping.* These are the folios which were not successfully* invalidated in truncate_cleanup_folio(). We try to* drop those buffers here and if that worked, and the* folio is no longer mapped into process address space* (refcount == 1) it can be freed. Otherwise, leave* the folio on the LRU so it is swappable.*/if (folio_needs_release(folio)) {if (!filemap_release_folio(folio, sc->gfp_mask))goto activate_locked;if (!mapping && folio_ref_count(folio) == 1) {folio_unlock(folio);if (folio_put_testzero(folio))goto free_it;else {/** rare race with speculative reference.* the speculative reference will free* this folio shortly, so we may* increment nr_reclaimed here (and* leave it off the LRU).*/nr_reclaimed += nr_pages;continue;}}}if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {/* follow __remove_mapping for reference */if (!folio_ref_freeze(folio, 1))goto keep_locked;/** The folio has only one reference left, which is* from the isolation. After the caller puts the* folio back on the lru and drops the reference, the* folio will be freed anyway. It doesn't matter* which lru it goes on. So we don't bother checking* the dirty flag here.*/count_vm_events(PGLAZYFREED, nr_pages);count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);} else if (!mapping || !__remove_mapping(mapping, folio, true,sc->target_mem_cgroup))goto keep_locked;folio_unlock(folio); free_it:/** Folio may get swapped out as a whole, need to account* all pages in it.*/nr_reclaimed += nr_pages;/** Is there need to periodically free_folio_list? It would* appear not as the counts should be low*/if (unlikely(folio_test_large(folio)))destroy_large_folio(folio);else
//these pages are about to be freed - put them on the free_folios list
list_add(&folio->lru, &free_folios);continue;activate_locked_split:/** The tail pages that are failed to add into swap cache* reach here. Fixup nr_scanned and nr_pages.*/if (nr_pages > 1) {sc->nr_scanned -= (nr_pages - 1);nr_pages = 1;} activate_locked:/* Not a candidate for swapping, so reclaim swap space. */if (folio_test_swapcache(folio) &&(mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))folio_free_swap(folio);VM_BUG_ON_FOLIO(folio_test_active(folio), folio);if (!folio_test_mlocked(folio)) {int type = folio_is_file_lru(folio);folio_set_active(folio);stat->nr_activate[type] += nr_pages;count_memcg_folio_events(folio, PGACTIVATE, nr_pages);} keep_locked:folio_unlock(folio); keep:list_add(&folio->lru, &ret_folios);VM_BUG_ON_FOLIO(folio_test_lru(folio) ||folio_test_unevictable(folio), folio);}/* 'folio_list' is always empty here *//* Migrate folios selected for demotion */nr_reclaimed += demote_folio_list(&demote_folios, pgdat);/* Folios that could not be demoted are still in @demote_folios */if (!list_empty(&demote_folios)) {/* Folios which weren't demoted go back on @folio_list */list_splice_init(&demote_folios, folio_list);/** goto retry to reclaim the undemoted folios in folio_list if* desired.** Reclaiming directly from top tier nodes is not often desired* due to it breaking the LRU ordering: in general memory* should be reclaimed from lower tier nodes and demoted from* top tier nodes.** However, disabling reclaim from top tier nodes entirely* would cause ooms in edge scenarios where lower tier memory* is unreclaimable for whatever reason, eg memory being* mlocked or too hot to reclaim. We can disable reclaim* from top tier nodes in proactive reclaim though as that is* not real memory pressure.*/if (!sc->proactive) {do_demote_pass = false;goto retry;}}pgactivate = stat->nr_activate[0] + stat->nr_activate[1];mem_cgroup_uncharge_list(&free_folios);try_to_unmap_flush();
//free the pages back to the buddy system - this is where memory is actually reclaimed
free_unref_page_list(
&free_folios);list_splice(&ret_folios, folio_list);count_vm_events(PGACTIVATE, pgactivate);if (plug)swap_write_unplug(plug);return nr_reclaimed; }

shrink_folio_list is a very complex function that I haven't fully understood yet - I'll come back to it later.

From what we've seen so far, the function that decides which pages get scanned is isolate_lru_folios:

static unsigned long isolate_lru_folios(unsigned long nr_to_scan,struct lruvec *lruvec, struct list_head *dst,unsigned long *nr_scanned, struct scan_control *sc,enum lru_list lru)
{struct list_head *src = &lruvec->lists[lru];unsigned long nr_taken = 0;unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };unsigned long skipped = 0;unsigned long scan, total_scan, nr_pages;LIST_HEAD(folios_skipped);total_scan = 0;scan = 0;while (scan < nr_to_scan && !list_empty(src)) {struct list_head *move_to = src;struct folio *folio;folio = lru_to_folio(src);prefetchw_prev_lru_folio(folio, src, flags);nr_pages = folio_nr_pages(folio);total_scan += nr_pages;if (folio_zonenum(folio) > sc->reclaim_idx ||skip_cma(folio, sc)) {nr_skipped[folio_zonenum(folio)] += nr_pages;move_to = &folios_skipped;goto move;}/** Do not count skipped folios because that makes the function* return with no isolated folios if the LRU mostly contains* ineligible folios.  This causes the VM to not reclaim any* folios, triggering a premature OOM.* Account all pages in a folio.*/scan += nr_pages;if (!folio_test_lru(folio))goto move;if (!sc->may_unmap && folio_mapped(folio))goto move;/** Be careful not to clear the lru flag until after we're* sure the folio is not being freed elsewhere -- the* folio release code relies on it.*/if (unlikely(!folio_try_get(folio)))goto move;if (!folio_test_clear_lru(folio)) {/* Another thread is already isolating this folio */folio_put(folio);goto move;}nr_taken += nr_pages;nr_zone_taken[folio_zonenum(folio)] += nr_pages;move_to = dst;
move:list_move(&folio->lru, move_to);}/** Splice any skipped folios to the start of the LRU list. Note that* this disrupts the LRU order when reclaiming for lower zones but* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX* scanning would soon rescan the same folios to skip and waste lots* of cpu cycles.*/if (!list_empty(&folios_skipped)) {int zid;list_splice(&folios_skipped, src);for (zid = 0; zid < MAX_NR_ZONES; zid++) {if (!nr_skipped[zid])continue;__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);skipped += nr_skipped[zid];}}*nr_scanned = total_scan;trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,total_scan, skipped, nr_taken, lru);update_lru_sizes(lruvec, lru, nr_zone_taken);return nr_taken;
}

It scans the LRU list and moves folios that meet the requirements onto the destination list; that list is scanned afterwards to find the pages to reclaim. The number of pages each LRU list needs to scan is computed by get_scan_count:

/** Determine how aggressively the anon and file LRU lists should be* scanned.** nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan* nr[2] = file inactive folios to scan; nr[3] = file active folios to scan*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,unsigned long *nr)
{struct pglist_data *pgdat = lruvec_pgdat(lruvec);struct mem_cgroup *memcg = lruvec_memcg(lruvec);unsigned long anon_cost, file_cost, total_cost;int swappiness = mem_cgroup_swappiness(memcg);u64 fraction[ANON_AND_FILE];u64 denominator = 0;    /* gcc */enum scan_balance scan_balance;unsigned long ap, fp;enum lru_list lru;/* If we have no swap space, do not bother scanning anon folios. */if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {scan_balance = SCAN_FILE;goto out;}/** Global reclaim will swap to prevent OOM even with no* swappiness, but memcg users want to use this knob to* disable swapping for individual groups completely when* using the memory controller's swap limit feature would be* too expensive.*/if (cgroup_reclaim(sc) && !swappiness) {scan_balance = SCAN_FILE;goto out;}/** Do not apply any pressure balancing cleverness when the* system is close to OOM, scan both anon and file equally* (unless the swappiness setting disagrees with swapping).*/if (!sc->priority && swappiness) {scan_balance = SCAN_EQUAL;goto out;}/** If the system is almost out of file pages, force-scan anon.*/if (sc->file_is_tiny) {scan_balance = SCAN_ANON;goto out;}/** If there is enough inactive page cache, we do not reclaim* anything from the anonymous working right now.*/if (sc->cache_trim_mode) {scan_balance = SCAN_FILE;goto out;}scan_balance = SCAN_FRACT;/** Calculate the pressure balance between anon and file pages.** The amount of pressure we put on each LRU is inversely* proportional to the cost of reclaiming each list, as* determined by the share of pages that are refaulting, times* the relative IO cost of bringing back a swapped out* anonymous page vs reloading a filesystem page (swappiness).** Although we limit that influence to ensure no list gets* left behind completely: at least a third of the pressure is* applied, before swappiness.** With swappiness at 100, anon and file have equal IO cost.*/total_cost = sc->anon_cost + sc->file_cost;anon_cost = total_cost + sc->anon_cost;file_cost = total_cost + sc->file_cost;total_cost = anon_cost + file_cost;ap = swappiness * (total_cost + 1);ap /= anon_cost + 1;fp = (200 - swappiness) * (total_cost + 1);fp /= file_cost + 1;fraction[0] = ap;fraction[1] = fp;denominator = ap + fp;
out:for_each_evictable_lru(lru) {int file = is_file_lru(lru);unsigned long lruvec_size;unsigned long low, min;unsigned long scan;lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);mem_cgroup_protection(sc->target_mem_cgroup, memcg,&min, &low);if (min || low) {/** Scale a cgroup's reclaim pressure by proportioning* its current usage to its memory.low or memory.min* setting.** This is important, as otherwise scanning aggression* becomes extremely binary -- from nothing as we* approach the memory protection threshold, to totally* nominal as we exceed it.  This results in requiring* setting extremely liberal protection thresholds. It* also means we simply get no protection at all if we* set it too low, which is not ideal.** If there is any protection in place, we reduce scan* pressure by how much of the total memory used is* within protection thresholds.** There is one special case: in the first reclaim pass,* we skip over all groups that are within their low* protection. If that fails to reclaim enough pages to* satisfy the reclaim goal, we come back and override* the best-effort low protection. However, we still* ideally want to honor how well-behaved groups are in* that case instead of simply punishing them all* equally. As such, we reclaim them based on how much* memory they are using, reducing the scan pressure* again by how much of the total memory used is under* hard protection.*/unsigned long cgroup_size = mem_cgroup_size(memcg);unsigned long protection;/* memory.low scaling, make sure we retry before OOM */if (!sc->memcg_low_reclaim && low > min) {protection = low;sc->memcg_low_skipped = 1;} else {protection = min;}/* Avoid TOCTOU with earlier protection check */cgroup_size = max(cgroup_size, protection);scan = lruvec_size - lruvec_size * protection /(cgroup_size + 1);/** Minimally target SWAP_CLUSTER_MAX pages to keep* reclaim moving forwards, avoiding decrementing* sc->priority further than desirable.*/scan = max(scan, SWAP_CLUSTER_MAX);} else {scan = lruvec_size;}scan >>= sc->priority;/** If the cgroup's already been deleted, make sure to* scrape out the remaining cache.*/if (!scan && !mem_cgroup_online(memcg))scan = min(lruvec_size, SWAP_CLUSTER_MAX);switch (scan_balance) {case SCAN_EQUAL:/* Scan lists relative to size */break;case SCAN_FRACT:/** Scan types proportional to swappiness and* their relative recent reclaim efficiency.* Make sure we don't miss the last page on* the offlined memory cgroups because of a* round-off error.*/scan = mem_cgroup_online(memcg) ?div64_u64(scan * fraction[file], denominator) :DIV64_U64_ROUND_UP(scan * fraction[file],denominator);break;case SCAN_FILE:case SCAN_ANON:/* Scan one type exclusively */if ((scan_balance == SCAN_FILE) != file)scan = 0;break;default:/* Look ma, no brain */BUG();}nr[lru] = scan;}
}

The key tuning parameter here is swappiness. It ranges from 0 to 200 (historically it was capped at 100), with a default of 60; the larger it is, the more anonymous pages get scanned, and at 100 anonymous and file-cache pages are treated as having equal IO cost. The value can be changed via /proc/sys/vm/swappiness.
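To get a feel for how swappiness tilts the balance, here is a small userspace program that mirrors the SCAN_FRACT arithmetic from get_scan_count above (the ap/fp fractions with their +1 terms). The cost values passed in are made-up stand-ins for sc->anon_cost and sc->file_cost:

#include <stdio.h>

/* Mirror of the SCAN_FRACT arithmetic in get_scan_count(), userspace demo. */
static void scan_fraction(unsigned long long anon_cost,
                          unsigned long long file_cost,
                          unsigned int swappiness)
{
    unsigned long long total = anon_cost + file_cost;
    unsigned long long a = total + anon_cost;   /* anon_cost in the kernel code */
    unsigned long long f = total + file_cost;   /* file_cost in the kernel code */
    unsigned long long ap, fp;

    total = a + f;
    ap = swappiness * (total + 1) / (a + 1);
    fp = (200 - swappiness) * (total + 1) / (f + 1);

    printf("swappiness=%3u -> anon share %llu%%, file share %llu%%\n",
           swappiness, 100 * ap / (ap + fp), 100 * fp / (ap + fp));
}

int main(void)
{
    /* hypothetical recent reclaim costs */
    scan_fraction(1000, 1000, 0);
    scan_fraction(1000, 1000, 60);
    scan_fraction(1000, 1000, 100);
    scan_fraction(4000, 1000, 60);  /* anon has been more expensive recently */
    return 0;
}

With equal recent costs, swappiness=60 ends up putting roughly 30% of the scan pressure on anon and 70% on file, while 100 splits it evenly.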

The analysis above followed the direct-reclaim path; now let's look at the asynchronous reclaim path.

Asynchronous memory reclaim is done by the kswapd kernel thread. Its initialization path is:

static int __init kswapd_init(void)
{
    int nid;

    swap_setup();
    for_each_node_state(nid, N_MEMORY)
        //create one kswapd thread per node
        kswapd_run(nid);
    return 0;
}
module_init(kswapd_init)

/*
 * This kswapd start function will be called by init and node-hot-add.
 */
void __meminit kswapd_run(int nid)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    pgdat_kswapd_lock(pgdat);
    if (!pgdat->kswapd) {
        //create and wake up the kswapd thread
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
            /* failure at boot is fatal */
            pr_err("Failed to start kswapd on node %d,ret=%ld\n",
                   nid, PTR_ERR(pgdat->kswapd));
            BUG_ON(system_state < SYSTEM_RUNNING);
            pgdat->kswapd = NULL;
        }
    }
    pgdat_kswapd_unlock(pgdat);
}

kswapd_wait is initialized when the node itself is initialized.

void __init free_area_init(unsigned long *max_zone_pfn)
{
    ...
    for_each_node(nid) {
        pg_data_t *pgdat;

        if (!node_online(nid)) {
            ...
        }
        ...
        free_area_init_node(nid);
        ...
    }
    ...
}

static void __init free_area_init_node(int nid)
{
    ...
    free_area_init_core(pgdat);
    lru_gen_init_pgdat(pgdat);
}

static void __init free_area_init_core(struct pglist_data *pgdat)
{
    enum zone_type j;
    int nid = pgdat->node_id;

    pgdat_init_internals(pgdat);
    ...
}

 

static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
    int i;

    pgdat_resize_init(pgdat);
    pgdat_kswapd_lock_init(pgdat);
    pgdat_init_split_queue(pgdat);
    pgdat_init_kcompactd(pgdat);

    init_waitqueue_head(&pgdat->kswapd_wait);
    init_waitqueue_head(&pgdat->pfmemalloc_wait);

    for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
        init_waitqueue_head(&pgdat->reclaim_wait[i]);

    pgdat_page_ext_init(pgdat);
    lruvec_init(&pgdat->__lruvec);
}

So each node has its own kswapd_wait.
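As a side note, the one-kswapd-per-node arrangement can be confirmed from userspace by scanning /proc for threads whose comm starts with "kswapd" - a rough sketch:

#include <dirent.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/* List kswapd threads by reading /proc/<pid>/comm. On a machine with N
 * memory nodes you should see kswapd0 .. kswapd(N-1). */
int main(void)
{
    DIR *proc = opendir("/proc");
    struct dirent *de;
    char path[64], comm[64];

    if (!proc)
        return 1;
    while ((de = readdir(proc)) != NULL) {
        if (!isdigit((unsigned char)de->d_name[0]))
            continue;
        snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
        FILE *f = fopen(path, "r");
        if (!f)
            continue;
        if (fgets(comm, sizeof(comm), f) && !strncmp(comm, "kswapd", 6))
            printf("pid %s: %s", de->d_name, comm);
        fclose(f);
    }
    closedir(proc);
    return 0;
}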

But a wait queue head by itself is useless - the kswapd thread still has to add itself to the queue.

/** The background pageout daemon, started as a kernel thread* from the init process.** This basically trickles out pages so that we have _some_* free memory available even if there is no other activity* that frees anything up. This is needed for things like routing* etc, where we otherwise might have all activity going on in* asynchronous contexts that cannot page things out.** If there are applications that are active memory-allocators* (most normal use), this basically shouldn't matter.*/
static int kswapd(void *p)
{unsigned int alloc_order, reclaim_order;unsigned int highest_zoneidx = MAX_NR_ZONES - 1;pg_data_t *pgdat = (pg_data_t *)p;struct task_struct *tsk = current;const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);if (!cpumask_empty(cpumask))set_cpus_allowed_ptr(tsk, cpumask);/** Tell the memory management that we're a "memory allocator",* and that if we need more memory we should get access to it* regardless (see "__alloc_pages()"). "kswapd" should* never get caught in the normal page freeing logic.** (Kswapd normally doesn't need memory anyway, but sometimes* you need a small amount of memory in order to be able to* page out something else, and this flag essentially protects* us from recursively trying to free more memory as we're* trying to free the first piece of memory in the first place).*/tsk->flags |= PF_MEMALLOC | PF_KSWAPD;set_freezable();WRITE_ONCE(pgdat->kswapd_order, 0);WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);atomic_set(&pgdat->nr_writeback_throttled, 0);
//the code above runs only once, when the thread first starts
for ( ; ; ) {
//from here on the thread stays in this loop
bool ret;alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);highest_zoneidx = kswapd_highest_zoneidx(pgdat,highest_zoneidx);kswapd_try_sleep:
//add ourselves to the wait queue and schedule(), waiting to be woken up
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,highest_zoneidx);
/* Read the new order and highest_zoneidx */alloc_order = READ_ONCE(pgdat->kswapd_order);highest_zoneidx = kswapd_highest_zoneidx(pgdat,highest_zoneidx);WRITE_ONCE(pgdat->kswapd_order, 0);WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);ret = try_to_freeze();if (kthread_should_stop())break;/** We can speed up thawing tasks if we don't call balance_pgdat* after returning from the refrigerator*/if (ret)continue;/** Reclaim begins at the requested order but if a high-order* reclaim fails then kswapd falls back to reclaiming for* order-0. If that happens, kswapd will consider sleeping* for the order it finished reclaiming at (reclaim_order)* but kcompactd is woken to compact for the original* request (alloc_order).*/trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,alloc_order);
//now do the real work
reclaim_order
= balance_pgdat(pgdat, alloc_order,highest_zoneidx);if (reclaim_order < alloc_order)goto kswapd_try_sleep;}tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);return 0; }
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,unsigned int highest_zoneidx)
{long remaining = 0;DEFINE_WAIT(wait);if (freezing(current) || kthread_should_stop())return;//把自己加到kswapd_wait等待队列prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);/** Try to sleep for a short interval. Note that kcompactd will only be* woken if it is possible to sleep for a short interval. This is* deliberate on the assumption that if reclaim cannot keep an* eligible zone balanced that it's also unlikely that compaction will* succeed.*/
//take a short nap
if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {/** Compaction records what page blocks it recently failed to* isolate pages from and skips them in the future scanning.* When kswapd is going to sleep, it is reasonable to assume* that pages and compaction may succeed so reset the cache.*/reset_isolation_suitable(pgdat);/** We have freed the memory, now we should compact it to make* allocation of the requested order possible.*/wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);remaining = schedule_timeout(HZ/10);/** If woken prematurely then reset kswapd_highest_zoneidx and* order. The values will either be from a wakeup request or* the previous request that slept prematurely.*/if (remaining) {WRITE_ONCE(pgdat->kswapd_highest_zoneidx,kswapd_highest_zoneidx(pgdat,highest_zoneidx));if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)WRITE_ONCE(pgdat->kswapd_order, reclaim_order);}finish_wait(&pgdat->kswapd_wait, &wait);prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);}/** After a short sleep, check if it was a premature sleep. If not, then* go fully to sleep until explicitly woken up.*/if (!remaining &&prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {trace_mm_vmscan_kswapd_sleep(pgdat->node_id);/** vmstat counters are not perfectly accurate and the estimated* value for counters such as NR_FREE_PAGES can deviate from the* true value by nr_online_cpus * threshold. To avoid the zone* watermarks being breached while under pressure, we reduce the* per-cpu vmstat threshold while kswapd is awake and restore* them before going back to sleep.*/set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);if (!kthread_should_stop())schedule();set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);} else {if (remaining)count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);elsecount_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);}finish_wait(&pgdat->kswapd_wait, &wait); }

kswapd is usually woken up from the allocation path: alloc_pages()-->__alloc_pages()-->__alloc_pages_slowpath()-->wake_all_kswapds()-->wakeup_kswapd() (in older kernels the second step was called __alloc_pages_nodemask()).

/** A zone is low on free memory or too fragmented for high-order memory.  If* kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's* pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim* has failed or is not needed, still wake up kcompactd if only compaction is* needed.*/
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,enum zone_type highest_zoneidx)
{pg_data_t *pgdat;enum zone_type curr_idx;if (!managed_zone(zone))return;if (!cpuset_zone_allowed(zone, gfp_flags))return;pgdat = zone->zone_pgdat;curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);if (READ_ONCE(pgdat->kswapd_order) < order)WRITE_ONCE(pgdat->kswapd_order, order);if (!waitqueue_active(&pgdat->kswapd_wait))return;/* Hopeless node, leave it to direct reclaim if possible */if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||(pgdat_balanced(pgdat, order, highest_zoneidx) &&!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {/** There may be plenty of free memory available, but it's too* fragmented for high-order allocations.  Wake up kcompactd* and rely on compaction_suitable() to determine if it's* needed.  If it fails, it will defer subsequent attempts to* ratelimit its work.*/if (!(gfp_flags & __GFP_DIRECT_RECLAIM))wakeup_kcompactd(pgdat, order, highest_zoneidx);return;}trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,gfp_flags);wake_up_interruptible(&pgdat->kswapd_wait);
}

Now let's look at balance_pgdat.

/** For kswapd, balance_pgdat() will reclaim pages across a node from zones* that are eligible for use by the caller until at least one zone is* balanced.** Returns the order kswapd finished reclaiming at.** kswapd scans the zones in the highmem->normal->dma direction.  It skips* zones which have free_pages > high_wmark_pages(zone), but once a zone is* found to have free_pages <= high_wmark_pages(zone), any page in that zone* or lower is eligible for reclaim until at least one usable zone is* balanced.*/
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{int i;unsigned long nr_soft_reclaimed;unsigned long nr_soft_scanned;unsigned long pflags;unsigned long nr_boost_reclaim;unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };bool boosted;struct zone *zone;
//the scan_control structure that drives this round of reclaim
struct scan_control sc = {.gfp_mask = GFP_KERNEL,.order = order,.may_unmap = 1,};//设置当前task->reclaim_state成员set_task_reclaim_state(current, &sc.reclaim_state);psi_memstall_enter(&pflags);__fs_reclaim_acquire(_THIS_IP_);count_vm_event(PAGEOUTRUN);/** Account for the reclaim boost. Note that the zone boost is left in* place so that parallel allocations that are near the watermark will* stall or direct reclaim until kswapd is finished.*/nr_boost_reclaim = 0;for (i = 0; i <= highest_zoneidx; i++) {zone = pgdat->node_zones + i;if (!managed_zone(zone))continue;nr_boost_reclaim += zone->watermark_boost;zone_boosts[i] = zone->watermark_boost;}boosted = nr_boost_reclaim;restart:
//set ZONE_RECLAIM_ACTIVE in the flags of every zone of this node up to highest_zoneidx
set_reclaim_active(pgdat, highest_zoneidx);
//sc.priority controls how many pages get scanned: nr_pages = lru_pages >> priority
sc.priority
= DEF_PRIORITY; // 12do {unsigned long nr_reclaimed = sc.nr_reclaimed;bool raise_priority = true;bool balanced;bool ret;sc.reclaim_idx = highest_zoneidx;/** If the number of buffer_heads exceeds the maximum allowed* then consider reclaiming from all zones. This has a dual* purpose -- on 64-bit systems it is expected that* buffer_heads are stripped during active rotation. On 32-bit* systems, highmem pages can pin lowmem memory and shrinking* buffers can relieve lowmem pressure. Reclaim may still not* go ahead if all eligible zones for the original allocation* request are balanced to avoid excessive reclaim from kswapd.*/if (buffer_heads_over_limit) {for (i = MAX_NR_ZONES - 1; i >= 0; i--) {zone = pgdat->node_zones + i;if (!managed_zone(zone))continue;sc.reclaim_idx = i;break;}}/** If the pgdat is imbalanced then ignore boosting and preserve* the watermarks for a later time and restart. Note that the* zone watermarks will be still reset at the end of balancing* on the grounds that the normal reclaim should be enough to* re-evaluate if boosting is required when kswapd next wakes.*/
//is any zone balanced, i.e. free pages above the high watermark and a 2^order allocation possible?
balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);if (!balanced && nr_boost_reclaim) {nr_boost_reclaim = 0;goto restart;}/** If boosting is not active then only reclaim if there are no* eligible zones. Note that sc.reclaim_idx is not used as* buffer_heads_over_limit may have adjusted it.*/
//没有boost且已经balanced,那就不用回收了if (!nr_boost_reclaim && balanced)goto out;/* Limit the priority of boosting to avoid reclaim writeback */if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)raise_priority = false;/** Do not writeback or swap pages for boosted reclaim. The* intent is to relieve pressure not issue sub-optimal IO* from reclaim context. If no pages are reclaimed, the* reclaim will be aborted.*/sc.may_writepage = !laptop_mode && !nr_boost_reclaim;sc.may_swap = !nr_boost_reclaim;/** Do some background aging, to give pages a chance to be* referenced before reclaiming. All pages are rotated* regardless of classzone as this is about consistent aging.*/
//不是很明白???这又啥用?kswapd_age_node(pgdat, &sc);/** If we're getting trouble reclaiming, start doing writepage* even in laptop mode.*/if (sc.priority < DEF_PRIORITY - 2)sc.may_writepage = 1;/* Call soft limit reclaim before calling shrink_node. */sc.nr_scanned = 0;nr_soft_scanned = 0;nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,sc.gfp_mask, &nr_soft_scanned);sc.nr_reclaimed += nr_soft_reclaimed;/** There should be no need to raise the scanning priority if* enough pages are already being scanned that that high* watermark would be met at 100% efficiency.*/
//enter the actual reclaim path
if (kswapd_shrink_node(pgdat, &sc))raise_priority = false;/** If the low watermark is met there is no need for processes* to be throttled on pfmemalloc_wait as they should not be* able to safely make forward progress. Wake them*/

//唤醒pfmemalloc,不过这是个啥?if (waitqueue_active(&pgdat->pfmemalloc_wait) &&allow_direct_reclaim(pgdat))wake_up_all(&pgdat->pfmemalloc_wait);/* Check if kswapd should be suspending */__fs_reclaim_release(_THIS_IP_);ret = try_to_freeze();__fs_reclaim_acquire(_THIS_IP_);if (ret || kthread_should_stop())break;/** Raise priority if scanning rate is too low or there was no* progress in reclaiming pages*/nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);/** If reclaim made no progress for a boost, stop reclaim as* IO cannot be queued and it could be an infinite loop in* extreme circumstances.*/
//nothing got squeezed out this time, give up
if (nr_boost_reclaim && !nr_reclaimed)break;if (raise_priority || !nr_reclaimed)sc.priority--;} while (sc.priority >= 1); //每次我都多扫描一些页面if (!sc.nr_reclaimed)pgdat->kswapd_failures++; //卡在这里应该是没回收成功,在node中记录一下out:clear_reclaim_active(pgdat, highest_zoneidx);/* If reclaim was boosted, account for the reclaim done in this pass */if (boosted) {unsigned long flags;for (i = 0; i <= highest_zoneidx; i++) {if (!zone_boosts[i])continue;/* Increments are under the zone lock */zone = pgdat->node_zones + i;spin_lock_irqsave(&zone->lock, flags);zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);spin_unlock_irqrestore(&zone->lock, flags);}/** As there is now likely space, wakeup kcompact to defragment* pageblocks.*/wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);}snapshot_refaults(NULL, pgdat);__fs_reclaim_release(_THIS_IP_);psi_memstall_leave(&pflags);set_task_reclaim_state(current, NULL);/** Return the order kswapd stopped reclaiming at as* prepare_kswapd_sleep() takes it into account. If another caller* entered the allocator slow path while kswapd was awake, order will* remain at the higher level.*/return sc.order; }

kswapd_shrink_node is a wrapper around shrink_node.

static bool kswapd_shrink_node(pg_data_t *pgdat,
                               struct scan_control *sc)
{
    struct zone *zone;
    int z;

    /* Reclaim a number of pages proportional to the number of zones */
    sc->nr_to_reclaim = 0;
    for (z = 0; z <= sc->reclaim_idx; z++) {
        zone = pgdat->node_zones + z;
        if (!managed_zone(zone))
            continue;

        //lots of parameters control how many pages to reclaim - easy to mix them up
        sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
    }

    /*
     * Historically care was taken to put equal pressure on all zones but
     * now pressure is applied based on node LRU order.
     */
    //the reclaim path
    shrink_node(pgdat, sc);

    /*
     * Fragmentation may mean that the system cannot be rebalanced for
     * high-order allocations. If twice the allocation size has been
     * reclaimed then recheck watermarks only at order-0 to prevent
     * excessive reclaim. Assume that a process requested a high-order
     * can direct reclaim/compact.
     */
    if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
        sc->order = 0;

    //did we scan at least as many pages as we planned to reclaim?
    return sc->nr_scanned >= sc->nr_to_reclaim;
}
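To put a number on the nr_to_reclaim target computed above (the sum over zones of max(high watermark, SWAP_CLUSTER_MAX)), here is a rough userspace approximation that derives the same figure per node from /proc/zoneinfo. It ignores sc->reclaim_idx and unmanaged zones, so treat it only as a ballpark:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Rough userspace estimate of the nr_to_reclaim target kswapd_shrink_node()
 * would compute for each node: sum of max(high watermark, SWAP_CLUSTER_MAX)
 * over that node's zones, read from /proc/zoneinfo. */
int main(void)
{
    FILE *f = fopen("/proc/zoneinfo", "r");
    char line[256];
    int node = -1, cur = -1;
    unsigned long target = 0, high;

    if (!f) {
        perror("/proc/zoneinfo");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "Node %d, zone", &cur) == 1) {
            if (node != -1 && cur != node)
                printf("node %d: ~%lu pages\n", node, target);
            if (cur != node) {
                node = cur;
                target = 0;
            }
        } else if (sscanf(line, " high %lu", &high) == 1) {
            /* the zone's high watermark line (no colon, unlike the
             * per-cpu pageset "high:" lines, which do not match) */
            target += high > SWAP_CLUSTER_MAX ? high : SWAP_CLUSTER_MAX;
        }
    }
    if (node != -1)
        printf("node %d: ~%lu pages\n", node, target);
    fclose(f);
    return 0;
}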

Inside shrink_node_memcgs, besides shrink_lruvec there is also shrink_slab, which we haven't looked at yet.

static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
    struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
    struct mem_cgroup *memcg;

    memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
    do {
        ...
        shrink_lruvec(lruvec, sc);

        shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
                    sc->priority);
        ...
    } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}

Let's look at shrink_slab.

/*** shrink_slab - shrink slab caches* @gfp_mask: allocation context* @nid: node whose slab caches to target* @memcg: memory cgroup whose slab caches to target* @priority: the reclaim priority** Call the shrink functions to age shrinkable caches.** @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,* unaware shrinkers will receive a node id of 0 instead.** @memcg specifies the memory cgroup to target. Unaware shrinkers* are called only if it is the root cgroup.** @priority is sc->priority, we take the number of objects and >> by priority* in order to get the scan target.** Returns the number of reclaimed slab objects.*/
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,int priority)
{unsigned long ret, freed = 0;struct shrinker *shrinker;/** The root memcg might be allocated even though memcg is disabled* via "cgroup_disable=memory" boot parameter.  This could make* mem_cgroup_is_root() return false, then just run memcg slab* shrink, but skip global shrink.  This may result in premature* oom.*/if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))return shrink_slab_memcg(gfp_mask, nid, memcg, priority);/** lockless algorithm of global shrink.** In the unregistration setp, the shrinker will be freed asynchronously* via RCU after its refcount reaches 0. So both rcu_read_lock() and* shrinker_try_get() can be used to ensure the existence of the shrinker.** So in the global shrink:*  step 1: use rcu_read_lock() to guarantee existence of the shrinker*          and the validity of the shrinker_list walk.*  step 2: use shrinker_try_get() to try get the refcount, if successful,*          then the existence of the shrinker can also be guaranteed,*          so we can release the RCU lock to do do_shrink_slab() that*          may sleep.*  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),*          which ensures that neither this shrinker nor the next shrinker*          will be freed in the next traversal operation.*  step 4: do shrinker_put() paired with step 2 to put the refcount,*          if the refcount reaches 0, then wake up the waiter in*          shrinker_free() by calling complete().*/rcu_read_lock();
//there is a global shrinker_list; walk it
list_for_each_entry_rcu(shrinker,
&shrinker_list, list) {struct shrink_control sc = {.gfp_mask = gfp_mask,.nid = nid,.memcg = memcg,};if (!shrinker_try_get(shrinker))continue;rcu_read_unlock();//shrink slabret = do_shrink_slab(&sc, shrinker, priority);if (ret == SHRINK_EMPTY)ret = 0;freed += ret;rcu_read_lock();shrinker_put(shrinker);}rcu_read_unlock();cond_resched();return freed; }

Many subsystems register a shrinker onto shrinker_list via shrinker_register(). shrink_slab walks shrinker_list and calls do_shrink_slab() to reclaim slab objects.

/** A callback you can register to apply pressure to ageable caches.** @count_objects should return the number of freeable items in the cache. If* there are no objects to free, it should return SHRINK_EMPTY, while 0 is* returned in cases of the number of freeable items cannot be determined* or shrinker should skip this cache for this time (e.g., their number* is below shrinkable limit). No deadlock checks should be done during the* count callback - the shrinker relies on aggregating scan counts that couldn't* be executed due to potential deadlocks to be run at a later call when the* deadlock condition is no longer pending.** @scan_objects will only be called if @count_objects returned a non-zero* value for the number of freeable objects. The callout should scan the cache* and attempt to free items from the cache. It should then return the number* of objects freed during the scan, or SHRINK_STOP if progress cannot be made* due to potential deadlocks. If SHRINK_STOP is returned, then no further* attempts to call the @scan_objects will be made from the current reclaim* context.** @flags determine the shrinker abilities, like numa awareness*/
struct shrinker {
    unsigned long (*count_objects)(struct shrinker *,
                                   struct shrink_control *sc);
    unsigned long (*scan_objects)(struct shrinker *,
                                  struct shrink_control *sc);

    long batch;     /* reclaim batch size, 0 = default */
    int seeks;      /* seeks to recreate an obj */
    unsigned flags;

    /*
     * The reference count of this shrinker. Registered shrinker have an
     * initial refcount of 1, then the lookup operations are now allowed
     * to use it via shrinker_try_get(). Later in the unregistration step,
     * the initial refcount will be discarded, and will free the shrinker
     * asynchronously via RCU after its refcount reaches 0.
     */
    refcount_t refcount;
    struct completion done;     /* use to wait for refcount to reach 0 */
    struct rcu_head rcu;

    void *private_data;

    /* These are for internal use */
    struct list_head list;
#ifdef CONFIG_MEMCG
    /* ID in shrinker_idr */
    int id;
#endif
#ifdef CONFIG_SHRINKER_DEBUG
    int debugfs_id;
    const char *name;
    struct dentry *debugfs_entry;
#endif
    /* objs pending delete, per node */
    atomic_long_t *nr_deferred;
};

As you can see, a shrinker is basically a set of callbacks plus a few parameters for one cache: count_objects returns how many objects are freeable, and scan_objects actually reclaims from the cache.
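For reference, here is a minimal sketch of how a module might register its own shrinker with the shrinker_alloc()/shrinker_register() API that goes with the struct above. The demo_* names and the fake object counter are invented for illustration; a real shrinker would count and free objects from an actual cache:

// SPDX-License-Identifier: GPL-2.0
/* Toy shrinker - a sketch of the newer shrinker_alloc()/shrinker_register()
 * API. "demo_nr_objects" is a made-up stand-in for a real object cache. */
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/atomic.h>
#include <linux/minmax.h>

static atomic_long_t demo_nr_objects = ATOMIC_LONG_INIT(1024);
static struct shrinker *demo_shrinker;

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
    unsigned long nr = atomic_long_read(&demo_nr_objects);

    /* return SHRINK_EMPTY when there is nothing to free */
    return nr ? nr : SHRINK_EMPTY;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
    /* pretend to free up to sc->nr_to_scan objects */
    unsigned long nr = min_t(unsigned long, sc->nr_to_scan,
                             atomic_long_read(&demo_nr_objects));

    atomic_long_sub(nr, &demo_nr_objects);
    return nr;  /* number of objects actually freed */
}

static int __init demo_init(void)
{
    demo_shrinker = shrinker_alloc(0, "demo-shrinker");
    if (!demo_shrinker)
        return -ENOMEM;

    demo_shrinker->count_objects = demo_count;
    demo_shrinker->scan_objects = demo_scan;
    shrinker_register(demo_shrinker);
    return 0;
}

static void __exit demo_exit(void)
{
    shrinker_free(demo_shrinker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");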

 

That's as far as I'll take page reclaim for now - this has been very brief and many parts are still unclear to me.

 
