[v5,2/2] mm: vmscan: refactor reclaim_state helpers
Commit Message
During reclaim, we keep track of pages reclaimed from other means than
LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab,
which we stash a pointer to in current task_struct.
However, we keep track of more than just reclaimed slab pages through
this. We also use it for clean file pages dropped through pruned inodes,
and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
a helper function that wraps updating it through current, so that future
changes to this logic are contained within mm/vmscan.c.
Additionally, add a flush_reclaim_state() helper to wrap using
reclaim_state->reclaimed to updated sc->nr_reclaimed, and use that
helper to add an elaborate comment about why we only do the update for
global reclaim.
Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
that all reclaim_state helpers are in close proximity for easier
readability.
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
fs/inode.c | 3 +-
fs/xfs/xfs_buf.c | 3 +-
include/linux/swap.h | 17 +++++++++-
mm/slab.c | 3 +-
mm/slob.c | 6 ++--
mm/slub.c | 5 ++-
mm/vmscan.c | 75 ++++++++++++++++++++++++++++++++------------
7 files changed, 78 insertions(+), 34 deletions(-)
Comments
On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> During reclaim, we keep track of pages reclaimed from other means than
> LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab,
> which we stash a pointer to in current task_struct.
>
> However, we keep track of more than just reclaimed slab pages through
> this. We also use it for clean file pages dropped through pruned inodes,
> and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
> a helper function that wraps updating it through current, so that future
> changes to this logic are contained within mm/vmscan.c.
>
> Additionally, add a flush_reclaim_state() helper to wrap using
> reclaim_state->reclaimed to updated sc->nr_reclaimed, and use that
> helper to add an elaborate comment about why we only do the update for
> global reclaim.
>
> Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
> that all reclaim_state helpers are in close proximity for easier
> readability.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
> fs/inode.c | 3 +-
> fs/xfs/xfs_buf.c | 3 +-
> include/linux/swap.h | 17 +++++++++-
> mm/slab.c | 3 +-
> mm/slob.c | 6 ++--
> mm/slub.c | 5 ++-
> mm/vmscan.c | 75 ++++++++++++++++++++++++++++++++------------
> 7 files changed, 78 insertions(+), 34 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 4558dc2f13557..e60fcc41faf17 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> __count_vm_events(KSWAPD_INODESTEAL, reap);
> else
> __count_vm_events(PGINODESTEAL, reap);
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += reap;
> + mm_account_reclaimed_pages(reap);
> }
> iput(inode);
> spin_lock(lru_lock);
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 54c774af6e1c6..15d1e5a7c2d34 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -286,8 +286,7 @@ xfs_buf_free_pages(
> if (bp->b_pages[i])
> __free_page(bp->b_pages[i]);
> }
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += bp->b_page_count;
> + mm_account_reclaimed_pages(bp->b_page_count);
>
> if (bp->b_pages != bp->b_page_array)
> kmem_free(bp->b_pages);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 209a425739a9f..e131ac155fb95 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -153,13 +153,28 @@ union swap_header {
> * memory reclaim
> */
> struct reclaim_state {
> - unsigned long reclaimed_slab;
> + /* pages reclaimed outside of LRU-based reclaim */
> + unsigned long reclaimed;
> #ifdef CONFIG_LRU_GEN
> /* per-thread mm walk data */
> struct lru_gen_mm_walk *mm_walk;
> #endif
> };
>
> +/*
> + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
> + * reclaim
> + * @pages: number of pages reclaimed
> + *
> + * If the current process is undergoing a reclaim operation, increment the
> + * number of reclaimed pages by @pages.
> + */
> +static inline void mm_account_reclaimed_pages(unsigned long pages)
> +{
> + if (current->reclaim_state)
> + current->reclaim_state->reclaimed += pages;
> +}
> +
> #ifdef __KERNEL__
>
> struct address_space;
> diff --git a/mm/slab.c b/mm/slab.c
> index dabc2a671fc6f..64bf1de817b24 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
> smp_wmb();
> __folio_clear_slab(folio);
>
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += 1 << order;
> + mm_account_reclaimed_pages(1 << order);
> unaccount_slab(slab, order, cachep);
> __free_pages(&folio->page, order);
> }
> diff --git a/mm/slob.c b/mm/slob.c
> index fe567fcfa3a39..79cc8680c973c 100644
> --- a/mm/slob.c
> +++ b/mm/slob.c
> @@ -61,7 +61,7 @@
> #include <linux/slab.h>
>
> #include <linux/mm.h>
> -#include <linux/swap.h> /* struct reclaim_state */
> +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> #include <linux/cache.h>
> #include <linux/init.h>
> #include <linux/export.h>
> @@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
> {
> struct page *sp = virt_to_page(b);
>
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += 1 << order;
> -
> + mm_account_reclaimed_pages(1 << order);
> mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
> -(PAGE_SIZE << order));
> __free_pages(sp, order);
> diff --git a/mm/slub.c b/mm/slub.c
> index 39327e98fce34..7aa30eef82350 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -11,7 +11,7 @@
> */
>
> #include <linux/mm.h>
> -#include <linux/swap.h> /* struct reclaim_state */
> +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> #include <linux/module.h>
> #include <linux/bit_spinlock.h>
> #include <linux/interrupt.h>
> @@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
> /* Make the mapping reset visible before clearing the flag */
> smp_wmb();
> __folio_clear_slab(folio);
> - if (current->reclaim_state)
> - current->reclaim_state->reclaimed_slab += pages;
> + mm_account_reclaimed_pages(pages);
> unaccount_slab(slab, order, s);
> __free_pages(&folio->page, order);
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c82bd89f90364..049e39202e6ce 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -188,18 +188,6 @@ struct scan_control {
> */
> int vm_swappiness = 60;
>
> -static void set_task_reclaim_state(struct task_struct *task,
> - struct reclaim_state *rs)
> -{
> - /* Check for an overwrite */
> - WARN_ON_ONCE(rs && task->reclaim_state);
> -
> - /* Check for the nulling of an already-nulled member */
> - WARN_ON_ONCE(!rs && !task->reclaim_state);
> -
> - task->reclaim_state = rs;
> -}
> -
> LIST_HEAD(shrinker_list);
> DECLARE_RWSEM(shrinker_rwsem);
>
> @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> }
> #endif
>
> +static void set_task_reclaim_state(struct task_struct *task,
> + struct reclaim_state *rs)
> +{
> + /* Check for an overwrite */
> + WARN_ON_ONCE(rs && task->reclaim_state);
> +
> + /* Check for the nulling of an already-nulled member */
> + WARN_ON_ONCE(!rs && !task->reclaim_state);
> +
> + task->reclaim_state = rs;
> +}
> +
> +/*
> + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> + * scan_control->nr_reclaimed.
> + */
> +static void flush_reclaim_state(struct scan_control *sc,
> + struct reclaim_state *rs)
> +{
> + /*
> + * Currently, reclaim_state->reclaimed includes three types of pages
> + * freed outside of vmscan:
> + * (1) Slab pages.
> + * (2) Clean file pages from pruned inodes.
> + * (3) XFS freed buffer pages.
> + *
> + * For all of these cases, we have no way of finding out whether these
> + * pages were related to the memcg under reclaim. For example, a freed
> + * slab page could have had only a single object charged to the memcg
Minor nits:
s/could have had/could have
> + * under reclaim. Also, populated inodes are not on shrinker LRUs
> + * anymore except on highmem systems.
> + *
> + * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> + * only count such pages in global reclaim. This prevents unnecessary
May be clearer to say:
This prevents under-reclaimaing the target memcg, and unnecessary
> + * retries during memcg charging and false positive from proactive
> + * reclaim (memory.reclaim).
> + *
Tim
On Thu, Apr 6, 2023 at 10:32 AM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> > During reclaim, we keep track of pages reclaimed from other means than
> > LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab,
> > which we stash a pointer to in current task_struct.
> >
> > However, we keep track of more than just reclaimed slab pages through
> > this. We also use it for clean file pages dropped through pruned inodes,
> > and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
> > a helper function that wraps updating it through current, so that future
> > changes to this logic are contained within mm/vmscan.c.
> >
> > Additionally, add a flush_reclaim_state() helper to wrap using
> > reclaim_state->reclaimed to updated sc->nr_reclaimed, and use that
> > helper to add an elaborate comment about why we only do the update for
> > global reclaim.
> >
> > Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
> > that all reclaim_state helpers are in close proximity for easier
> > readability.
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> > fs/inode.c | 3 +-
> > fs/xfs/xfs_buf.c | 3 +-
> > include/linux/swap.h | 17 +++++++++-
> > mm/slab.c | 3 +-
> > mm/slob.c | 6 ++--
> > mm/slub.c | 5 ++-
> > mm/vmscan.c | 75 ++++++++++++++++++++++++++++++++------------
> > 7 files changed, 78 insertions(+), 34 deletions(-)
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 4558dc2f13557..e60fcc41faf17 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> > __count_vm_events(KSWAPD_INODESTEAL, reap);
> > else
> > __count_vm_events(PGINODESTEAL, reap);
> > - if (current->reclaim_state)
> > - current->reclaim_state->reclaimed_slab += reap;
> > + mm_account_reclaimed_pages(reap);
> > }
> > iput(inode);
> > spin_lock(lru_lock);
> > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > index 54c774af6e1c6..15d1e5a7c2d34 100644
> > --- a/fs/xfs/xfs_buf.c
> > +++ b/fs/xfs/xfs_buf.c
> > @@ -286,8 +286,7 @@ xfs_buf_free_pages(
> > if (bp->b_pages[i])
> > __free_page(bp->b_pages[i]);
> > }
> > - if (current->reclaim_state)
> > - current->reclaim_state->reclaimed_slab += bp->b_page_count;
> > + mm_account_reclaimed_pages(bp->b_page_count);
> >
> > if (bp->b_pages != bp->b_page_array)
> > kmem_free(bp->b_pages);
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 209a425739a9f..e131ac155fb95 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -153,13 +153,28 @@ union swap_header {
> > * memory reclaim
> > */
> > struct reclaim_state {
> > - unsigned long reclaimed_slab;
> > + /* pages reclaimed outside of LRU-based reclaim */
> > + unsigned long reclaimed;
> > #ifdef CONFIG_LRU_GEN
> > /* per-thread mm walk data */
> > struct lru_gen_mm_walk *mm_walk;
> > #endif
> > };
> >
> > +/*
> > + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
> > + * reclaim
> > + * @pages: number of pages reclaimed
> > + *
> > + * If the current process is undergoing a reclaim operation, increment the
> > + * number of reclaimed pages by @pages.
> > + */
> > +static inline void mm_account_reclaimed_pages(unsigned long pages)
> > +{
> > + if (current->reclaim_state)
> > + current->reclaim_state->reclaimed += pages;
> > +}
> > +
> > #ifdef __KERNEL__
> >
> > struct address_space;
> > diff --git a/mm/slab.c b/mm/slab.c
> > index dabc2a671fc6f..64bf1de817b24 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
> > smp_wmb();
> > __folio_clear_slab(folio);
> >
> > - if (current->reclaim_state)
> > - current->reclaim_state->reclaimed_slab += 1 << order;
> > + mm_account_reclaimed_pages(1 << order);
> > unaccount_slab(slab, order, cachep);
> > __free_pages(&folio->page, order);
> > }
> > diff --git a/mm/slob.c b/mm/slob.c
> > index fe567fcfa3a39..79cc8680c973c 100644
> > --- a/mm/slob.c
> > +++ b/mm/slob.c
> > @@ -61,7 +61,7 @@
> > #include <linux/slab.h>
> >
> > #include <linux/mm.h>
> > -#include <linux/swap.h> /* struct reclaim_state */
> > +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> > #include <linux/cache.h>
> > #include <linux/init.h>
> > #include <linux/export.h>
> > @@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
> > {
> > struct page *sp = virt_to_page(b);
> >
> > - if (current->reclaim_state)
> > - current->reclaim_state->reclaimed_slab += 1 << order;
> > -
> > + mm_account_reclaimed_pages(1 << order);
> > mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
> > -(PAGE_SIZE << order));
> > __free_pages(sp, order);
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 39327e98fce34..7aa30eef82350 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -11,7 +11,7 @@
> > */
> >
> > #include <linux/mm.h>
> > -#include <linux/swap.h> /* struct reclaim_state */
> > +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> > #include <linux/module.h>
> > #include <linux/bit_spinlock.h>
> > #include <linux/interrupt.h>
> > @@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
> > /* Make the mapping reset visible before clearing the flag */
> > smp_wmb();
> > __folio_clear_slab(folio);
> > - if (current->reclaim_state)
> > - current->reclaim_state->reclaimed_slab += pages;
> > + mm_account_reclaimed_pages(pages);
> > unaccount_slab(slab, order, s);
> > __free_pages(&folio->page, order);
> > }
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index c82bd89f90364..049e39202e6ce 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -188,18 +188,6 @@ struct scan_control {
> > */
> > int vm_swappiness = 60;
> >
> > -static void set_task_reclaim_state(struct task_struct *task,
> > - struct reclaim_state *rs)
> > -{
> > - /* Check for an overwrite */
> > - WARN_ON_ONCE(rs && task->reclaim_state);
> > -
> > - /* Check for the nulling of an already-nulled member */
> > - WARN_ON_ONCE(!rs && !task->reclaim_state);
> > -
> > - task->reclaim_state = rs;
> > -}
> > -
> > LIST_HEAD(shrinker_list);
> > DECLARE_RWSEM(shrinker_rwsem);
> >
> > @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> > }
> > #endif
> >
> > +static void set_task_reclaim_state(struct task_struct *task,
> > + struct reclaim_state *rs)
> > +{
> > + /* Check for an overwrite */
> > + WARN_ON_ONCE(rs && task->reclaim_state);
> > +
> > + /* Check for the nulling of an already-nulled member */
> > + WARN_ON_ONCE(!rs && !task->reclaim_state);
> > +
> > + task->reclaim_state = rs;
> > +}
> > +
> > +/*
> > + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> > + * scan_control->nr_reclaimed.
> > + */
> > +static void flush_reclaim_state(struct scan_control *sc,
> > + struct reclaim_state *rs)
> > +{
> > + /*
> > + * Currently, reclaim_state->reclaimed includes three types of pages
> > + * freed outside of vmscan:
> > + * (1) Slab pages.
> > + * (2) Clean file pages from pruned inodes.
> > + * (3) XFS freed buffer pages.
> > + *
> > + * For all of these cases, we have no way of finding out whether these
> > + * pages were related to the memcg under reclaim. For example, a freed
> > + * slab page could have had only a single object charged to the memcg
>
> Minor nits:
> s/could have had/could have
>
> > + * under reclaim. Also, populated inodes are not on shrinker LRUs
> > + * anymore except on highmem systems.
> > + *
> > + * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> > + * only count such pages in global reclaim. This prevents unnecessary
>
> May be clearer to say:
> This prevents under-reclaimaing the target memcg, and unnecessary
Thanks, will rephrase for the next version!
>
> > + * retries during memcg charging and false positive from proactive
> > + * reclaim (memory.reclaim).
> > + *
>
> Tim
>
On Thu, Apr 06, 2023 at 10:31:53AM -0700, Tim Chen wrote:
> On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> > + * For all of these cases, we have no way of finding out whether these
> > + * pages were related to the memcg under reclaim. For example, a freed
> > + * slab page could have had only a single object charged to the memcg
>
> Minor nits:
> s/could have had/could have
No ... "could have had" is correct. I'm a native English speaker, so I
have no idea what the rule here is, but I can ask my linguist wife later
if you want to know ;-)
Maybe it's something like this:
https://www.englishgrammar.org/have-had-and-had-had/
Hi, Yosry,
On Wed, Apr 05, 2023 at 06:54:27PM +0000, Yosry Ahmed wrote:
[...]
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c82bd89f90364..049e39202e6ce 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -188,18 +188,6 @@ struct scan_control {
> */
> int vm_swappiness = 60;
>
> -static void set_task_reclaim_state(struct task_struct *task,
> - struct reclaim_state *rs)
> -{
> - /* Check for an overwrite */
> - WARN_ON_ONCE(rs && task->reclaim_state);
> -
> - /* Check for the nulling of an already-nulled member */
> - WARN_ON_ONCE(!rs && !task->reclaim_state);
> -
> - task->reclaim_state = rs;
> -}
> -
> LIST_HEAD(shrinker_list);
> DECLARE_RWSEM(shrinker_rwsem);
>
> @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> }
> #endif
>
> +static void set_task_reclaim_state(struct task_struct *task,
> + struct reclaim_state *rs)
> +{
> + /* Check for an overwrite */
> + WARN_ON_ONCE(rs && task->reclaim_state);
> +
> + /* Check for the nulling of an already-nulled member */
> + WARN_ON_ONCE(!rs && !task->reclaim_state);
> +
> + task->reclaim_state = rs;
> +}
Nit: I just think such movement not necessary while it loses the "git
blame" information easily.
Instead of moving this here without major benefit, why not just define
flush_reclaim_state() right after previous set_task_reclaim_state()?
> +
> +/*
> + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> + * scan_control->nr_reclaimed.
> + */
> +static void flush_reclaim_state(struct scan_control *sc,
> + struct reclaim_state *rs)
> +{
> + /*
> + * Currently, reclaim_state->reclaimed includes three types of pages
> + * freed outside of vmscan:
> + * (1) Slab pages.
> + * (2) Clean file pages from pruned inodes.
> + * (3) XFS freed buffer pages.
> + *
> + * For all of these cases, we have no way of finding out whether these
> + * pages were related to the memcg under reclaim. For example, a freed
> + * slab page could have had only a single object charged to the memcg
> + * under reclaim. Also, populated inodes are not on shrinker LRUs
> + * anymore except on highmem systems.
> + *
> + * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> + * only count such pages in global reclaim. This prevents unnecessary
> + * retries during memcg charging and false positive from proactive
> + * reclaim (memory.reclaim).
> + *
> + * For uncommon cases were the freed pages were actually significantly
> + * charged to the memcg under reclaim, and we end up under-reporting, it
> + * should be fine. The freed pages will be uncharged anyway, even if
> + * they are not reported properly, and we will be able to make forward
> + * progress in charging (which is usually in a retry loop).
> + *
> + * We can go one step further, and report the uncharged objcg pages in
> + * memcg reclaim, to make reporting more accurate and reduce
> + * under-reporting, but it's probably not worth the complexity for now.
> + */
> + if (rs && global_reclaim(sc)) {
> + sc->nr_reclaimed += rs->reclaimed;
> + rs->reclaimed = 0;
> + }
> +}
> +
> static long xchg_nr_deferred(struct shrinker *shrinker,
> struct shrink_control *sc)
> {
> @@ -5346,10 +5387,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
> sc->nr_reclaimed - reclaimed);
>
> - if (global_reclaim(sc)) {
> - sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
> - current->reclaim_state->reclaimed_slab = 0;
> - }
> + flush_reclaim_state(sc, current->reclaim_state);
>
> return success ? MEMCG_LRU_YOUNG : 0;
> }
> @@ -6474,10 +6512,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>
> shrink_node_memcgs(pgdat, sc);
>
> - if (reclaim_state && global_reclaim(sc)) {
> - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> - reclaim_state->reclaimed_slab = 0;
> - }
> + flush_reclaim_state(sc, reclaim_state);
IIUC reclaim_state here still points to current->reclaim_state. Could it
change at all?
Is it cleaner to make flush_reclaim_state() taking "sc" only if it always
references current->reclaim_state?
>
> /* Record the subtree's reclaim efficiency */
> if (!sc->proactive)
> --
> 2.40.0.348.gf938b09366-goog
>
On Thu, Apr 6, 2023 at 1:45 PM Peter Xu <peterx@redhat.com> wrote:
>
> Hi, Yosry,
>
> On Wed, Apr 05, 2023 at 06:54:27PM +0000, Yosry Ahmed wrote:
>
> [...]
>
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index c82bd89f90364..049e39202e6ce 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -188,18 +188,6 @@ struct scan_control {
> > */
> > int vm_swappiness = 60;
> >
> > -static void set_task_reclaim_state(struct task_struct *task,
> > - struct reclaim_state *rs)
> > -{
> > - /* Check for an overwrite */
> > - WARN_ON_ONCE(rs && task->reclaim_state);
> > -
> > - /* Check for the nulling of an already-nulled member */
> > - WARN_ON_ONCE(!rs && !task->reclaim_state);
> > -
> > - task->reclaim_state = rs;
> > -}
> > -
> > LIST_HEAD(shrinker_list);
> > DECLARE_RWSEM(shrinker_rwsem);
> >
> > @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> > }
> > #endif
> >
> > +static void set_task_reclaim_state(struct task_struct *task,
> > + struct reclaim_state *rs)
> > +{
> > + /* Check for an overwrite */
> > + WARN_ON_ONCE(rs && task->reclaim_state);
> > +
> > + /* Check for the nulling of an already-nulled member */
> > + WARN_ON_ONCE(!rs && !task->reclaim_state);
> > +
> > + task->reclaim_state = rs;
> > +}
>
> Nit: I just think such movement not necessary while it loses the "git
> blame" information easily.
>
> Instead of moving this here without major benefit, why not just define
> flush_reclaim_state() right after previous set_task_reclaim_state()?
An earlier version did that, but we would have to add a forward
declaration of global_reclaim() (or cgroup_reclaim()), as they are
defined after the previous position of set_task_reclaim_state().
>
> > +
> > +/*
> > + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> > + * scan_control->nr_reclaimed.
> > + */
> > +static void flush_reclaim_state(struct scan_control *sc,
> > + struct reclaim_state *rs)
> > +{
> > + /*
> > + * Currently, reclaim_state->reclaimed includes three types of pages
> > + * freed outside of vmscan:
> > + * (1) Slab pages.
> > + * (2) Clean file pages from pruned inodes.
> > + * (3) XFS freed buffer pages.
> > + *
> > + * For all of these cases, we have no way of finding out whether these
> > + * pages were related to the memcg under reclaim. For example, a freed
> > + * slab page could have had only a single object charged to the memcg
> > + * under reclaim. Also, populated inodes are not on shrinker LRUs
> > + * anymore except on highmem systems.
> > + *
> > + * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> > + * only count such pages in global reclaim. This prevents unnecessary
> > + * retries during memcg charging and false positive from proactive
> > + * reclaim (memory.reclaim).
> > + *
> > + * For uncommon cases were the freed pages were actually significantly
> > + * charged to the memcg under reclaim, and we end up under-reporting, it
> > + * should be fine. The freed pages will be uncharged anyway, even if
> > + * they are not reported properly, and we will be able to make forward
> > + * progress in charging (which is usually in a retry loop).
> > + *
> > + * We can go one step further, and report the uncharged objcg pages in
> > + * memcg reclaim, to make reporting more accurate and reduce
> > + * under-reporting, but it's probably not worth the complexity for now.
> > + */
> > + if (rs && global_reclaim(sc)) {
> > + sc->nr_reclaimed += rs->reclaimed;
> > + rs->reclaimed = 0;
> > + }
> > +}
> > +
> > static long xchg_nr_deferred(struct shrinker *shrinker,
> > struct shrink_control *sc)
> > {
> > @@ -5346,10 +5387,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> > vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
> > sc->nr_reclaimed - reclaimed);
> >
> > - if (global_reclaim(sc)) {
> > - sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
> > - current->reclaim_state->reclaimed_slab = 0;
> > - }
> > + flush_reclaim_state(sc, current->reclaim_state);
> >
> > return success ? MEMCG_LRU_YOUNG : 0;
> > }
> > @@ -6474,10 +6512,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> >
> > shrink_node_memcgs(pgdat, sc);
> >
> > - if (reclaim_state && global_reclaim(sc)) {
> > - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > - reclaim_state->reclaimed_slab = 0;
> > - }
> > + flush_reclaim_state(sc, reclaim_state);
>
> IIUC reclaim_state here still points to current->reclaim_state. Could it
> change at all?
>
> Is it cleaner to make flush_reclaim_state() taking "sc" only if it always
> references current->reclaim_state?
Good point. I think it's always current->reclaim_state.
I think we can make flush_reclaim_state() only take "sc" as an
argument, and remove the "reclaim_state" local variable in
shrink_node() completely.
>
> >
> > /* Record the subtree's reclaim efficiency */
> > if (!sc->proactive)
> > --
> > 2.40.0.348.gf938b09366-goog
> >
>
> --
> Peter Xu
>
@@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += reap;
+ mm_account_reclaimed_pages(reap);
}
iput(inode);
spin_lock(lru_lock);
@@ -286,8 +286,7 @@ xfs_buf_free_pages(
if (bp->b_pages[i])
__free_page(bp->b_pages[i]);
}
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += bp->b_page_count;
+ mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
kmem_free(bp->b_pages);
@@ -153,13 +153,28 @@ union swap_header {
* memory reclaim
*/
struct reclaim_state {
- unsigned long reclaimed_slab;
+ /* pages reclaimed outside of LRU-based reclaim */
+ unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
};
+/*
+ * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
+ * reclaim
+ * @pages: number of pages reclaimed
+ *
+ * If the current process is undergoing a reclaim operation, increment the
+ * number of reclaimed pages by @pages.
+ */
+static inline void mm_account_reclaimed_pages(unsigned long pages)
+{
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed += pages;
+}
+
#ifdef __KERNEL__
struct address_space;
@@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
smp_wmb();
__folio_clear_slab(folio);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += 1 << order;
+ mm_account_reclaimed_pages(1 << order);
unaccount_slab(slab, order, cachep);
__free_pages(&folio->page, order);
}
@@ -61,7 +61,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/export.h>
@@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
{
struct page *sp = virt_to_page(b);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += 1 << order;
-
+ mm_account_reclaimed_pages(1 << order);
mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
__free_pages(sp, order);
@@ -11,7 +11,7 @@
*/
#include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
@@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
/* Make the mapping reset visible before clearing the flag */
smp_wmb();
__folio_clear_slab(folio);
- if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += pages;
+ mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
__free_pages(&folio->page, order);
}
@@ -188,18 +188,6 @@ struct scan_control {
*/
int vm_swappiness = 60;
-static void set_task_reclaim_state(struct task_struct *task,
- struct reclaim_state *rs)
-{
- /* Check for an overwrite */
- WARN_ON_ONCE(rs && task->reclaim_state);
-
- /* Check for the nulling of an already-nulled member */
- WARN_ON_ONCE(!rs && !task->reclaim_state);
-
- task->reclaim_state = rs;
-}
-
LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);
@@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
}
#endif
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
+/*
+ * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
+ * scan_control->nr_reclaimed.
+ */
+static void flush_reclaim_state(struct scan_control *sc,
+ struct reclaim_state *rs)
+{
+ /*
+ * Currently, reclaim_state->reclaimed includes three types of pages
+ * freed outside of vmscan:
+ * (1) Slab pages.
+ * (2) Clean file pages from pruned inodes.
+ * (3) XFS freed buffer pages.
+ *
+ * For all of these cases, we have no way of finding out whether these
+ * pages were related to the memcg under reclaim. For example, a freed
+ * slab page could have had only a single object charged to the memcg
+ * under reclaim. Also, populated inodes are not on shrinker LRUs
+ * anymore except on highmem systems.
+ *
+ * Instead of over-reporting the reclaimed pages in a memcg reclaim,
+ * only count such pages in global reclaim. This prevents unnecessary
+ * retries during memcg charging and false positive from proactive
+ * reclaim (memory.reclaim).
+ *
+ * For uncommon cases were the freed pages were actually significantly
+ * charged to the memcg under reclaim, and we end up under-reporting, it
+ * should be fine. The freed pages will be uncharged anyway, even if
+ * they are not reported properly, and we will be able to make forward
+ * progress in charging (which is usually in a retry loop).
+ *
+ * We can go one step further, and report the uncharged objcg pages in
+ * memcg reclaim, to make reporting more accurate and reduce
+ * under-reporting, but it's probably not worth the complexity for now.
+ */
+ if (rs && global_reclaim(sc)) {
+ sc->nr_reclaimed += rs->reclaimed;
+ rs->reclaimed = 0;
+ }
+}
+
static long xchg_nr_deferred(struct shrinker *shrinker,
struct shrink_control *sc)
{
@@ -5346,10 +5387,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- if (global_reclaim(sc)) {
- sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
- current->reclaim_state->reclaimed_slab = 0;
- }
+ flush_reclaim_state(sc, current->reclaim_state);
return success ? MEMCG_LRU_YOUNG : 0;
}
@@ -6474,10 +6512,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcgs(pgdat, sc);
- if (reclaim_state && global_reclaim(sc)) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ flush_reclaim_state(sc, reclaim_state);
/* Record the subtree's reclaim efficiency */
if (!sc->proactive)