[v3,2/2] mm: memcg: introduce new event to trace shrink_memcg

Message ID 20231123193937.11628-3-ddrokosov@salutedevices.com
State New
Series mm: memcg: improve vmscan tracepoints

Commit Message

Dmitry Rokosov Nov. 23, 2023, 7:39 p.m. UTC
  The shrink_memcg flow plays a crucial role in memcg reclamation.
Currently, it is not possible to trace this point from non-direct
reclaim paths. However, direct reclaim has its own tracepoint, so there
is no issue there. In certain cases, when debugging memcg pressure,
developers may need to identify all potential requests for memcg
reclamation including kswapd(). The patchset introduces the tracepoints
mm_vmscan_memcg_shrink_{begin|end}() to address this problem.

Example of output in the kswapd context (non-direct reclaim):
    kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
    kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
    kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
    kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
    kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
    kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
    kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16

Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>
---
 include/trace/events/vmscan.h | 22 ++++++++++++++++++++++
 mm/vmscan.c                   |  7 +++++++
 2 files changed, 29 insertions(+)
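
Once applied, the new events can be enabled through tracefs like any
other vmscan tracepoint. Below is a minimal userspace sketch, not part
of the patch, assuming tracefs is mounted at /sys/kernel/tracing, root
privileges, and illustrative buffer sizes:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	const char *base = "/sys/kernel/tracing";
	char path[256], line[512];
	FILE *tp;

	/* Enable both new events, then stream the trace pipe. */
	snprintf(path, sizeof(path),
		 "%s/events/vmscan/mm_vmscan_memcg_shrink_begin/enable", base);
	if (write_str(path, "1"))
		return 1;
	snprintf(path, sizeof(path),
		 "%s/events/vmscan/mm_vmscan_memcg_shrink_end/enable", base);
	if (write_str(path, "1"))
		return 1;

	snprintf(path, sizeof(path), "%s/trace_pipe", base);
	tp = fopen(path, "r");
	if (!tp)
		return 1;
	while (fgets(line, sizeof(line), tp))
		fputs(line, stdout);	/* lines like the example above */
	fclose(tp);
	return 0;
}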
  

Comments

Shakeel Butt Nov. 25, 2023, 6:36 a.m. UTC | #1
On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
> The shrink_memcg flow plays a crucial role in memcg reclamation.
> Currently, it is not possible to trace this point from non-direct
> reclaim paths. However, direct reclaim has its own tracepoint, so there
> is no issue there. In certain cases, when debugging memcg pressure,
> developers may need to identify all potential requests for memcg
> reclamation including kswapd(). The patchset introduces the tracepoints
> mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> 
> Example of output in the kswapd context (non-direct reclaim):
>     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
>     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
>     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
>     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
>     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
>     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16
> 
> Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>
> ---
>  include/trace/events/vmscan.h | 22 ++++++++++++++++++++++
>  mm/vmscan.c                   |  7 +++++++
>  2 files changed, 29 insertions(+)
> 
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index e9093fa1c924..a4686afe571d 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -180,6 +180,17 @@ DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_softlimit_r
>  	TP_ARGS(order, gfp_flags, memcg)
>  );
>  
> +DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_shrink_begin,
> +
> +	TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
> +
> +	TP_ARGS(order, gfp_flags, memcg)
> +);
> +
> +#else
> +
> +#define trace_mm_vmscan_memcg_shrink_begin(...)
> +
>  #endif /* CONFIG_MEMCG */
>  
>  DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
> @@ -243,6 +254,17 @@ DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_softlimit_rec
>  	TP_ARGS(nr_reclaimed, memcg)
>  );
>  
> +DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_shrink_end,
> +
> +	TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
> +
> +	TP_ARGS(nr_reclaimed, memcg)
> +);
> +
> +#else
> +
> +#define trace_mm_vmscan_memcg_shrink_end(...)
> +
>  #endif /* CONFIG_MEMCG */
>  
>  TRACE_EVENT(mm_shrink_slab_start,
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 45780952f4b5..f7e3ddc5a7ad 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6461,6 +6461,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>  		 */
>  		cond_resched();
>  
> +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> +						   sc->gfp_mask,
> +						   memcg);
> +

If you place the start of the trace here, you may have only the begin
trace for memcgs whose usage is below their min or low limits. Is that
fine? Otherwise you can put it just before the shrink_lruvec() call.

>  		mem_cgroup_calculate_protection(target_memcg, memcg);
>  
>  		if (mem_cgroup_below_min(target_memcg, memcg)) {
> @@ -6491,6 +6495,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
>  		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
>  			    sc->priority);
>  
> +		trace_mm_vmscan_memcg_shrink_end(sc->nr_reclaimed - reclaimed,
> +						 memcg);
> +
>  		/* Record the group's reclaim efficiency */
>  		if (!sc->proactive)
>  			vmpressure(sc->gfp_mask, memcg, false,
> -- 
> 2.36.0
>
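
For illustration, the alternative placement suggested above (firing
begin() only for memcgs that survive the min/low protection checks)
would look roughly like this; a sketch against the quoted hunk, not
something that was posted:

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		/*
		 * begin() fires here only for memcgs that actually
		 * reach the reclaim call.
		 */
		trace_mm_vmscan_memcg_shrink_begin(sc->order,
						   sc->gfp_mask,
						   memcg);

		shrink_lruvec(lruvec, sc);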
  
Dmitry Rokosov Nov. 25, 2023, 8:01 a.m. UTC | #2
On Sat, Nov 25, 2023 at 06:36:16AM +0000, Shakeel Butt wrote:
> On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
> > The shrink_memcg flow plays a crucial role in memcg reclamation.
> > Currently, it is not possible to trace this point from non-direct
> > reclaim paths. However, direct reclaim has its own tracepoint, so there
> > is no issue there. In certain cases, when debugging memcg pressure,
> > developers may need to identify all potential requests for memcg
> > reclamation including kswapd(). The patchset introduces the tracepoints
> > mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> > 
> > Example of output in the kswapd context (non-direct reclaim):
> >     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
> >     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
> >     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
> >     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
> >     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
> >     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16
> > 
> > Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>
> > ---
> >  include/trace/events/vmscan.h | 22 ++++++++++++++++++++++
> >  mm/vmscan.c                   |  7 +++++++
> >  2 files changed, 29 insertions(+)
> > 
> > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> > index e9093fa1c924..a4686afe571d 100644
> > --- a/include/trace/events/vmscan.h
> > +++ b/include/trace/events/vmscan.h
> > @@ -180,6 +180,17 @@ DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_softlimit_r
> >  	TP_ARGS(order, gfp_flags, memcg)
> >  );
> >  
> > +DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_shrink_begin,
> > +
> > +	TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
> > +
> > +	TP_ARGS(order, gfp_flags, memcg)
> > +);
> > +
> > +#else
> > +
> > +#define trace_mm_vmscan_memcg_shrink_begin(...)
> > +
> >  #endif /* CONFIG_MEMCG */
> >  
> >  DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
> > @@ -243,6 +254,17 @@ DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_softlimit_rec
> >  	TP_ARGS(nr_reclaimed, memcg)
> >  );
> >  
> > +DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_shrink_end,
> > +
> > +	TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
> > +
> > +	TP_ARGS(nr_reclaimed, memcg)
> > +);
> > +
> > +#else
> > +
> > +#define trace_mm_vmscan_memcg_shrink_end(...)
> > +
> >  #endif /* CONFIG_MEMCG */
> >  
> >  TRACE_EVENT(mm_shrink_slab_start,
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 45780952f4b5..f7e3ddc5a7ad 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -6461,6 +6461,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> >  		 */
> >  		cond_resched();
> >  
> > +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> > +						   sc->gfp_mask,
> > +						   memcg);
> > +
> 
> If you place the start of the trace here, you may have only the begin
> trace for memcgs whose usage is below their min or low limits. Is that
> fine? Otherwise you can put it just before the shrink_lruvec() call.
> 

From my point of view, it's fine. For situations like the one you
described, when we see only the begin() tracepoint raised without the
end(), we understand that reclaim requests are being made but cannot be
satisfied due to certain conditions within the memcg (such as limits).

There may be some tracepoint spam in the trace pipe, which is a
disadvantage of this approach.

How important do you think it is to understand such situations? Or do
you suggest moving the begin() tracepoint after the memcg limit checks
and not worrying about it?

> >  		mem_cgroup_calculate_protection(target_memcg, memcg);
> >  
> >  		if (mem_cgroup_below_min(target_memcg, memcg)) {
> > @@ -6491,6 +6495,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> >  		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> >  			    sc->priority);
> >  
> > +		trace_mm_vmscan_memcg_shrink_end(sc->nr_reclaimed - reclaimed,
> > +						 memcg);
> > +
> >  		/* Record the group's reclaim efficiency */
> >  		if (!sc->proactive)
> >  			vmpressure(sc->gfp_mask, memcg, false,
> > -- 
> > 2.36.0
> >
  
Shakeel Butt Nov. 25, 2023, 5:38 p.m. UTC | #3
On Sat, Nov 25, 2023 at 11:01:37AM +0300, Dmitry Rokosov wrote:
[...]
> > > +		trace_mm_vmscan_memcg_shrink_begin(sc->order,
> > > +						   sc->gfp_mask,
> > > +						   memcg);
> > > +
> > 
> > If you place the start of the trace here, you may have only the begin
> > trace for memcgs whose usage is below their min or low limits. Is that
> > fine? Otherwise you can put it just before the shrink_lruvec() call.
> > 
> 
> From my point of view, it's fine. For situations like the one you
> described, when we see only the begin() tracepoint raised without the
> end(), we understand that reclaim requests are being made but cannot be
> satisfied due to certain conditions within the memcg (such as limits).
> 
> There may be some tracepoint spam in the trace pipe, which is a
> disadvantage of this approach.
> 
> How important do you think it is to understand such situations? Or do
> you suggest moving the begin() tracepoint after the memcg limit checks
> and not worrying about it?
> 

I was mainly wondering if that is intentional. It seems like you, as
the first user of this trace, need to know that a reclaim for a given
memcg was triggered but, due to min/low limits, no reclaim was done.
This is a totally reasonable use-case.
  
Shakeel Butt Nov. 25, 2023, 5:47 p.m. UTC | #4
On Thu, Nov 23, 2023 at 10:39:37PM +0300, Dmitry Rokosov wrote:
> The shrink_memcg flow plays a crucial role in memcg reclamation.
> Currently, it is not possible to trace this point from non-direct
> reclaim paths. However, direct reclaim has its own tracepoint, so there
> is no issue there. In certain cases, when debugging memcg pressure,
> developers may need to identify all potential requests for memcg
> reclamation including kswapd(). The patchset introduces the tracepoints
> mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> 
> Example of output in the kswapd context (non-direct reclaim):
>     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
>     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
>     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
>     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
>     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
>     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16
> 
> Signed-off-by: Dmitry Rokosov <ddrokosov@salutedevices.com>

Acked-by: Shakeel Butt <shakeelb@google.com>
  
Michal Hocko Nov. 27, 2023, 9:33 a.m. UTC | #5
On Thu 23-11-23 22:39:37, Dmitry Rokosov wrote:
> The shrink_memcg flow plays a crucial role in memcg reclamation.
> Currently, it is not possible to trace this point from non-direct
> reclaim paths. However, direct reclaim has its own tracepoint, so there
> is no issue there. In certain cases, when debugging memcg pressure,
> developers may need to identify all potential requests for memcg
> reclamation including kswapd(). The patchset introduces the tracepoints
> mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> 
> Example of output in the kswapd context (non-direct reclaim):
>     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
>     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
>     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
>     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
>     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
>     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
>     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16

In the previous version I asked why we need this specific tracepoint
when we already have trace_mm_vmscan_lru_shrink_{in}active, which
already gives you very good insight. That includes the number of
reclaimed pages but also more. I do see that we do not include the
memcg id of the reclaimed LRU, but that shouldn't be a big problem to
add, no?
  
Dmitry Rokosov Nov. 27, 2023, 11:36 a.m. UTC | #6
On Mon, Nov 27, 2023 at 10:33:49AM +0100, Michal Hocko wrote:
> On Thu 23-11-23 22:39:37, Dmitry Rokosov wrote:
> > The shrink_memcg flow plays a crucial role in memcg reclamation.
> > Currently, it is not possible to trace this point from non-direct
> > reclaim paths. However, direct reclaim has its own tracepoint, so there
> > is no issue there. In certain cases, when debugging memcg pressure,
> > developers may need to identify all potential requests for memcg
> > reclamation including kswapd(). The patchset introduces the tracepoints
> > mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> > 
> > Example of output in the kswapd context (non-direct reclaim):
> >     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
> >     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
> >     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
> >     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
> >     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
> >     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> >     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16
> 
> In the previous version I asked why we need this specific tracepoint
> when we already have trace_mm_vmscan_lru_shrink_{in}active, which
> already gives you very good insight. That includes the number of
> reclaimed pages but also more. I do see that we do not include the
> memcg id of the reclaimed LRU, but that shouldn't be a big problem to
> add, no?

From my point of view, memcg reclaim consists of two operations: LRU
shrink and slab shrink, as can be seen in vmscan.c:


static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
...
		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		shrink_lruvec(lruvec, sc);

		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);
...

So, both of these operations are important for understanding whether
memcg reclaim was successful and how effective it was. I believe it is
beneficial to account for them together, which is why I created the new
tracepoints.
  
Dmitry Rokosov Nov. 29, 2023, 3:26 p.m. UTC | #7
On Wed, Nov 29, 2023 at 06:20:57PM +0300, Dmitry Rokosov wrote:
> On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
> > On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
> > > On Mon, Nov 27, 2023 at 01:50:22PM +0100, Michal Hocko wrote:
> > > > On Mon 27-11-23 14:36:44, Dmitry Rokosov wrote:
> > > > > On Mon, Nov 27, 2023 at 10:33:49AM +0100, Michal Hocko wrote:
> > > > > > On Thu 23-11-23 22:39:37, Dmitry Rokosov wrote:
> > > > > > > The shrink_memcg flow plays a crucial role in memcg reclamation.
> > > > > > > Currently, it is not possible to trace this point from non-direct
> > > > > > > reclaim paths. However, direct reclaim has its own tracepoint, so there
> > > > > > > is no issue there. In certain cases, when debugging memcg pressure,
> > > > > > > developers may need to identify all potential requests for memcg
> > > > > > > reclamation including kswapd(). The patchset introduces the tracepoints
> > > > > > > mm_vmscan_memcg_shrink_{begin|end}() to address this problem.
> > > > > > > 
> > > > > > > Example of output in the kswapd context (non-direct reclaim):
> > > > > > >     kswapd0-39      [001] .....   240.356378: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356396: mm_vmscan_memcg_shrink_end: nr_reclaimed=0 memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356420: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356454: mm_vmscan_memcg_shrink_end: nr_reclaimed=1 memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356479: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356506: mm_vmscan_memcg_shrink_end: nr_reclaimed=4 memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356525: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356593: mm_vmscan_memcg_shrink_end: nr_reclaimed=11 memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356614: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356738: mm_vmscan_memcg_shrink_end: nr_reclaimed=25 memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.356790: mm_vmscan_memcg_shrink_begin: order=0 gfp_flags=GFP_KERNEL memcg=16
> > > > > > >     kswapd0-39      [001] .....   240.357125: mm_vmscan_memcg_shrink_end: nr_reclaimed=53 memcg=16
> > > > > > 
> > > > > > In the previous version I asked why we need this specific tracepoint
> > > > > > when we already have trace_mm_vmscan_lru_shrink_{in}active, which
> > > > > > already gives you very good insight. That includes the number of
> > > > > > reclaimed pages but also more. I do see that we do not include the
> > > > > > memcg id of the reclaimed LRU, but that shouldn't be a big problem to
> > > > > > add, no?
> > > > > 
> > > > > From my point of view, memcg reclaim consists of two operations: LRU
> > > > > shrink and slab shrink, as can be seen in vmscan.c:
> > > > > 
> > > > > 
> > > > > static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> > > > > ...
> > > > > 		reclaimed = sc->nr_reclaimed;
> > > > > 		scanned = sc->nr_scanned;
> > > > > 
> > > > > 		shrink_lruvec(lruvec, sc);
> > > > > 
> > > > > 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> > > > > 			    sc->priority);
> > > > > ...
> > > > > 
> > > > > So, both of these operations are important for understanding whether
> > > > > memcg reclaim was successful and how effective it was. I believe it is
> > > > > beneficial to account for them together, which is why I created the new
> > > > > tracepoints.
> > > > 
> > > > This sounds like a nice-to-have rather than a must. To put it
> > > > differently: if you make the existing reclaim tracepoints memcg
> > > > aware (print the memcg id), what prevents you from doing the
> > > > analysis you need?
> > > 
> > > You are right, nothing prevents me from making this analysis... but...
> > > 
> > > This approach does have some disadvantages:
> > > 1) It requires more changes to vmscan. At the very least, the memcg
> > > object should be forwarded to all subfunctions for LRU and SLAB
> > > shrinkers.
> > 
> > We should have the lruvec or memcg available. lruvec_memcg() could
> > be used to get the memcg from the lruvec. It might mean more places
> > to add the id, but arguably this would improve them by identifying
> > where the memory has been scanned/reclaimed from.
> >  
> 
> Oh, thank you, didn't see this conversion function before...
> 
> > > 2) With this approach, we will not have the ability to trace a situation
> > > where the kernel is requesting reclaim for a specific memcg, but due to
> > > limits issues, we are unable to run it.
> > 
> > I do not follow. Could you be more specific please?
> > 
> 
> I'm referring to a situation where kswapd() or other kernel mm code
> requests reclaim of pages from a memcg, but the memcg rejects it due
> to its limit checks. This occurs in the shrink_node_memcgs() function.
> 
> ===
> 		mem_cgroup_calculate_protection(target_memcg, memcg);
> 
> 		if (mem_cgroup_below_min(target_memcg, memcg)) {
> 			/*
> 			 * Hard protection.
> 			 * If there is no reclaimable memory, OOM.
> 			 */
> 			continue;
> 		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
> 			/*
> 			 * Soft protection.
> 			 * Respect the protection only as long as
> 			 * there is an unprotected supply
> 			 * of reclaimable memory from other cgroups.
> 			 */
> 			if (!sc->memcg_low_reclaim) {
> 				sc->memcg_low_skipped = 1;
> 				continue;
> 			}
> 			memcg_memory_event(memcg, MEMCG_LOW);
> 		}
> ===
> 
> With separate shrink begin()/end() tracepoints we can detect such a
> problem.
> 
> 
> > > 3) The LRU and SLAB shrinkers are too generic a place to handle
> > > memcg-related tasks. Additionally, memcg can be disabled in the
> > > kernel configuration.
> > 
> > Right. This could all be hidden in the tracing code. You simply do
> > not print the memcg id when the controller is disabled, or just
> > print 0. I do not really see any major problems with that.
> > 
> > I would really prefer to focus on that direction rather than adding
> > another begin/end tracepoint which overlaps with existing begin/end
> > traces and provides much more limited information, because I would
> > bet we will have somebody complaining that mere nr_reclaimed is not
> > sufficient.
> 
> Okay, I will try to prepare a new patch version with memcg printing
> in the lruvec and slab tracepoints.
> 
> Then Andrew should drop the previous patchset, I suppose. Please
> advise on the correct workflow steps here.

Actually, it has already been merged into linux-next... I just checked.
Maybe it would be better to prepare lruvec and slab memcg printing as a
separate patch series?

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=0e7f0c52a76cb22c8633f21bff6e48fabff6016e
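
For reference, the direction discussed above could look roughly like
the sketch below inside the LRU shrinking paths (hypothetical code; the
actual follow-up series may shape it differently):

	/*
	 * The owning memcg can be recovered from the lruvec itself, so
	 * no extra parameter plumbing is needed. When the controller is
	 * disabled, lruvec_memcg() has no memcg to return and 0 can be
	 * reported instead.
	 */
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	unsigned short memcg_id = memcg ? mem_cgroup_id(memcg) : 0;

	/*
	 * ...then the existing trace_mm_vmscan_lru_shrink_{in}active()
	 * events would grow a field that prints memcg_id alongside
	 * their current output.
	 */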
  
Dmitry Rokosov Nov. 29, 2023, 4:57 p.m. UTC | #8
On Wed, Nov 29, 2023 at 05:06:37PM +0100, Michal Hocko wrote:
> On Wed 29-11-23 18:20:57, Dmitry Rokosov wrote:
> > On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
> > > On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
> [...]
> > > > 2) With this approach, we will not have the ability to trace a situation
> > > > where the kernel is requesting reclaim for a specific memcg, but due to
> > > > limits issues, we are unable to run it.
> > > 
> > > I do not follow. Could you be more specific please?
> > > 
> > 
> > I'm referring to a situation where kswapd() or other kernel mm code
> > requests reclaim of pages from a memcg, but the memcg rejects it due
> > to its limit checks. This occurs in the shrink_node_memcgs() function.
> 
> Ohh, you mean reclaim protection
> 
> > ===
> > 		mem_cgroup_calculate_protection(target_memcg, memcg);
> > 
> > 		if (mem_cgroup_below_min(target_memcg, memcg)) {
> > 			/*
> > 			 * Hard protection.
> > 			 * If there is no reclaimable memory, OOM.
> > 			 */
> > 			continue;
> > 		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
> > 			/*
> > 			 * Soft protection.
> > 			 * Respect the protection only as long as
> > 			 * there is an unprotected supply
> > 			 * of reclaimable memory from other cgroups.
> > 			 */
> > 			if (!sc->memcg_low_reclaim) {
> > 				sc->memcg_low_skipped = 1;
> > 				continue;
> > 			}
> > 			memcg_memory_event(memcg, MEMCG_LOW);
> > 		}
> > ===
> > 
> > With separate shrink begin()/end() tracepoints we can detect such a
> > problem.
> 
> How? You are only reporting the number of reclaimed pages, and zero
> reclaimed pages can result not just from low/min limits but from other
> reasons as well. You would also need to report the number of
> scanned/isolated pages.
>  

From my perspective, if memory control group (memcg) protection
restrictions kick in, we can identify them by a begin() tracepoint that
has no matching end(). Other causes will have both tracepoints raised.

> > > > 3) The LRU and SLAB shrinkers are too generic a place to handle
> > > > memcg-related tasks. Additionally, memcg can be disabled in the
> > > > kernel configuration.
> > > 
> > > Right. This could all be hidden in the tracing code. You simply do
> > > not print the memcg id when the controller is disabled, or just
> > > print 0. I do not really see any major problems with that.
> > > 
> > > I would really prefer to focus on that direction rather than adding
> > > another begin/end tracepoint which overlaps with existing begin/end
> > > traces and provides much more limited information, because I would
> > > bet we will have somebody complaining that mere nr_reclaimed is not
> > > sufficient.
> > 
> > Okay, I will try to prepare a new patch version with memcg printing
> > in the lruvec and slab tracepoints.
> > 
> > Then Andrew should drop the previous patchset, I suppose. Please
> > advise on the correct workflow steps here.
> 
> Andrew usually just drops the patch from his tree and it will
> disappear from linux-next as well.

Okay, I understand, thank you!

Andrew, could you please take a look? I am planning to prepare a new
patch version based on Michal's suggestion, so the previous one should
be dropped.
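
For what it's worth, pairing the two events in userspace is simple.
Below is a hypothetical helper that reads trace_pipe output on stdin
and flags a begin() that is followed by another begin() for the same
memcg id with no end() in between; as the following reply points out,
lost events make this heuristic unreliable:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_MEMCG_ID 65536

int main(void)
{
	/* pending[id] != 0 means begin() was seen with no end() yet */
	static char pending[MAX_MEMCG_ID];
	char line[1024];
	char *p;
	int id;

	while (fgets(line, sizeof(line), stdin)) {
		p = strstr(line, "memcg=");
		if (!p)
			continue;
		id = atoi(p + strlen("memcg="));
		if (id < 0 || id >= MAX_MEMCG_ID)
			continue;
		if (strstr(line, "mm_vmscan_memcg_shrink_begin:")) {
			if (pending[id])
				printf("memcg=%d: begin() with no end(), reclaim likely skipped\n",
				       id);
			pending[id] = 1;
		} else if (strstr(line, "mm_vmscan_memcg_shrink_end:")) {
			pending[id] = 0;
		}
	}
	return 0;
}

Fed the example output from the commit message, it flags nothing, since
every begin() there has a matching end().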
  
Michal Hocko Nov. 29, 2023, 5:10 p.m. UTC | #9
On Wed 29-11-23 19:57:52, Dmitry Rokosov wrote:
> On Wed, Nov 29, 2023 at 05:06:37PM +0100, Michal Hocko wrote:
> > On Wed 29-11-23 18:20:57, Dmitry Rokosov wrote:
> > > On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
> > > > On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
> > [...]
> > > > > 2) With this approach, we will not have the ability to trace a situation
> > > > > where the kernel is requesting reclaim for a specific memcg, but due to
> > > > > limits issues, we are unable to run it.
> > > > 
> > > > I do not follow. Could you be more specific please?
> > > > 
> > > 
> > > I'm referring to a situation where kswapd() or other kernel mm code
> > > requests reclaim of pages from a memcg, but the memcg rejects it due
> > > to its limit checks. This occurs in the shrink_node_memcgs() function.
> > 
> > Ohh, you mean reclaim protection
> > 
> > > ===
> > > 		mem_cgroup_calculate_protection(target_memcg, memcg);
> > > 
> > > 		if (mem_cgroup_below_min(target_memcg, memcg)) {
> > > 			/*
> > > 			 * Hard protection.
> > > 			 * If there is no reclaimable memory, OOM.
> > > 			 */
> > > 			continue;
> > > 		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
> > > 			/*
> > > 			 * Soft protection.
> > > 			 * Respect the protection only as long as
> > > 			 * there is an unprotected supply
> > > 			 * of reclaimable memory from other cgroups.
> > > 			 */
> > > 			if (!sc->memcg_low_reclaim) {
> > > 				sc->memcg_low_skipped = 1;
> > > 				continue;
> > > 			}
> > > 			memcg_memory_event(memcg, MEMCG_LOW);
> > > 		}
> > > ===
> > > 
> > > With separate shrink begin()/end() tracepoints we can detect such a
> > > problem.
> > 
> > How? You are only reporting the number of reclaimed pages, and zero
> > reclaimed pages can result not just from low/min limits but from other
> > reasons as well. You would also need to report the number of
> > scanned/isolated pages.
> >  
> 
> From my perspective, if memory control group (memcg) protection
> restrictions kick in, we can identify them by a begin() tracepoint that
> has no matching end(). Other causes will have both tracepoints raised.

That is not really a great way to detect that, TBH. Trace events could
be lost, and then you simply do not know what has happened.
  
Dmitry Rokosov Nov. 29, 2023, 5:35 p.m. UTC | #10
On Wed, Nov 29, 2023 at 06:10:33PM +0100, Michal Hocko wrote:
> On Wed 29-11-23 19:57:52, Dmitry Rokosov wrote:
> > On Wed, Nov 29, 2023 at 05:06:37PM +0100, Michal Hocko wrote:
> > > On Wed 29-11-23 18:20:57, Dmitry Rokosov wrote:
> > > > On Tue, Nov 28, 2023 at 10:32:50AM +0100, Michal Hocko wrote:
> > > > > On Mon 27-11-23 19:16:37, Dmitry Rokosov wrote:
> > > [...]
> > > > > > 2) With this approach, we will not have the ability to trace a situation
> > > > > > where the kernel is requesting reclaim for a specific memcg, but due to
> > > > > > limits issues, we are unable to run it.
> > > > > 
> > > > > I do not follow. Could you be more specific please?
> > > > > 
> > > > 
> > > > I'm referring to a situation where kswapd() or other kernel mm code
> > > > requests reclaim of pages from a memcg, but the memcg rejects it due
> > > > to its limit checks. This occurs in the shrink_node_memcgs() function.
> > > 
> > > Ohh, you mean reclaim protection
> > > 
> > > > ===
> > > > 		mem_cgroup_calculate_protection(target_memcg, memcg);
> > > > 
> > > > 		if (mem_cgroup_below_min(target_memcg, memcg)) {
> > > > 			/*
> > > > 			 * Hard protection.
> > > > 			 * If there is no reclaimable memory, OOM.
> > > > 			 */
> > > > 			continue;
> > > > 		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
> > > > 			/*
> > > > 			 * Soft protection.
> > > > 			 * Respect the protection only as long as
> > > > 			 * there is an unprotected supply
> > > > 			 * of reclaimable memory from other cgroups.
> > > > 			 */
> > > > 			if (!sc->memcg_low_reclaim) {
> > > > 				sc->memcg_low_skipped = 1;
> > > > 				continue;
> > > > 			}
> > > > 			memcg_memory_event(memcg, MEMCG_LOW);
> > > > 		}
> > > > ===
> > > > 
> > > > With separate shrink begin()/end() tracepoints we can detect such a
> > > > problem.
> > > 
> > > How? You are only reporting the number of reclaimed pages, and zero
> > > reclaimed pages can result not just from low/min limits but from other
> > > reasons as well. You would also need to report the number of
> > > scanned/isolated pages.
> > >  
> > 
> > From my perspective, if memory control group (memcg) protection
> > restrictions kick in, we can identify them by a begin() tracepoint that
> > has no matching end(). Other causes will have both tracepoints raised.
> 
> That is not really a great way to detect that, TBH. Trace events could
> be lost, and then you simply do not know what has happened.

I see, thank you very much for the detailed review! I will prepare a
new patchset with memcg names in the lruvec and slab paths and will be
back soon.
  

Patch

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index e9093fa1c924..a4686afe571d 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -180,6 +180,17 @@  DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_softlimit_r
 	TP_ARGS(order, gfp_flags, memcg)
 );
 
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_begin_template, mm_vmscan_memcg_shrink_begin,
+
+	TP_PROTO(int order, gfp_t gfp_flags, const struct mem_cgroup *memcg),
+
+	TP_ARGS(order, gfp_flags, memcg)
+);
+
+#else
+
+#define trace_mm_vmscan_memcg_shrink_begin(...)
+
 #endif /* CONFIG_MEMCG */
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -243,6 +254,17 @@  DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_softlimit_rec
 	TP_ARGS(nr_reclaimed, memcg)
 );
 
+DEFINE_EVENT(mm_vmscan_memcg_reclaim_end_template, mm_vmscan_memcg_shrink_end,
+
+	TP_PROTO(unsigned long nr_reclaimed, const struct mem_cgroup *memcg),
+
+	TP_ARGS(nr_reclaimed, memcg)
+);
+
+#else
+
+#define trace_mm_vmscan_memcg_shrink_end(...)
+
 #endif /* CONFIG_MEMCG */
 
 TRACE_EVENT(mm_shrink_slab_start,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 45780952f4b5..f7e3ddc5a7ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6461,6 +6461,10 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		cond_resched();
 
+		trace_mm_vmscan_memcg_shrink_begin(sc->order,
+						   sc->gfp_mask,
+						   memcg);
+
 		mem_cgroup_calculate_protection(target_memcg, memcg);
 
 		if (mem_cgroup_below_min(target_memcg, memcg)) {
@@ -6491,6 +6495,9 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
 			    sc->priority);
 
+		trace_mm_vmscan_memcg_shrink_end(sc->nr_reclaimed - reclaimed,
+						 memcg);
+
 		/* Record the group's reclaim efficiency */
 		if (!sc->proactive)
 			vmpressure(sc->gfp_mask, memcg, false,