[v1,6/9] memcg: sleep during flushing stats in safe contexts

Message ID 20230328061638.203420-7-yosryahmed@google.com
State New
Headers
Series memcg: make rstat flushing irq and sleep friendly |

Commit Message

Yosry Ahmed March 28, 2023, 6:16 a.m. UTC
  Currently, all contexts that flush memcg stats do so with sleeping not
allowed. Some of these contexts are perfectly safe to sleep in, such as
reading cgroup files from userspace or the background periodic flusher.

Refactor the code to make mem_cgroup_flush_stats() non-atomic (aka
sleepable), and provide a separate atomic version. The atomic version is
used in reclaim, refault, writeback, and in mem_cgroup_usage(). All
other code paths are left to use the non-atomic version. This includes
callbacks for userspace reads and the periodic flusher.

Since refault is the only caller of mem_cgroup_flush_stats_ratelimited(),
this function is changed to call the atomic version of
mem_cgroup_flush_stats(). Reclaim and refault code paths are modified
to do non-atomic flushing in separate later patches -- so
mem_cgroup_flush_stats_ratelimited() will eventually become non-atomic.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 include/linux/memcontrol.h |  5 ++++
 mm/memcontrol.c            | 58 ++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  2 +-
 3 files changed, 55 insertions(+), 10 deletions(-)
  

Comments

Shakeel Butt March 28, 2023, 3:09 p.m. UTC | #1
On Mon, Mar 27, 2023 at 11:16 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> Currently, all contexts that flush memcg stats do so with sleeping not
> allowed. Some of these contexts are perfectly safe to sleep in, such as
> reading cgroup files from userspace or the background periodic flusher.
>
> Refactor the code to make mem_cgroup_flush_stats() non-atomic (aka
> sleepable), and provide a separate atomic version. The atomic version is
> used in reclaim, refault, writeback, and in mem_cgroup_usage(). All
> other code paths are left to use the non-atomic version. This includes
> callbacks for userspace reads and the periodic flusher.
>
> Since refault is the only caller of mem_cgroup_flush_stats_ratelimited(),
> this function is changed to call the atomic version of
> mem_cgroup_flush_stats(). Reclaim and refault code paths are modified
> to do non-atomic flushing in separate later patches -- so
> mem_cgroup_flush_stats_ratelimited() will eventually become non-atomic.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>

Acked-by: Shakeel Butt <shakeelb@google.com>
  
Johannes Weiner March 28, 2023, 6:35 p.m. UTC | #2
On Tue, Mar 28, 2023 at 06:16:35AM +0000, Yosry Ahmed wrote:
> @@ -642,24 +642,57 @@ static void __mem_cgroup_flush_stats(void)
>  	 * from memcg flushers (e.g. reclaim, refault, etc).
>  	 */
>  	if (atomic_xchg(&stats_flush_ongoing, 1))
> -		return;
> +		return false;
>  
>  	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
> -	cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
> +	return true;
> +}
> +
> +static void mem_cgroup_post_stats_flush(void)
> +{
>  	atomic_set(&stats_flush_threshold, 0);
>  	atomic_set(&stats_flush_ongoing, 0);
>  }
>  
> -void mem_cgroup_flush_stats(void)
> +static bool mem_cgroup_should_flush_stats(void)
>  {
> -	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
> -		__mem_cgroup_flush_stats();
> +	return atomic_read(&stats_flush_threshold) > num_online_cpus();
> +}
> +
> +/* atomic functions, safe to call from any context */
> +static void __mem_cgroup_flush_stats_atomic(void)
> +{
> +	if (mem_cgroup_pre_stats_flush()) {
> +		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
> +		mem_cgroup_post_stats_flush();
> +	}
> +}

I'm afraid I wasn't very nuanced with my complaint about the bool
parameter in the previous version. In this case, when you can do a
common helper for a couple of API functions defined right below it,
and the callers don't spread throughout the codebase, using bools
makes things simpler while still being easily understandable:

static void do_flush_stats(bool may_sleep)
{
	if (atomic_xchg(&stats_flush_ongoing, 1))
		return;

	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
	atomic_set(&stats_flush_threshold, 0);

	if (!may_sleep)
		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
	else
		cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

	atomic_set(&stats_flush_ongoing, 0);
}

void mem_cgroup_flush_stats(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		do_flush_stats(true);
}

void mem_cgroup_flush_stats_atomic(void)
{
	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
		do_flush_stats(false);
}

>  void mem_cgroup_flush_stats_ratelimited(void)
>  {
>  	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> -		mem_cgroup_flush_stats();
> +		mem_cgroup_flush_stats_atomic();
> +}

This should probably be mem_cgroup_flush_stats_atomic_ratelimited().

(Whee, kinda long, but that's alright. Very specialized caller...)

Btw, can you guys think of a reason against moving the threshold check
into the common function? It would then apply to the time-limited
flushes as well, but that shouldn't hurt anything. This would make the
code even simpler:

static void do_flush_stats(bool may_sleep)
{
	if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
		return;

	if (atomic_xchg(&stats_flush_ongoing, 1))
		return;

	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
	atomic_set(&stats_flush_threshold, 0);

	if (!may_sleep)
		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
	else
		cgroup_rstat_flush(root_mem_cgroup->css.cgroup);

	atomic_set(&stats_flush_ongoing, 0);
}

void mem_cgroup_flush_stats(void)
{
	do_flush_stats(true);
}

void mem_cgroup_flush_stats_atomic(void)
{
	do_flush_stats(false);
}

void mem_cgroup_flush_stats_atomic_ratelimited(void)
{
	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
		do_flush_stats(false);
}

> @@ -2845,7 +2845,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
>  	 * Flush the memory cgroup stats, so that we read accurate per-memcg
>  	 * lruvec stats for heuristics.
>  	 */
> -	mem_cgroup_flush_stats();
> +	mem_cgroup_flush_stats_atomic();

I'm thinking this one could be non-atomic as well. It's called fairly
high up in reclaim without any locks held.
  
Yosry Ahmed March 28, 2023, 6:45 p.m. UTC | #3
On Tue, Mar 28, 2023 at 11:35 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Mar 28, 2023 at 06:16:35AM +0000, Yosry Ahmed wrote:
> > @@ -642,24 +642,57 @@ static void __mem_cgroup_flush_stats(void)
> >        * from memcg flushers (e.g. reclaim, refault, etc).
> >        */
> >       if (atomic_xchg(&stats_flush_ongoing, 1))
> > -             return;
> > +             return false;
> >
> >       WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
> > -     cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
> > +     return true;
> > +}
> > +
> > +static void mem_cgroup_post_stats_flush(void)
> > +{
> >       atomic_set(&stats_flush_threshold, 0);
> >       atomic_set(&stats_flush_ongoing, 0);
> >  }
> >
> > -void mem_cgroup_flush_stats(void)
> > +static bool mem_cgroup_should_flush_stats(void)
> >  {
> > -     if (atomic_read(&stats_flush_threshold) > num_online_cpus())
> > -             __mem_cgroup_flush_stats();
> > +     return atomic_read(&stats_flush_threshold) > num_online_cpus();
> > +}
> > +
> > +/* atomic functions, safe to call from any context */
> > +static void __mem_cgroup_flush_stats_atomic(void)
> > +{
> > +     if (mem_cgroup_pre_stats_flush()) {
> > +             cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
> > +             mem_cgroup_post_stats_flush();
> > +     }
> > +}
>
> I'm afraid I wasn't very nuanced with my complaint about the bool
> parameter in the previous version. In this case, when you can do a
> common helper for a couple of API functions defined right below it,
> and the callers don't spread throughout the codebase, using bools
> makes things simpler while still being easily understandable:

Looking at your suggestion now, it seems fairly obvious that this is
what I should have gone for. Will do that for v2. Thanks!

>
> static void do_flush_stats(bool may_sleep)
> {
>         if (atomic_xchg(&stats_flush_ongoing, 1))
>                 return;
>
>         WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
>         atomic_set(&stats_flush_threshold, 0);
>
>         if (!may_sleep)
>                 cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
>         else
>                 cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
>
>         atomic_set(&stats_flush_ongoing, 0);
> }
>
> void mem_cgroup_flush_stats(void)
> {
>         if (atomic_read(&stats_flush_threshold) > num_online_cpus())
>                 do_flush_stats(true);
> }
>
> void mem_cgroup_flush_stats_atomic(void)
> {
>         if (atomic_read(&stats_flush_threshold) > num_online_cpus())
>                 do_flush_stats(false);
> }
>
> >  void mem_cgroup_flush_stats_ratelimited(void)
> >  {
> >       if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> > -             mem_cgroup_flush_stats();
> > +             mem_cgroup_flush_stats_atomic();
> > +}
>
> This should probably be mem_cgroup_flush_stats_atomic_ratelimited().
>
> (Whee, kinda long, but that's alright. Very specialized caller...)

It should, but the following patch makes it non-atomic anyway, so I
thought I wouldn't clutter the diff by renaming it here and then
reverting it back in the next patch.

There is an argument for maintaining a clean history tho in case the
next patch is reverted separately (which is the reason I put it in a
separate patch to begin with) -- so perhaps I should rename it here to
mem_cgroup_flush_stats_atomic_ratelimited () and back to
mem_cgroup_flush_stats_ratelimited() in the next patch, just for
consistency?

>
> Btw, can you guys think of a reason against moving the threshold check
> into the common function? It would then apply to the time-limited
> flushes as well, but that shouldn't hurt anything. This would make the
> code even simpler:

I think the point of having the threshold check outside the common
function is that the periodic flusher always flushes, regardless of
the threshold, to keep rstat flushing from critical contexts as cheap
as possible.

If you think it's not worth it, I can make that change. It is a
separate functional change tho, so maybe in a separate patch.

>
> static void do_flush_stats(bool may_sleep)
> {
>         if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
>                 return;
>
>         if (atomic_xchg(&stats_flush_ongoing, 1))
>                 return;
>
>         WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
>         atomic_set(&stats_flush_threshold, 0);
>
>         if (!may_sleep)
>                 cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
>         else
>                 cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
>
>         atomic_set(&stats_flush_ongoing, 0);
> }
>
> void mem_cgroup_flush_stats(void)
> {
>         do_flush_stats(true);
> }
>
> void mem_cgroup_flush_stats_atomic(void)
> {
>         do_flush_stats(false);
> }
>
> void mem_cgroup_flush_stats_atomic_ratelimited(void)
> {
>         if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
>                 do_flush_stats(false);
> }
>
> > @@ -2845,7 +2845,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
> >        * Flush the memory cgroup stats, so that we read accurate per-memcg
> >        * lruvec stats for heuristics.
> >        */
> > -     mem_cgroup_flush_stats();
> > +     mem_cgroup_flush_stats_atomic();
>
> I'm thinking this one could be non-atomic as well. It's called fairly
> high up in reclaim without any locks held.

A later patch does exactly that. I put making the reclaim and refault
paths non-atomic in separate patches to easily revert them if we see a
regression. Let me know if this is too defensive and if you'd rather
have them squashed.

Thanks!
  
Johannes Weiner March 28, 2023, 7:06 p.m. UTC | #4
On Tue, Mar 28, 2023 at 11:45:19AM -0700, Yosry Ahmed wrote:
> On Tue, Mar 28, 2023 at 11:35 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > On Tue, Mar 28, 2023 at 06:16:35AM +0000, Yosry Ahmed wrote:
> > >  void mem_cgroup_flush_stats_ratelimited(void)
> > >  {
> > >       if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> > > -             mem_cgroup_flush_stats();
> > > +             mem_cgroup_flush_stats_atomic();
> > > +}
> >
> > This should probably be mem_cgroup_flush_stats_atomic_ratelimited().
> >
> > (Whee, kinda long, but that's alright. Very specialized caller...)
> 
> It should, but the following patch makes it non-atomic anyway, so I
> thought I wouldn't clutter the diff by renaming it here and then
> reverting it back in the next patch.
> 
> There is an argument for maintaining a clean history tho in case the
> next patch is reverted separately (which is the reason I put it in a
> separate patch to begin with) -- so perhaps I should rename it here to
> mem_cgroup_flush_stats_atomic_ratelimited () and back to
> mem_cgroup_flush_stats_ratelimited() in the next patch, just for
> consistency?

Sounds good to me. It's pretty minor churn.

> > Btw, can you guys think of a reason against moving the threshold check
> > into the common function? It would then apply to the time-limited
> > flushes as well, but that shouldn't hurt anything. This would make the
> > code even simpler:
> 
> I think the point of having the threshold check outside the common
> function is that the periodic flusher always flushes, regardless of
> the threshold, to keep rstat flushing from critical contexts as cheap
> as possible.

Good point. Yeah, let's keep it separate then.

> > > @@ -2845,7 +2845,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
> > >        * Flush the memory cgroup stats, so that we read accurate per-memcg
> > >        * lruvec stats for heuristics.
> > >        */
> > > -     mem_cgroup_flush_stats();
> > > +     mem_cgroup_flush_stats_atomic();
> >
> > I'm thinking this one could be non-atomic as well. It's called fairly
> > high up in reclaim without any locks held.
> 
> A later patch does exactly that. I put making the reclaim and refault
> paths non-atomic in separate patches to easily revert them if we see a
> regression. Let me know if this is too defensive and if you'd rather
> have them squashed.

No, good call. I should have just looked ahead first :-)
  
Yosry Ahmed March 28, 2023, 7:26 p.m. UTC | #5
On Tue, Mar 28, 2023 at 12:06 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Mar 28, 2023 at 11:45:19AM -0700, Yosry Ahmed wrote:
> > On Tue, Mar 28, 2023 at 11:35 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > On Tue, Mar 28, 2023 at 06:16:35AM +0000, Yosry Ahmed wrote:
> > > >  void mem_cgroup_flush_stats_ratelimited(void)
> > > >  {
> > > >       if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
> > > > -             mem_cgroup_flush_stats();
> > > > +             mem_cgroup_flush_stats_atomic();
> > > > +}
> > >
> > > This should probably be mem_cgroup_flush_stats_atomic_ratelimited().
> > >
> > > (Whee, kinda long, but that's alright. Very specialized caller...)
> >
> > It should, but the following patch makes it non-atomic anyway, so I
> > thought I wouldn't clutter the diff by renaming it here and then
> > reverting it back in the next patch.
> >
> > There is an argument for maintaining a clean history tho in case the
> > next patch is reverted separately (which is the reason I put it in a
> > separate patch to begin with) -- so perhaps I should rename it here to
> > mem_cgroup_flush_stats_atomic_ratelimited () and back to
> > mem_cgroup_flush_stats_ratelimited() in the next patch, just for
> > consistency?
>
> Sounds good to me. It's pretty minor churn.

Ack. Will do so for v2. Thanks!

>
> > > Btw, can you guys think of a reason against moving the threshold check
> > > into the common function? It would then apply to the time-limited
> > > flushes as well, but that shouldn't hurt anything. This would make the
> > > code even simpler:
> >
> > I think the point of having the threshold check outside the common
> > function is that the periodic flusher always flushes, regardless of
> > the threshold, to keep rstat flushing from critical contexts as cheap
> > as possible.
>
> Good point. Yeah, let's keep it separate then.

Agreed.

>
> > > > @@ -2845,7 +2845,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
> > > >        * Flush the memory cgroup stats, so that we read accurate per-memcg
> > > >        * lruvec stats for heuristics.
> > > >        */
> > > > -     mem_cgroup_flush_stats();
> > > > +     mem_cgroup_flush_stats_atomic();
> > >
> > > I'm thinking this one could be non-atomic as well. It's called fairly
> > > high up in reclaim without any locks held.
> >
> > A later patch does exactly that. I put making the reclaim and refault
> > paths non-atomic in separate patches to easily revert them if we see a
> > regression. Let me know if this is too defensive and if you'd rather
> > have them squashed.
>
> No, good call. I should have just looked ahead first :-)
  

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ac3f3b3a45e2..a4bc3910a2eb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1037,6 +1037,7 @@  static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_atomic(void);
 void mem_cgroup_flush_stats_ratelimited(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -1535,6 +1536,10 @@  static inline void mem_cgroup_flush_stats(void)
 {
 }
 
+static inline void mem_cgroup_flush_stats_atomic(void)
+{
+}
+
 static inline void mem_cgroup_flush_stats_ratelimited(void)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ff33e02c96..57e8cbf701f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -634,7 +634,7 @@  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	}
 }
 
-static void __mem_cgroup_flush_stats(void)
+static bool mem_cgroup_pre_stats_flush(void)
 {
 	/*
 	 * We always flush the entire tree, so concurrent flushers can just
@@ -642,24 +642,57 @@  static void __mem_cgroup_flush_stats(void)
 	 * from memcg flushers (e.g. reclaim, refault, etc).
 	 */
 	if (atomic_xchg(&stats_flush_ongoing, 1))
-		return;
+		return false;
 
 	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
-	cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+	return true;
+}
+
+static void mem_cgroup_post_stats_flush(void)
+{
 	atomic_set(&stats_flush_threshold, 0);
 	atomic_set(&stats_flush_ongoing, 0);
 }
 
-void mem_cgroup_flush_stats(void)
+static bool mem_cgroup_should_flush_stats(void)
 {
-	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
-		__mem_cgroup_flush_stats();
+	return atomic_read(&stats_flush_threshold) > num_online_cpus();
+}
+
+/* atomic functions, safe to call from any context */
+static void __mem_cgroup_flush_stats_atomic(void)
+{
+	if (mem_cgroup_pre_stats_flush()) {
+		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+		mem_cgroup_post_stats_flush();
+	}
+}
+
+void mem_cgroup_flush_stats_atomic(void)
+{
+	if (mem_cgroup_should_flush_stats())
+		__mem_cgroup_flush_stats_atomic();
 }
 
 void mem_cgroup_flush_stats_ratelimited(void)
 {
 	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
-		mem_cgroup_flush_stats();
+		mem_cgroup_flush_stats_atomic();
+}
+
+/* non-atomic functions, only safe from sleepable contexts */
+static void __mem_cgroup_flush_stats(void)
+{
+	if (mem_cgroup_pre_stats_flush()) {
+		cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+		mem_cgroup_post_stats_flush();
+	}
+}
+
+void mem_cgroup_flush_stats(void)
+{
+	if (mem_cgroup_should_flush_stats())
+		__mem_cgroup_flush_stats();
 }
 
 static void flush_memcg_stats_dwork(struct work_struct *w)
@@ -3684,9 +3717,12 @@  static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		 * done from irq context; use stale stats in this case.
 		 * Arguably, usage threshold events are not reliable on the root
 		 * memcg anyway since its usage is ill-defined.
+		 *
+		 * Additionally, other call paths through memcg_check_events()
+		 * disable irqs, so make sure we are flushing stats atomically.
 		 */
 		if (in_task())
-			mem_cgroup_flush_stats();
+			mem_cgroup_flush_stats_atomic();
 		val = memcg_page_state(memcg, NR_FILE_PAGES) +
 			memcg_page_state(memcg, NR_ANON_MAPPED);
 		if (swap)
@@ -4609,7 +4645,11 @@  void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	mem_cgroup_flush_stats();
+	/*
+	 * wb_writeback() takes a spinlock and calls
+	 * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
+	 */
+	mem_cgroup_flush_stats_atomic();
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..a9511ccb936f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2845,7 +2845,7 @@  static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 	 * lruvec stats for heuristics.
 	 */
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_atomic();
 
 	/*
 	 * Determine the scan balance between anon and file LRUs.