[v9,3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full

Message ID 20221206162416.474800121@redhat.com
State New
Headers
Series Ensure quiet_vmstat() is called when the idle tick was stopped too |

Commit Message

Marcelo Tosatti Dec. 6, 2022, 4:18 p.m. UTC
  For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.

When returning to userspace, fold the stats and cancel the delayed work.

When entering idle, only fold the stats.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/vmstat.h   |    4 ++--
 kernel/time/tick-sched.c |    2 +-
 mm/vmstat.c              |   41 ++++++++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 12 deletions(-)
  

Comments

Frederic Weisbecker Dec. 14, 2022, 1:18 p.m. UTC | #1
On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> For nohz full CPUs, manage per-CPU stat syncing from CPU context:
> start delayed work when marking per-CPU vmstat dirty.
> 
> When returning to userspace, fold the stats and cancel the delayed work.
> 
> When entering idle, only fold the stats.

The changelog still misses the reason behind the changes.

> @@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
>  
>  #ifdef CONFIG_SMP
>  static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> +int sysctl_stat_interval __read_mostly = HZ;
>  
>  static inline void vmstat_mark_dirty(void)
>  {
> +	int cpu = smp_processor_id();
> +
> +	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> +		struct delayed_work *dw;
> +
> +		dw = &per_cpu(vmstat_work, cpu);

this_cpu_ptr()

> +		if (!delayed_work_pending(dw)) {
> +			unsigned long delay;
> +
> +			delay = round_jiffies_relative(sysctl_stat_interval);
> +			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> +		}
> +	}
>  	this_cpu_write(vmstat_dirty, true);
>  }
>  
> @@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
>   * until the diffs stay at zero. The function is used by NOHZ and can only be
>   * invoked when tick processing is not active.
>   */
> -void quiet_vmstat(void)
> +void quiet_vmstat(bool user)
>  {
> +	struct delayed_work *dw;
> +
>  	if (system_state != SYSTEM_RUNNING)
>  		return;
>  
>  	if (!is_vmstat_dirty())
>  		return;
>  
> +	refresh_cpu_vm_stats(false);
> +
> +	if (!user)
> +		return;
>  	/*
> -	 * Just refresh counters and do not care about the pending delayed
> -	 * vmstat_update. It doesn't fire that often to matter and canceling
> -	 * it would be too expensive from this path.
> -	 * vmstat_shepherd will take care about that for us.
> +	 * If the tick is stopped, cancel any delayed work to avoid
> +	 * interruptions to this CPU in the future.
>  	 */
> -	refresh_cpu_vm_stats(false);
> +	dw = &per_cpu(vmstat_work, smp_processor_id());

this_cpu_ptr()

Thanks.

> +	if (delayed_work_pending(dw))
> +		cancel_delayed_work(dw);
>  }
>  
>  /*
  
Frederic Weisbecker Dec. 14, 2022, 1:33 p.m. UTC | #2
On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
>  static inline void vmstat_mark_dirty(void)
>  {
> +	int cpu = smp_processor_id();
> +
> +	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> +		struct delayed_work *dw;
> +
> +		dw = &per_cpu(vmstat_work, cpu);
> +		if (!delayed_work_pending(dw)) {
> +			unsigned long delay;
> +
> +			delay = round_jiffies_relative(sysctl_stat_interval);
> +			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);

Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
vmstat_shepherd makes sure to not rearm it afterward. But now it looks
possible for the above to do that mistake?

> +		}
> +	}
>  	this_cpu_write(vmstat_dirty, true);
>  }
> @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
>  	for_each_online_cpu(cpu) {
>  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
>  
> +		/* NOHZ full CPUs manage their own vmstat flushing */
> +		if (tick_nohz_full_cpu(smp_processor_id()))

It should be the remote CPU instead of the current one.

Thanks.

> +			continue;
> +
>  		if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
>  			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
>
  
Marcelo Tosatti Dec. 16, 2022, 4:16 p.m. UTC | #3
On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> >  static inline void vmstat_mark_dirty(void)
> >  {
> > +	int cpu = smp_processor_id();
> > +
> > +	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > +		struct delayed_work *dw;
> > +
> > +		dw = &per_cpu(vmstat_work, cpu);
> > +		if (!delayed_work_pending(dw)) {
> > +			unsigned long delay;
> > +
> > +			delay = round_jiffies_relative(sysctl_stat_interval);
> > +			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> 
> Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> possible for the above to do that mistake?

Don't think the mistake is an issue. In case of a
queue_delayed_work_on being called after cancel_delayed_work_sync,
either vmstat_update executes on the local CPU, or on a
different CPU (after the bound kworkers have been moved).

Each case is fine (see vmstat_update).

> > +		}
> > +	}
> >  	this_cpu_write(vmstat_dirty, true);
> >  }
> > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> >  	for_each_online_cpu(cpu) {
> >  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> >  
> > +		/* NOHZ full CPUs manage their own vmstat flushing */
> > +		if (tick_nohz_full_cpu(smp_processor_id()))
> 
> It should be the remote CPU instead of the current one.

Fixed.
  
Frederic Weisbecker Dec. 16, 2022, 10:47 p.m. UTC | #4
On Fri, Dec 16, 2022 at 01:16:09PM -0300, Marcelo Tosatti wrote:
> On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> > On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> > >  static inline void vmstat_mark_dirty(void)
> > >  {
> > > +	int cpu = smp_processor_id();
> > > +
> > > +	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > > +		struct delayed_work *dw;
> > > +
> > > +		dw = &per_cpu(vmstat_work, cpu);
> > > +		if (!delayed_work_pending(dw)) {
> > > +			unsigned long delay;
> > > +
> > > +			delay = round_jiffies_relative(sysctl_stat_interval);
> > > +			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> > 
> > Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> > vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> > possible for the above to do that mistake?
> 
> Don't think the mistake is an issue. In case of a
> queue_delayed_work_on being called after cancel_delayed_work_sync,
> either vmstat_update executes on the local CPU, or on a
> different CPU (after the bound kworkers have been moved).

But after the CPU goes offline, its workqueue pool becomes UNBOUND. Which means
that the vmstat_update() from the offline CPU can then execute partly on CPU 0, then
gets preempted and executes halfway on CPU 1, then gets preempted and...

Having a quick look at refresh_cpu_vm_stats(), I doesn't look ready for that...

Thanks.

> 
> Each case is fine (see vmstat_update).
> 
> > > +		}
> > > +	}
> > >  	this_cpu_write(vmstat_dirty, true);
> > >  }
> > > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> > >  	for_each_online_cpu(cpu) {
> > >  		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> > >  
> > > +		/* NOHZ full CPUs manage their own vmstat flushing */
> > > +		if (tick_nohz_full_cpu(smp_processor_id()))
> > 
> > It should be the remote CPU instead of the current one.
> 
> Fixed.
>
  

Patch

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -28,6 +28,7 @@ 
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/tick.h>
 
 #include "internal.h"
 
@@ -195,9 +196,24 @@  void fold_vm_numa_events(void)
 
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
 
 static inline void vmstat_mark_dirty(void)
 {
+	int cpu = smp_processor_id();
+
+	if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+		struct delayed_work *dw;
+
+		dw = &per_cpu(vmstat_work, cpu);
+		if (!delayed_work_pending(dw)) {
+			unsigned long delay;
+
+			delay = round_jiffies_relative(sysctl_stat_interval);
+			queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+		}
+	}
 	this_cpu_write(vmstat_dirty, true);
 }
 
@@ -1886,9 +1902,6 @@  static const struct seq_operations vmsta
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
 {
@@ -1973,21 +1986,27 @@  static void vmstat_update(struct work_st
  * until the diffs stay at zero. The function is used by NOHZ and can only be
  * invoked when tick processing is not active.
  */
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
 {
+	struct delayed_work *dw;
+
 	if (system_state != SYSTEM_RUNNING)
 		return;
 
 	if (!is_vmstat_dirty())
 		return;
 
+	refresh_cpu_vm_stats(false);
+
+	if (!user)
+		return;
 	/*
-	 * Just refresh counters and do not care about the pending delayed
-	 * vmstat_update. It doesn't fire that often to matter and canceling
-	 * it would be too expensive from this path.
-	 * vmstat_shepherd will take care about that for us.
+	 * If the tick is stopped, cancel any delayed work to avoid
+	 * interruptions to this CPU in the future.
 	 */
-	refresh_cpu_vm_stats(false);
+	dw = &per_cpu(vmstat_work, smp_processor_id());
+	if (delayed_work_pending(dw))
+		cancel_delayed_work(dw);
 }
 
 /*
@@ -2009,6 +2028,10 @@  static void vmstat_shepherd(struct work_
 	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
+		/* NOHZ full CPUs manage their own vmstat flushing */
+		if (tick_nohz_full_cpu(smp_processor_id()))
+			continue;
+
 		if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
 
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -290,7 +290,7 @@  extern void dec_zone_state(struct zone *
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
@@ -403,7 +403,7 @@  static inline void __dec_node_page_state
 
 static inline void refresh_zone_stat_thresholds(void) { }
 static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }
 
 static inline void drain_zonestat(struct zone *zone,
 			struct per_cpu_zonestat *pzstats) { }
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -911,7 +911,7 @@  static void tick_nohz_stop_tick(struct t
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
-		quiet_vmstat();
+		quiet_vmstat(false);
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;