[v9,3/5] mm/vmstat: manage per-CPU stats from CPU context when NOHZ full
Commit Message
For nohz full CPUs, manage per-CPU stat syncing from CPU context:
start delayed work when marking per-CPU vmstat dirty.
When returning to userspace, fold the stats and cancel the delayed work.
When entering idle, only fold the stats.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/vmstat.h   |  4 ++--
 kernel/time/tick-sched.c |  2 +-
 mm/vmstat.c              | 41 ++++++++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 12 deletions(-)
Comments
On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> For nohz full CPUs, manage per-CPU stat syncing from CPU context:
> start delayed work when marking per-CPU vmstat dirty.
>
> When returning to userspace, fold the stats and cancel the delayed work.
>
> When entering idle, only fold the stats.
The changelog is still missing the reason behind the changes.
> @@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
>
> #ifdef CONFIG_SMP
> static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
> +static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
> +int sysctl_stat_interval __read_mostly = HZ;
>
> static inline void vmstat_mark_dirty(void)
> {
> + int cpu = smp_processor_id();
> +
> + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> + struct delayed_work *dw;
> +
> + dw = &per_cpu(vmstat_work, cpu);
this_cpu_ptr()
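i.e. something like:

	dw = this_cpu_ptr(&vmstat_work);

(likewise for the &per_cpu(vmstat_work, smp_processor_id()) lookup in
quiet_vmstat() below).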
> + if (!delayed_work_pending(dw)) {
> + unsigned long delay;
> +
> + delay = round_jiffies_relative(sysctl_stat_interval);
> + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> + }
> + }
> this_cpu_write(vmstat_dirty, true);
> }
>
> @@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
> * until the diffs stay at zero. The function is used by NOHZ and can only be
> * invoked when tick processing is not active.
> */
> -void quiet_vmstat(void)
> +void quiet_vmstat(bool user)
> {
> + struct delayed_work *dw;
> +
> if (system_state != SYSTEM_RUNNING)
> return;
>
> if (!is_vmstat_dirty())
> return;
>
> + refresh_cpu_vm_stats(false);
> +
> + if (!user)
> + return;
> /*
> - * Just refresh counters and do not care about the pending delayed
> - * vmstat_update. It doesn't fire that often to matter and canceling
> - * it would be too expensive from this path.
> - * vmstat_shepherd will take care about that for us.
> + * If the tick is stopped, cancel any delayed work to avoid
> + * interruptions to this CPU in the future.
> */
> - refresh_cpu_vm_stats(false);
> + dw = &per_cpu(vmstat_work, smp_processor_id());
this_cpu_ptr()
Thanks.
> + if (delayed_work_pending(dw))
> + cancel_delayed_work(dw);
> }
>
> /*
On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> static inline void vmstat_mark_dirty(void)
> {
> + int cpu = smp_processor_id();
> +
> + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> + struct delayed_work *dw;
> +
> + dw = &per_cpu(vmstat_work, cpu);
> + if (!delayed_work_pending(dw)) {
> + unsigned long delay;
> +
> + delay = round_jiffies_relative(sysctl_stat_interval);
> + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
vmstat_shepherd makes sure to not rearm it afterward. But now it looks
possible for the above to make that mistake?
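For reference, that flush is done by the hotplug down-prep callback in
mm/vmstat.c, roughly:

	static int vmstat_cpu_down_prep(unsigned int cpu)
	{
		/* Flush any pending per-CPU vmstat work before the CPU goes down. */
		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
		return 0;
	}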
> + }
> + }
> this_cpu_write(vmstat_dirty, true);
> }
> @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> for_each_online_cpu(cpu) {
> struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
>
> + /* NOHZ full CPUs manage their own vmstat flushing */
> + if (tick_nohz_full_cpu(smp_processor_id()))
It should be the remote CPU instead of the current one.
Thanks.
> + continue;
> +
> if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
> queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
>
On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> > static inline void vmstat_mark_dirty(void)
> > {
> > + int cpu = smp_processor_id();
> > +
> > + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > + struct delayed_work *dw;
> > +
> > + dw = &per_cpu(vmstat_work, cpu);
> > + if (!delayed_work_pending(dw)) {
> > + unsigned long delay;
> > +
> > + delay = round_jiffies_relative(sysctl_stat_interval);
> > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
>
> Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> possible for the above to make that mistake?
I don't think the mistake is an issue. In case of a
queue_delayed_work_on being called after cancel_delayed_work_sync,
either vmstat_update executes on the local CPU, or on a
different CPU (after the bound kworkers have been moved).
Each case is fine (see vmstat_update).
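For reference, mainline's vmstat_update() (this series may adjust it) is
roughly:

	static void vmstat_update(struct work_struct *w)
	{
		if (refresh_cpu_vm_stats(true)) {
			/* Counters were updated, keep the periodic work running. */
			queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
					this_cpu_ptr(&vmstat_work),
					round_jiffies_relative(sysctl_stat_interval));
		}
	}

i.e. it only folds the local diffs and possibly requeues itself.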
> > + }
> > + }
> > this_cpu_write(vmstat_dirty, true);
> > }
> > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> > for_each_online_cpu(cpu) {
> > struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> >
> > + /* NOHZ full CPUs manage their own vmstat flushing */
> > + if (tick_nohz_full_cpu(smp_processor_id()))
>
> It should be the remote CPU instead of the current one.
Fixed.
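For v10 the check presumably becomes, using the loop iterator:

	/* NOHZ full CPUs manage their own vmstat flushing */
	if (tick_nohz_full_cpu(cpu))
		continue;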
On Fri, Dec 16, 2022 at 01:16:09PM -0300, Marcelo Tosatti wrote:
> On Wed, Dec 14, 2022 at 02:33:02PM +0100, Frederic Weisbecker wrote:
> > On Tue, Dec 06, 2022 at 01:18:29PM -0300, Marcelo Tosatti wrote:
> > > static inline void vmstat_mark_dirty(void)
> > > {
> > > + int cpu = smp_processor_id();
> > > +
> > > + if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
> > > + struct delayed_work *dw;
> > > +
> > > + dw = &per_cpu(vmstat_work, cpu);
> > > + if (!delayed_work_pending(dw)) {
> > > + unsigned long delay;
> > > +
> > > + delay = round_jiffies_relative(sysctl_stat_interval);
> > > + queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
> >
> > Currently the vmstat_work is flushed on cpu_hotplug (CPUHP_AP_ONLINE_DYN).
> > vmstat_shepherd makes sure to not rearm it afterward. But now it looks
> > possible for the above to make that mistake?
>
> I don't think the mistake is an issue. In case of a
> queue_delayed_work_on being called after cancel_delayed_work_sync,
> either vmstat_update executes on the local CPU, or on a
> different CPU (after the bound kworkers have been moved).
But after the CPU goes offline, its workqueue pool becomes UNBOUND, which means
that the vmstat_update() from the offline CPU can then execute partly on CPU 0,
then get preempted and execute halfway on CPU 1, then get preempted and...
From a quick look at refresh_cpu_vm_stats(), it doesn't look ready for that...
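For instance, the zone diff folding in there relies on this_cpu ops, roughly:

	v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
	if (v)
		atomic_long_add(v, &zone->vm_stat[i]);

so it folds the diffs of whichever CPU the worker happens to be running on
at that instant, not necessarily the CPU whose stats are dirty.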
Thanks.
>
> Each case is fine (see vmstat_update).
>
> > > + }
> > > + }
> > > this_cpu_write(vmstat_dirty, true);
> > > }
> > > @@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
> > > for_each_online_cpu(cpu) {
> > > struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
> > >
> > > + /* NOHZ full CPUs manage their own vmstat flushing */
> > > + if (tick_nohz_full_cpu(smp_processor_id()))
> >
> > It should be the remote CPU instead of the current one.
>
> Fixed.
>
Index: mm/vmstat.c
===================================================================
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/tick.h>
#include "internal.h"
@@ -195,9 +196,24 @@ void fold_vm_numa_events(void)
#ifdef CONFIG_SMP
static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
static inline void vmstat_mark_dirty(void)
{
+ int cpu = smp_processor_id();
+
+ if (tick_nohz_full_cpu(cpu) && !this_cpu_read(vmstat_dirty)) {
+ struct delayed_work *dw;
+
+ dw = &per_cpu(vmstat_work, cpu);
+ if (!delayed_work_pending(dw)) {
+ unsigned long delay;
+
+ delay = round_jiffies_relative(sysctl_stat_interval);
+ queue_delayed_work_on(cpu, mm_percpu_wq, dw, delay);
+ }
+ }
this_cpu_write(vmstat_dirty, true);
}
@@ -1886,9 +1902,6 @@ static const struct seq_operations vmsta
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
-
#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
@@ -1973,21 +1986,27 @@ static void vmstat_update(struct work_st
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
+void quiet_vmstat(bool user)
{
+ struct delayed_work *dw;
+
if (system_state != SYSTEM_RUNNING)
return;
if (!is_vmstat_dirty())
return;
+ refresh_cpu_vm_stats(false);
+
+ if (!user)
+ return;
/*
- * Just refresh counters and do not care about the pending delayed
- * vmstat_update. It doesn't fire that often to matter and canceling
- * it would be too expensive from this path.
- * vmstat_shepherd will take care about that for us.
+ * If the tick is stopped, cancel any delayed work to avoid
+ * interruptions to this CPU in the future.
*/
- refresh_cpu_vm_stats(false);
+ dw = &per_cpu(vmstat_work, smp_processor_id());
+ if (delayed_work_pending(dw))
+ cancel_delayed_work(dw);
}
/*
@@ -2009,6 +2028,10 @@ static void vmstat_shepherd(struct work_
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ /* NOHZ full CPUs manage their own vmstat flushing */
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ continue;
+
if (!delayed_work_pending(dw) && per_cpu(vmstat_dirty, cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
Index: include/linux/vmstat.h
===================================================================
@@ -290,7 +290,7 @@ extern void dec_zone_state(struct zone *
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
-void quiet_vmstat(void);
+void quiet_vmstat(bool user);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);
@@ -403,7 +403,7 @@ static inline void __dec_node_page_state
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
-static inline void quiet_vmstat(void) { }
+static inline void quiet_vmstat(bool user) { }
static inline void drain_zonestat(struct zone *zone,
struct per_cpu_zonestat *pzstats) { }
Index: kernel/time/tick-sched.c
===================================================================
@@ -911,7 +911,7 @@ static void tick_nohz_stop_tick(struct t
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- quiet_vmstat();
+ quiet_vmstat(false);
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;