[RFC] sched: Rate limit migrations

Message ID 20230411214116.361016-1-mathieu.desnoyers@efficios.com
State New
Series [RFC] sched: Rate limit migrations

Commit Message

Mathieu Desnoyers April 11, 2023, 9:41 p.m. UTC
  This WIP patch rate-limits migrations to at most 32 per 10ms window
for each task.

The specific migration count and window size can be changed with the
following defines in kernel/sched/sched.h:

- SCHED_MIGRATION_WINDOW_NS
- SCHED_MIGRATION_LIMIT
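
For anyone who wants to poke at the window/limit behaviour outside the
kernel, below is a minimal stand-alone sketch of the same idea. The
struct, constants and CLOCK_MONOTONIC timestamps are illustrative
stand-ins for the per-task fields and sched_clock() used by the patch,
and the accounting is simplified (the patch updates last_migration_time
from activate_task() when the task actually migrates):

/* Stand-alone sketch of the per-task migration window (not kernel code). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_NS	(10ULL * 1000000)	/* mirrors SCHED_MIGRATION_WINDOW_NS */
#define WINDOW_LIMIT	32			/* mirrors SCHED_MIGRATION_LIMIT */

struct task_window {
	uint64_t last_time;	/* time of the last accounted migration */
	uint64_t time_slice;	/* time accumulated in the current window */
	uint32_t count;		/* migrations in the current window */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Allow a migration if the window has ended or the count is under the limit. */
static bool migration_allowed(const struct task_window *w, uint64_t t)
{
	uint64_t delta = t - w->last_time;

	return delta + w->time_slice > WINDOW_NS || w->count < WINDOW_LIMIT;
}

/* Account one migration, resetting the window when it has ended. */
static void migration_account(struct task_window *w, uint64_t t)
{
	uint64_t delta = t - w->last_time;

	if (delta + w->time_slice > WINDOW_NS) {
		w->time_slice = 0;
		w->count = 0;
	} else {
		w->time_slice += delta;
		w->count++;
	}
	w->last_time = t;
}

int main(void)
{
	struct task_window w = { .last_time = now_ns() };
	uint64_t start = now_ns();
	unsigned int allowed = 0;

	/* Hammer the limiter for ~100ms: roughly 32 "migrations" pass per 10ms. */
	while (now_ns() - start < 100 * 1000000ULL) {
		uint64_t t = now_ns();

		if (migration_allowed(&w, t)) {
			migration_account(&w, t);
			allowed++;
		}
	}
	printf("allowed %u migrations in 100ms\n", allowed);
	return 0;
}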

Testing is welcome, especially to see if it helps with Aaron's
migration-heavy workload wrt rseq concurrency id performance
regression.

Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Olivier Dion <odion@efficios.com>
Cc: michael.christie@oracle.com
---
 include/linux/sched.h |  9 +++++++++
 kernel/fork.c         |  3 +++
 kernel/sched/core.c   | 42 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |  7 +++++++
 4 files changed, 57 insertions(+), 4 deletions(-)
  

Comments

Aaron Lu April 12, 2023, 11:53 a.m. UTC | #1
On Tue, Apr 11, 2023 at 05:41:16PM -0400, Mathieu Desnoyers wrote:
> This WIP patch rate-limits migrations to at most 32 per 10ms window
> for each task.
> 
> The specific migration count and window size can be changed with the
> following defines in kernel/sched/sched.h:
> 
> - SCHED_MIGRATION_WINDOW_NS
> - SCHED_MIGRATION_LIMIT
> 
> Testing is welcome, especially to see if it helps with Aaron's
> migration-heavy workload wrt rseq concurrency id performance
> regression.

Initial testing shows the migration count in a 5s window is still in
the millions and, profile wise, contention didn't change much: still in
the range of single digits to 20% or so during a 3 minute run.

If this limit works, the total migration count should be less than
224 * 2 * 3200 * 5 = 7 million, so it's hard to say whether the limit
works as expected, since the number I captured is indeed below 7
million. 224 * 2 means there are 448 tasks in total, 224 clients and
224 servers; 3200 is the theoretical maximum number of migrations for a
task in a 1s window (32 per 10ms window), and 5 means 5 seconds.
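
Spelling that bound out as a trivial self-contained check (nothing here
beyond the numbers above):

/* Quick check of the theoretical upper bound quoted above. */
#include <stdio.h>

int main(void)
{
	unsigned long tasks = 224 * 2;			/* 224 clients + 224 servers */
	unsigned long per_task_per_sec = 32 * 100;	/* 32 per 10ms window -> 3200/s */
	unsigned long seconds = 5;

	/* 448 * 3200 * 5 = 7168000, i.e. roughly 7 million migrations. */
	printf("bound = %lu migrations\n", tasks * per_task_per_sec * seconds);
	return 0;
}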

I'll play with it more to see how things change.
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 48d48b2c73a5..bfd5e268900c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1316,6 +1316,15 @@ struct task_struct {
 	int				last_mm_cid;	/* Most recent cid in mm */
 	int				mm_cid_active;	/* Whether cid bitmap is active */
 #endif
+	/*
+	 * Keep track of last migration time to compare sched_clock
+	 * locally from a single CPU perspective.
+	 */
+	u64				last_migration_time;
+	/* Time slice used in current migration window. */
+	u64				migration_window_time_slice;
+	/* Number of migrations in current migration window. */
+	u32				migration_count;
 
 	struct tlbflush_unmap_batch	tlb_ubc;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3832bea713c4..791792a218f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,6 +1061,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->last_mm_cid = -1;
 	tsk->mm_cid_active = 0;
 #endif
+	tsk->last_migration_time = 0;
+	tsk->migration_window_time_slice = 0;
+	tsk->migration_count = 0;
 	return tsk;
 
 free_stack:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2aac6f14f21c..a530727b11f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2087,6 +2087,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
 	if (task_on_rq_migrating(p)) {
 		flags |= ENQUEUE_MIGRATED;
 		sched_mm_cid_migrate_to(rq, p);
+		p->last_migration_time = sched_clock();
 	}
 
 	enqueue_task(rq, p, flags);
@@ -3547,17 +3548,47 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	return dest_cpu;
 }
 
+static inline
+bool migration_allowed(struct task_struct *p, u64 current_time)
+{
+	u64 delta = current_time - p->last_migration_time;
+
+	if (delta + p->migration_window_time_slice > SCHED_MIGRATION_WINDOW_NS ||
+	    p->migration_count < SCHED_MIGRATION_LIMIT)
+		return true;
+	return false;
+}
+
+static inline
+void migration_add_delta_to_slice(struct task_struct *p, u64 current_time)
+{
+	u64 delta = current_time - p->last_migration_time;
+
+	if (delta + p->migration_window_time_slice > SCHED_MIGRATION_WINDOW_NS) {
+		/* Reset the migration window if it has ended. */
+		p->migration_window_time_slice = 0;
+		p->migration_count = 0;
+		return;
+	}
+	p->migration_window_time_slice += delta;
+	p->migration_count++;
+}
+
 /*
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
+int select_task_rq(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+	u64 current_time = sched_clock();
+	int cpu = prev_cpu;
+
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
-		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
-	else
+	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
+		if (migration_allowed(p, current_time))
+			cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
+	} else
 		cpu = cpumask_any(p->cpus_ptr);
 
 	/*
@@ -3573,6 +3604,9 @@ int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
 	if (unlikely(!is_cpu_allowed(p, cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
+	if (prev_cpu != cpu)
+		migration_add_delta_to_slice(p, current_time);
+
 	return cpu;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 64220134fb45..e52cc38f10fc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -104,6 +104,13 @@ struct cpuidle_state;
 #define TASK_ON_RQ_QUEUED	1
 #define TASK_ON_RQ_MIGRATING	2
 
+/*
+ * A task can be migrated at most SCHED_MIGRATION_LIMIT times per
+ * sched-migration window of SCHED_MIGRATION_WINDOW_NS.
+ */
+#define SCHED_MIGRATION_WINDOW_NS	(10ULL * 1000000)	/* 10 ms */
+#define SCHED_MIGRATION_LIMIT		32
+
 extern __read_mostly int scheduler_running;
 
 extern unsigned long calc_load_update;