[RFC,v2,2/5] sched: Note schedule() invocations at return-to-user with SM_USER

Message ID 20240202080920.3337862-3-vschneid@redhat.com
State New
Series sched/fair: Defer CFS throttle to user entry

Commit Message

Valentin Schneider Feb. 2, 2024, 8:09 a.m. UTC
  task_struct.in_return_to_user is currently updated via atomic operations in
schedule_usermode().

However, one can note:
o .in_return_to_user is only updated for the current task
o There are no remote (smp_processor_id() != task_cpu(p)) accesses to
  .in_return_to_user

Add schedule_with_mode() to factor out the body of schedule() so that
different flags can be passed down to __schedule_loop().

Add SM_USER to denote schedule() calls from return-to-userspace points.

Update .in_return_to_user from within the preemption-disabled, rq_lock-held
part of __schedule().

Suggested-by: Benjamin Segall <bsegall@google.com>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 include/linux/sched.h |  2 +-
 kernel/sched/core.c   | 43 ++++++++++++++++++++++++++++++++-----------
 kernel/sched/fair.c   | 17 ++++++++++++++++-
 3 files changed, 49 insertions(+), 13 deletions(-)
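
For readers following along outside the kernel tree, the flag handling above
can be illustrated with a minimal standalone userspace sketch. This is not
kernel code and not part of the patch; mock_schedule() and the global
in_return_to_user below are stand-ins that mimic how the caller-only SM_USER
bit is noted once at the top of __schedule() and then cleared, so the core
logic only ever sees the pre-existing SM_* values, and why a plain int flag
suffices when only the current task writes it:

#include <stdbool.h>
#include <stdio.h>

#define SM_NONE		0x0
#define SM_PREEMPT	0x1
#define SM_RTLOCK_WAIT	0x2
#define SM_USER		0x4	/* consumed and cleared before the core logic */

/* Plain int, as in the patch: only the (mock) current task ever writes it. */
static int in_return_to_user;

static void mock_schedule(unsigned int sched_mode)
{
	/* Mirrors the new hunk at the top of __schedule(): note the flag, then drop it. */
	if (sched_mode & SM_USER) {
		in_return_to_user = true;
		sched_mode &= ~SM_USER;
	}

	/* From here on, sched_mode only carries the pre-existing bits. */
	printf("core sees mode %#x, in_return_to_user=%d\n",
	       sched_mode, in_return_to_user);

	/* Mirrors finish_lock_switch(): the flag is cleared once the switch is done. */
	in_return_to_user = false;
}

int main(void)
{
	mock_schedule(SM_USER);	/* schedule_usermode() -> schedule_with_mode(SM_USER) */
	mock_schedule(SM_NONE);	/* plain schedule() */
	return 0;
}

Note that with CONFIG_CFS_BANDWIDTH disabled, SM_USER is defined as SM_NONE,
so schedule_usermode() degrades to a plain schedule() call and the flag never
reaches __schedule().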
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a0105d1eaa21..1b6f17b2150a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,7 +1544,7 @@  struct task_struct {
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
-	atomic_t			in_return_to_user;
+	int				in_return_to_user;
 #endif
 	/*
 	 * New fields for task_struct should be added above here, so that
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7c028fad5a89..54e6690626b13 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4531,7 +4531,7 @@  static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	INIT_LIST_HEAD(&p->se.kernel_node);
-	atomic_set(&p->in_return_to_user, 0);
+	p->in_return_to_user            = false;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -5147,6 +5147,9 @@  prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
 
 static inline void finish_lock_switch(struct rq *rq)
 {
+#ifdef CONFIG_CFS_BANDWIDTH
+	current->in_return_to_user = false;
+#endif
 	/*
 	 * If we are tracking spinlock dependencies then we have to
 	 * fix up the runqueue lock - which gets 'carried over' from
@@ -6562,6 +6565,18 @@  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #define SM_PREEMPT		0x1
 #define SM_RTLOCK_WAIT		0x2
 
+/*
+ * Special case for CFS_BANDWIDTH where we need to know if the call to
+ * __schedule() directly precedes an entry into userspace.
+ * The flag is cleared from the mode argument as soon as it has been consumed,
+ * so as not to interfere with the SM_MASK_PREEMPT optimisation below.
+ */
+#ifdef CONFIG_CFS_BANDWIDTH
+# define SM_USER                0x4
+#else
+# define SM_USER                SM_NONE
+#endif
+
 #ifndef CONFIG_PREEMPT_RT
 # define SM_MASK_PREEMPT	(~0U)
 #else
@@ -6646,6 +6661,14 @@  static void __sched notrace __schedule(unsigned int sched_mode)
 	rq_lock(rq, &rf);
 	smp_mb__after_spinlock();
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	if (sched_mode & SM_USER) {
+		prev->in_return_to_user = true;
+		sched_mode &= ~SM_USER;
+	}
+#endif
+	SCHED_WARN_ON(sched_mode & SM_USER);
+
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
@@ -6807,7 +6830,7 @@  static __always_inline void __schedule_loop(unsigned int sched_mode)
 	} while (need_resched());
 }
 
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void schedule_with_mode(unsigned int sched_mode)
 {
 	struct task_struct *tsk = current;
 
@@ -6817,22 +6840,20 @@  asmlinkage __visible void __sched schedule(void)
 
 	if (!task_is_running(tsk))
 		sched_submit_work(tsk);
-	__schedule_loop(SM_NONE);
+	__schedule_loop(sched_mode);
 	sched_update_worker(tsk);
 }
+
+asmlinkage __visible void __sched schedule(void)
+{
+	schedule_with_mode(SM_NONE);
+}
 EXPORT_SYMBOL(schedule);
 
 asmlinkage __visible void __sched schedule_usermode(void)
 {
 #ifdef CONFIG_CFS_BANDWIDTH
-	/*
-	 * This is only atomic because of this simple implementation. We could
-	 * do something with an SM_USER to avoid other-cpu scheduler operations
-	 * racing against these writes.
-	 */
-	atomic_set(&current->in_return_to_user, true);
-	schedule();
-	atomic_set(&current->in_return_to_user, false);
+	schedule_with_mode(SM_USER);
 #else
 	schedule();
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a1808459a5acc..96504be6ee14a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6631,7 +6631,22 @@  static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int c
 
 static bool is_kernel_task(struct task_struct *p)
 {
-	return sysctl_sched_cfs_bandwidth_kernel_bypass && !atomic_read(&p->in_return_to_user);
+	/*
+	 * The flag is updated within __schedule() with preemption disabled,
+	 * under the rq lock, and only when the task is current.
+	 *
+	 * Holding the rq lock for that task's CPU is thus sufficient for the
+	 * value to be stable, if the task is enqueued.
+	 *
+	 * If the task is dequeued, then task_cpu(p) *can* change, but the only
+	 * such caller so far is enqueue_task_fair(), which means either:
+	 * - the task is being activated, its CPU has been set previously in ttwu()
+	 * - the task is going through a "change" cycle (e.g. sched_move_task()),
+	 *   the pi_lock is also held so the CPU is stable.
+	 */
+	lockdep_assert_rq_held(cpu_rq(task_cpu(p)));
+
+	return sysctl_sched_cfs_bandwidth_kernel_bypass && !p->in_return_to_user;
 }
 
 /*