[27/34] sched_ext: Implement SCX_KICK_WAIT

Message ID 20230711011412.100319-28-tj@kernel.org
State New
Series [01/34] cgroup: Implement cgroup_show_cftypes()

Commit Message

Tejun Heo July 11, 2023, 1:13 a.m. UTC
  From: David Vernet <dvernet@meta.com>

If set when calling scx_bpf_kick_cpu(), the invoking CPU will busy-wait for
the kicked CPU to enter the scheduler. This will be used to improve the
exclusion guarantees in scx_pair.

Signed-off-by: David Vernet <dvernet@meta.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
---
 kernel/sched/core.c  |  4 +++-
 kernel/sched/ext.c   | 33 ++++++++++++++++++++++++++++++++-
 kernel/sched/ext.h   | 20 ++++++++++++++++++++
 kernel/sched/sched.h |  2 ++
 4 files changed, 57 insertions(+), 2 deletions(-)
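
For illustration, here is a minimal sketch of how a BPF scheduler might use the
new flag. The kfunc declaration and the helper around it are assumptions for
the example (they presume the usual vmlinux.h/__ksym kfunc setup) and are not
taken from this series:

	/* Illustrative sketch only: kick a sibling CPU and wait for it to
	 * re-enter the scheduler before continuing.
	 */
	extern void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;

	static void kick_sibling_and_wait(s32 sibling_cpu)
	{
		/*
		 * SCX_KICK_PREEMPT forces the target CPU to reschedule;
		 * SCX_KICK_WAIT makes the invoking CPU busy-wait until the
		 * target has gone through pick_next_task() at least once.
		 */
		scx_bpf_kick_cpu(sibling_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
	}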
  

Comments

Andrea Righi July 13, 2023, 1:45 p.m. UTC | #1
On Mon, Jul 10, 2023 at 03:13:45PM -1000, Tejun Heo wrote:
...
> +	for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait,
> +			    cpumask_of(this_cpu)) {
> +		/*
> +		 * Pairs with smp_store_release() issued by this CPU in
> +		 * scx_notify_pick_next_task() on the resched path.
> +		 *
> +		 * We busy-wait here to guarantee that no other task can be
> +		 * scheduled on our core before the target CPU has entered the
> +		 * resched path.
> +		 */
> +		while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu])
> +			cpu_relax();
> +	}
> +

...

> +static inline void scx_notify_pick_next_task(struct rq *rq,
> +					     const struct task_struct *p,
> +					     const struct sched_class *active)
> +{
> +#ifdef CONFIG_SMP
> +	if (!scx_enabled())
> +		return;
> +	/*
> +	 * Pairs with the smp_load_acquire() issued by a CPU in
> +	 * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
> +	 * resched.
> +	 */
> +	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
> +#endif
> +}

We can't use smp_load_acquire()/smp_store_release() with a u64 on
32-bit architectures.

For example, on armhf the build is broken:

In function ‘scx_notify_pick_next_task’,
    inlined from ‘__pick_next_task’ at /<<PKGBUILDDIR>>/kernel/sched/core.c:6106:4,
    inlined from ‘pick_next_task’ at /<<PKGBUILDDIR>>/kernel/sched/core.c:6605:9,
    inlined from ‘__schedule’ at /<<PKGBUILDDIR>>/kernel/sched/core.c:6750:9:
/<<PKGBUILDDIR>>/include/linux/compiler_types.h:397:45: error: call to ‘__compiletime_assert_597’ declared with attribute error: Need native word sized stores/loads for atomicity.
  397 |         _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
      |                                             ^
/<<PKGBUILDDIR>>/include/linux/compiler_types.h:378:25: note: in definition of macro ‘__compiletime_assert’
  378 |                         prefix ## suffix();                             \
      |                         ^~~~~~
/<<PKGBUILDDIR>>/include/linux/compiler_types.h:397:9: note: in expansion of macro ‘_compiletime_assert’
  397 |         _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
      |         ^~~~~~~~~~~~~~~~~~~
/<<PKGBUILDDIR>>/include/linux/compiler_types.h:400:9: note: in expansion of macro ‘compiletime_assert’
  400 |         compiletime_assert(__native_word(t),                            \
      |         ^~~~~~~~~~~~~~~~~~
/<<PKGBUILDDIR>>/include/asm-generic/barrier.h:141:9: note: in expansion of macro ‘compiletime_assert_atomic_type’
  141 |         compiletime_assert_atomic_type(*p);                             \
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/<<PKGBUILDDIR>>/include/asm-generic/barrier.h:172:55: note: in expansion of macro ‘__smp_store_release’
  172 | #define smp_store_release(p, v) do { kcsan_release(); __smp_store_release(p, v); } while (0)
      |                                                       ^~~~~~~~~~~~~~~~~~~
/<<PKGBUILDDIR>>/kernel/sched/ext.h:159:9: note: in expansion of macro ‘smp_store_release’
  159 |         smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);

There's probably a better way to fix this, but for now I've temporarily
solved this using cmpxchg64() - see patch below.

I'm not sure if we already have an equivalent of
smp_store_release_u64()/smp_load_acquire_u64(). Otherwise, it may be worth
adding them to a more generic place.

-Andrea

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 051c79fa25f7..5da72b1cf88d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3667,7 +3667,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 		 * scheduled on our core before the target CPU has entered the
 		 * resched path.
 		 */
-		while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu])
+		while (smp_load_acquire_u64(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu])
 			cpu_relax();
 	}
 
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 405037a4e6ce..ef4a24d77d30 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -144,6 +144,40 @@ void __scx_notify_pick_next_task(struct rq *rq,
 				 struct task_struct *p,
 				 const struct sched_class *active);
 
+#ifdef CONFIG_64BIT
+static inline u64 smp_load_acquire_u64(u64 *ptr)
+{
+	return smp_load_acquire(ptr);
+}
+
+static inline void smp_store_release_u64(u64 *ptr, u64 val)
+{
+	smp_store_release(ptr, val);
+}
+#else
+static inline u64 smp_load_acquire_u64(u64 *ptr)
+{
+	u64 prev, next;
+
+	do {
+		prev = *ptr;
+		next = prev;
+	} while (cmpxchg64(ptr, prev, next) != prev);
+
+	return prev;
+}
+
+static inline void smp_store_release_u64(u64 *ptr, u64 val)
+{
+	u64 prev, next;
+
+	do {
+		prev = *ptr;
+		next = val;
+	} while (cmpxchg64(ptr, prev, next) != prev);
+}
+#endif
+
 static inline void scx_notify_pick_next_task(struct rq *rq,
 					     struct task_struct *p,
 					     const struct sched_class *active)
@@ -156,7 +190,7 @@ static inline void scx_notify_pick_next_task(struct rq *rq,
 	 * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
 	 * resched.
 	 */
-	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+	smp_store_release_u64(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
 #endif
 	if (!static_branch_unlikely(&scx_ops_cpu_preempt))
 		return;
  
Linus Torvalds July 13, 2023, 6:32 p.m. UTC | #2
On Thu, 13 Jul 2023 at 06:46, Andrea Righi <andrea.righi@canonical.com> wrote:
>
> I'm not sure if we already have an equivalent of
> smp_store_release_u64/smp_load_acquire_u64(). Otherwise, it may be worth
> to add them to a more generic place.

Yeah, a 64-bit atomic load/store is not necessarily even possible on
32-bit architectures.

And when it *is* possible, it might be very very expensive indeed (eg
on 32-bit x86, the way to do a 64-bit load would be with "cmpxchg8b",
which is ridiculously slow)

              Linus
  
Tejun Heo July 13, 2023, 7:48 p.m. UTC | #3
Hello,

On Thu, Jul 13, 2023 at 11:32:37AM -0700, Linus Torvalds wrote:
> On Thu, 13 Jul 2023 at 06:46, Andrea Righi <andrea.righi@canonical.com> wrote:
> >
> > I'm not sure if we already have an equivalent of
> > smp_store_release_u64/smp_load_acquire_u64(). Otherwise, it may be worth
> > to add them to a more generic place.
> 
> Yeah, a 64-bit atomic load/store is not necessarily even possible on
> 32-bit architectures.
> 
> And when it *is* possible, it might be very very expensive indeed (eg
> on 32-bit x86, the way to do a 64-bit load would be with "cmpxchg8b",
> which is ridiculously slow)

There are two places where sched_ext is depending on atomic load/store.
One's this pnt_seq which is using smp_store_release/load_acquire(). The
other is task_struct->scx.ops_state which uses atomic64_read_acquire() and
atomic64_store_release(). atomic64 ops are implemented with spinlocks on
32-bit by default, which is probably why Andrea didn't hit it.

pnt_seq is a per-cpu counter for successful pick_next_task's from sched_ext
and is used to tell "has at least one pick_next_task() succeeded after my
kicking that CPU".

p->scx.ops_state has an embedded qseq counter (2 bits for state flags, the
rest for the counter; I gotta change the masks to macros too) which is used
to detect whether the task has been dequeued and re-enqueued while a CPU is
trying to double-lock rq's for task migration.
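
A rough sketch of that layout (the macro names here are hypothetical; the
actual masks in ext.c may differ):

	/* hypothetical names, illustrating "2 bits of state + qseq counter" */
	#define SCX_OPSS_QSEQ_SHIFT	2
	#define SCX_OPSS_STATE_MASK	((1LLU << SCX_OPSS_QSEQ_SHIFT) - 1)	/* low 2 bits: state */
	#define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)			/* remaining bits: qseq */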

As both are used to detect races in very short and immediate time windows,
using 32 bits and 30 bits respectively should be safe in practice. E.g. while
it's theoretically possible for the task to be dequeued and re-enqueued
exactly 2^30 times while the CPU is trying to switch rq locks, I don't think
that's practically possible without something going very wrong with the
machine (e.g. NMI / SMI).

I'll note the above and change both to unsigned longs.

Thanks.
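
For reference, a minimal sketch of the unsigned-long direction described above
(an illustration of the stated plan, not the actual follow-up change):

	/*
	 * Sketch: make the sequence counters native words so that
	 * smp_store_release()/smp_load_acquire() are legal on 32-bit too.
	 * The call sites in scx_notify_pick_next_task() and
	 * kick_cpus_irq_workfn() can then stay exactly as in this patch.
	 */
	struct scx_rq {
		/* ... other fields as in this patch ... */
		unsigned long		pnt_seq;		/* was u64 */
	};

	static unsigned long __percpu *scx_kick_cpus_pnt_seqs;	/* was u64 */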
  

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77eb4ee4f759..878e84694a6e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6052,8 +6052,10 @@  __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	for_each_active_class(class) {
 		p = class->pick_next_task(rq);
-		if (p)
+		if (p) {
+			scx_notify_pick_next_task(rq, p, class);
 			return p;
+		}
 	}
 
 	BUG(); /* The idle class should always have a runnable task. */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5862e8290207..48a8881ff01f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -125,6 +125,9 @@  static struct {
 
 #endif	/* CONFIG_SMP */
 
+/* for %SCX_KICK_WAIT */
+static u64 __percpu *scx_kick_cpus_pnt_seqs;
+
 /*
  * Direct dispatch marker.
  *
@@ -3269,6 +3272,7 @@  static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
 static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 {
 	struct rq *this_rq = this_rq();
+	u64 *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
 	int this_cpu = cpu_of(this_rq);
 	int cpu;
 
@@ -3282,14 +3286,32 @@  static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 			if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) &&
 			    rq->curr->sched_class == &ext_sched_class)
 				rq->curr->scx.slice = 0;
+			pseqs[cpu] = rq->scx.pnt_seq;
 			resched_curr(rq);
+		} else {
+			cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait);
 		}
 
 		raw_spin_rq_unlock_irqrestore(rq, flags);
 	}
 
+	for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait,
+			    cpumask_of(this_cpu)) {
+		/*
+		 * Pairs with smp_store_release() issued by this CPU in
+		 * scx_notify_pick_next_task() on the resched path.
+		 *
+		 * We busy-wait here to guarantee that no other task can be
+		 * scheduled on our core before the target CPU has entered the
+		 * resched path.
+		 */
+		while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu])
+			cpu_relax();
+	}
+
 	cpumask_clear(this_rq->scx.cpus_to_kick);
 	cpumask_clear(this_rq->scx.cpus_to_preempt);
+	cpumask_clear(this_rq->scx.cpus_to_wait);
 }
 
 void __init init_sched_ext_class(void)
@@ -3303,7 +3325,7 @@  void __init init_sched_ext_class(void)
 	 * through the generated vmlinux.h.
 	 */
 	WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP |
-		   SCX_TG_ONLINE);
+		   SCX_TG_ONLINE | SCX_KICK_PREEMPT);
 
 	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
 	init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
@@ -3311,6 +3333,12 @@  void __init init_sched_ext_class(void)
 	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
 #endif
+	scx_kick_cpus_pnt_seqs =
+		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) *
+			       num_possible_cpus(),
+			       __alignof__(scx_kick_cpus_pnt_seqs[0]));
+	BUG_ON(!scx_kick_cpus_pnt_seqs);
+
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 
@@ -3319,6 +3347,7 @@  void __init init_sched_ext_class(void)
 
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
 	}
 
@@ -3585,6 +3614,8 @@  void scx_bpf_kick_cpu(s32 cpu, u64 flags)
 	cpumask_set_cpu(cpu, rq->scx.cpus_to_kick);
 	if (flags & SCX_KICK_PREEMPT)
 		cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt);
+	if (flags & SCX_KICK_WAIT)
+		cpumask_set_cpu(cpu, rq->scx.cpus_to_wait);
 
 	irq_work_queue(&rq->scx.kick_cpus_irq_work);
 	preempt_enable();
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index c3404a0a7637..abb283ac3bc7 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -65,6 +65,7 @@  enum scx_pick_idle_cpu_flags {
 
 enum scx_kick_flags {
 	SCX_KICK_PREEMPT	= 1LLU << 0,	/* force scheduling on the CPU */
+	SCX_KICK_WAIT		= 1LLU << 1,	/* wait for the CPU to be rescheduled */
 };
 
 enum scx_tg_flags {
@@ -115,6 +116,22 @@  __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
 #define scx_ops_error(fmt, args...)						\
 	scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args)
 
+static inline void scx_notify_pick_next_task(struct rq *rq,
+					     const struct task_struct *p,
+					     const struct sched_class *active)
+{
+#ifdef CONFIG_SMP
+	if (!scx_enabled())
+		return;
+	/*
+	 * Pairs with the smp_load_acquire() issued by a CPU in
+	 * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
+	 * resched.
+	 */
+	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+#endif
+}
+
 static inline void scx_notify_sched_tick(void)
 {
 	unsigned long last_check;
@@ -170,6 +187,9 @@  static inline int scx_check_setscheduler(struct task_struct *p,
 					 int policy) { return 0; }
 static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
 static inline void init_sched_ext_class(void) {}
+static inline void scx_notify_pick_next_task(struct rq *rq,
+					     const struct task_struct *p,
+					     const struct sched_class *active) {}
 static inline void scx_notify_sched_tick(void) {}
 
 #define for_each_active_class		for_each_class
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 00bf33fdbd64..ce6e0a73135b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -710,6 +710,8 @@  struct scx_rq {
 	u32			flags;
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_preempt;
+	cpumask_var_t		cpus_to_wait;
+	u64			pnt_seq;
 	struct irq_work		kick_cpus_irq_work;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
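
Putting the pieces of the patch together, the SCX_KICK_WAIT flow looks roughly
as follows (a condensed paraphrase of the code above for readability, not
additional code in the series):

	/* kicking CPU, in kick_cpus_irq_workfn() */
	pseqs[cpu] = cpu_rq(cpu)->scx.pnt_seq;	/* snapshot under the target's rq lock */
	resched_curr(cpu_rq(cpu));		/* force the target into the scheduler */
	/* ... target rq lock dropped ... */
	while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu])
		cpu_relax();			/* wait for one pick_next_task() on the target */

	/* target CPU, in scx_notify_pick_next_task() via __pick_next_task() */
	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);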