The scheduler uses PREEMPT_COUNT and PREEMPTION to drive
preemption: the first to demarcate non-preemptible sections and
the second for the actual mechanics of preemption.
Enable both for voluntary preemption models.
In addition, define a new scheduler feature FORCE_PREEMPT which
can now be used to distinguish between voluntary and full
preemption models at runtime.
Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
init/Makefile | 2 +-
kernel/Kconfig.preempt | 12 ++++++++----
kernel/entry/common.c | 3 +--
kernel/sched/core.c | 26 +++++++++++---------------
kernel/sched/features.h | 6 ++++++
5 files changed, 27 insertions(+), 22 deletions(-)
On Tue, Nov 07, 2023 at 01:57:29PM -0800, Ankur Arora wrote:
> The scheduler uses PREEMPT_COUNT and PREEMPTION to drive
> preemption: the first to demarcate non-preemptible sections and
> the second for the actual mechanics of preemption.
>
> Enable both for voluntary preemption models.
>
> In addition, define a new scheduler feature FORCE_PREEMPT which
> can now be used to distinguish between voluntary and full
> preemption models at runtime.
>
> Originally-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
> ---
> init/Makefile | 2 +-
> kernel/Kconfig.preempt | 12 ++++++++----
> kernel/entry/common.c | 3 +--
> kernel/sched/core.c | 26 +++++++++++---------------
> kernel/sched/features.h | 6 ++++++
> 5 files changed, 27 insertions(+), 22 deletions(-)
>
> diff --git a/init/Makefile b/init/Makefile
> index 385fd80fa2ef..99e480f24cf3 100644
> --- a/init/Makefile
> +++ b/init/Makefile
> @@ -24,7 +24,7 @@ mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
> #
>
> smp-flag-$(CONFIG_SMP) := SMP
> -preempt-flag-$(CONFIG_PREEMPT) := PREEMPT
> +preempt-flag-$(CONFIG_PREEMPTION) := PREEMPT_DYNAMIC
> preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT
>
> build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index aa87b5cd3ecc..074fe5e253b5 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -6,20 +6,23 @@ choice
>
> config PREEMPT_NONE
> bool "No Forced Preemption (Server)"
> + select PREEMPTION
> help
> This is the traditional Linux preemption model, geared towards
> throughput. It will still provide good latencies most of the
> - time, but there are no guarantees and occasional longer delays
> - are possible.
> + time, but occasional delays are possible.
>
> Select this option if you are building a kernel for a server or
> scientific/computation system, or if you want to maximize the
> raw processing power of the kernel, irrespective of scheduling
> - latencies.
> + latencies. Unless your architecture actively disables preemption,
> + you can always switch to one of the other preemption models
> + at runtime.
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index 6433e6c77185..f7f2efabb5b5 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -422,8 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
> }
>
> instrumentation_begin();
> - if (IS_ENABLED(CONFIG_PREEMPTION))
> - irqentry_exit_cond_resched();
> + irqentry_exit_cond_resched();
> /* Covers both tracing and lockdep */
> trace_hardirqs_on();
> instrumentation_end();
I'm confused by the PREEMPT_NONE changes here — why does PREEMPT_NONE
now select PREEMPTION? How does that make sense?
@@ -24,7 +24,7 @@ mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
#
smp-flag-$(CONFIG_SMP) := SMP
-preempt-flag-$(CONFIG_PREEMPT) := PREEMPT
+preempt-flag-$(CONFIG_PREEMPTION) := PREEMPT_DYNAMIC
preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT
build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
@@ -6,20 +6,23 @@ choice
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ select PREEMPTION
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
- time, but there are no guarantees and occasional longer delays
- are possible.
+ time, but occasional delays are possible.
Select this option if you are building a kernel for a server or
scientific/computation system, or if you want to maximize the
raw processing power of the kernel, irrespective of scheduling
- latencies.
+ latencies. Unless your architecture actively disables preemption,
+ you can always switch to one of the other preemption models
+ at runtime.
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ select PREEMPTION
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -53,7 +56,8 @@ config PREEMPT
Select this if you are building a kernel for a desktop or
embedded system with latency requirements in the milliseconds
- range.
+ range. You can always switch to one of the lower preemption options
+ at runtime.
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
@@ -422,8 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION))
- irqentry_exit_cond_resched();
+ irqentry_exit_cond_resched();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
@@ -1065,7 +1065,7 @@ void __resched_curr(struct rq *rq, resched_t rs)
*
* Always schedule eagerly, if:
*
- * - running under full preemption
+ * - running under full preemption (sched_feat(FORCE_PREEMPT))
*
* - idle: when not polling (or if we don't have TIF_POLLING_NRFLAG)
* force TIF_NEED_RESCHED to be set and send a resched IPI.
@@ -1081,7 +1081,7 @@ void resched_curr(struct rq *rq)
resched_t rs = RESCHED_lazy;
int context;
- if (IS_ENABLED(CONFIG_PREEMPT) ||
+ if (sched_feat(FORCE_PREEMPT) ||
(rq->curr->sched_class == &idle_sched_class)) {
rs = RESCHED_eager;
goto resched;
@@ -1108,7 +1108,6 @@ void resched_curr(struct rq *rq)
context = ct_state_cpu(cpu_of(rq));
if ((context == CONTEXT_USER) ||
(context == CONTEXT_GUEST)) {
-
rs = RESCHED_eager;
goto resched;
}
@@ -6597,20 +6596,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
*
- * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
- * paths. For example, see arch/x86/entry_64.S.
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and TIF_NEED_RESCHED[_LAZY]
+ * flags on userspace return paths. For example, see arch/x86/entry_64.S.
*
- * To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * To drive preemption between tasks, the scheduler sets one of these
+ * flags in timer interrupt handler scheduler_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
*
- * Now, if the new task added to the run-queue preempts the current
- * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
- * called on the nearest possible occasion:
- *
- * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
+ * - Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -6619,10 +6616,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
- * then at the next:
+ * - If the new task preempts the current task, but the scheduling
+ * policy only preempts voluntarily, then at the next:
*
- * - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
@@ -89,3 +89,9 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(HZ_BW, true)
+
+#if defined(CONFIG_PREEMPT)
+SCHED_FEAT(FORCE_PREEMPT, true)
+#else
+SCHED_FEAT(FORCE_PREEMPT, false)
+#endif