[01/30] preempt: introduce CONFIG_PREEMPT_AUTO

Message ID 20240213055554.1802415-2-ankur.a.arora@oracle.com
State New
Headers
Series PREEMPT_AUTO: support lazy rescheduling |

Commit Message

Ankur Arora Feb. 13, 2024, 5:55 a.m. UTC
  PREEMPT_AUTO adds a new scheduling model which, like PREEMPT_DYNAMIC,
allows dynamic switching between a none/voluntary/full preemption
model. However, unlike PREEMPT_DYNAMIC, it doesn't use explicit
preemption points for the voluntary models.

It works by depending on CONFIG_PREEMPTION (and thus PREEMPT_COUNT),
allowing the scheduler to always know when it is safe to preempt
for all three preemption models.

In addition, it uses an additional need-resched bit
(TIF_NEED_RESCHED_LAZY) which, with TIF_NEED_RESCHED allows the
scheduler to express two kinds of rescheduling intent: schedule at
the earliest opportunity (the usual TIF_NEED_RESCHED semantics), or
express a need for rescheduling while allowing the task on the
runqueue to run to timeslice completion (TIF_NEED_RESCHED_LAZY).

Based on the preemption model in use, the scheduler chooses
need-resched in the following manner:

		TIF_NEED_RESCHED 	TIF_NEED_RESCHED_LAZY

none		never   		always [*]
voluntary       higher sched class	other tasks [*]
full 		always                  never

[*] when preempting idle, or for kernel tasks that are 'urgent' in
some way (ex. resched_cpu() used as an RCU hammer), we use
TIF_NEED_RESCHED.

As mentioned above, the other part is when preemption happens -- when
are the need-resched flags checked:

                 exit-to-user    ret-to-kernel    preempt_count()
NEED_RESCHED_LAZY     Y               N                N
NEED_RESCHED          Y               Y                Y

Exposed under CONFIG_EXPERT for now.

Cc: Peter Ziljstra <peterz@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Originally-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/lkml/87jzshhexi.ffs@tglx/
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 .../admin-guide/kernel-parameters.txt         |  1 +
 include/linux/thread_info.h                   |  8 ++++
 init/Makefile                                 |  1 +
 kernel/Kconfig.preempt                        | 37 +++++++++++++++++--
 4 files changed, 44 insertions(+), 3 deletions(-)
  

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 31b3a25680d0..5d2bd21f98e1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4662,6 +4662,7 @@ 
 
 	preempt=	[KNL]
 			Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+			or CONFIG_PREEMPT_AUTO.
 			none - Limited to cond_resched() calls
 			voluntary - Limited to cond_resched() and might_sleep() calls
 			full - Any section that isn't explicitly preempt disabled
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f4..7b1d9185aac6 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@  enum syscall_work_bit {
 
 #include <asm/thread_info.h>
 
+/*
+ * Fall back to the default behaviour if we don't have CONFIG_PREEMPT_AUTO.
+ */
+#ifndef CONFIG_PREEMPT_AUTO
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
 #ifdef __KERNEL__
 
 #ifndef arch_set_restart_data
diff --git a/init/Makefile b/init/Makefile
index cbac576c57d6..da1dba3116dc 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -27,6 +27,7 @@  smp-flag-$(CONFIG_SMP)			:= SMP
 preempt-flag-$(CONFIG_PREEMPT_BUILD)	:= PREEMPT
 preempt-flag-$(CONFIG_PREEMPT_DYNAMIC)	:= PREEMPT_DYNAMIC
 preempt-flag-$(CONFIG_PREEMPT_RT)	:= PREEMPT_RT
+preempt-flag-$(CONFIG_PREEMPT_AUTO)	:= PREEMPT_AUTO
 
 build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
 build-timestamp = $(or $(KBUILD_BUILD_TIMESTAMP), $(build-timestamp-auto))
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..fe83040ad755 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,13 +11,17 @@  config PREEMPT_BUILD
 	select PREEMPTION
 	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 
+config HAVE_PREEMPT_AUTO
+	bool
+
 choice
 	prompt "Preemption Model"
 	default PREEMPT_NONE
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
-	select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
+	select PREEMPT_NONE_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
+
 	help
 	  This is the traditional Linux preemption model, geared towards
 	  throughput. It will still provide good latencies most of the
@@ -32,7 +36,7 @@  config PREEMPT_NONE
 config PREEMPT_VOLUNTARY
 	bool "Voluntary Kernel Preemption (Desktop)"
 	depends on !ARCH_NO_PREEMPT
-	select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
+	select PREEMPT_VOLUNTARY_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
@@ -95,7 +99,7 @@  config PREEMPTION
 
 config PREEMPT_DYNAMIC
 	bool "Preemption behaviour defined on boot"
-	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+	depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO
 	select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
 	select PREEMPT_BUILD
 	default y if HAVE_PREEMPT_DYNAMIC_CALL
@@ -115,6 +119,33 @@  config PREEMPT_DYNAMIC
 	  Interesting if you want the same pre-built kernel should be used for
 	  both Server and Desktop workloads.
 
+config PREEMPT_AUTO
+	bool "Scheduler controlled preemption model"
+	depends on EXPERT && HAVE_PREEMPT_AUTO && !ARCH_NO_PREEMPT
+	select PREEMPT_BUILD
+	help
+	  This option allows to define the preemption model on the kernel
+	  command line parameter and thus override the default preemption
+	  model selected during compile time.
+
+	  However, note that the compile time choice of preemption model
+	  might impact other kernel options like the specific RCU model.
+
+	  This feature makes the latency of the kernel configurable by
+	  allowing the scheduler to choose when to preempt based on
+	  the preemption policy in effect. It does this without needing
+	  voluntary preemption points.
+
+	  With PREEMPT_NONE: the scheduler allows a task (executing in
+	  user or kernel context) to run to completion, at least until
+	  its current tick expires.
+
+	  With PREEMPT_VOLUNTARY: similar to PREEMPT_NONE, but the scheduler
+	  will also preempt for higher priority class of processes but not
+	  lower.
+
+	  With PREEMPT: the scheduler preempts at the earliest opportunity.
+
 config SCHED_CORE
 	bool "Core Scheduling for SMT"
 	depends on SCHED_SMT