[3/5] sched: add a few helpers to wake up tasks on the current cpu
Commit Message
Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake
up processes on the current CPU.
Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
include/linux/completion.h | 1 +
include/linux/swait.h | 1 +
include/linux/wait.h | 3 +++
kernel/sched/completion.c | 12 ++++++++++++
kernel/sched/core.c | 2 +-
kernel/sched/swait.c | 11 +++++++++++
kernel/sched/wait.c | 5 +++++
7 files changed, 34 insertions(+), 1 deletion(-)
Comments
On Wed, Oct 19, 2022 at 06:10:46PM -0700, Andrei Vagin wrote:
> Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake
> up processes on the current CPU.
There is an astounding lack of *why* in this changelog.
On Thu, Oct 20, 2022 at 04:15:04PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 19, 2022 at 06:10:46PM -0700, Andrei Vagin wrote:
> > Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake
> > up processes on the current CPU.
>
> There is an astounding lack of *why* in this changelog.
I use them in the next patch to handle seccomp user notify requests
faster.
The seccomp notify mechanism allows less privileged processes to offload
specific syscalls to more privileged processes. In many cases, the
workflow is fully synchronous. It means a target process triggers a
system call, the kernel stops it and wakes up a supervisor process that
handles the system call and returns control back to the target process.
In this context, "synchronous" means that only one process is running
and another one is waiting.
The new helpers advise the scheduler to move the wakee to the current CPU.
For synchronous workflows like the one described above, these helpers make
context switches a few times faster.
For example, using these helpers reduces the seccomp user notify
round-trip time from 12µs to 3µs.
Thanks,
Andrei
On Thu, Oct 20, 2022 at 5:44 PM Andrei Vagin <avagin@gmail.com> wrote:
>
> On Thu, Oct 20, 2022 at 04:15:04PM +0200, Peter Zijlstra wrote:
> > On Wed, Oct 19, 2022 at 06:10:46PM -0700, Andrei Vagin wrote:
> > > Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake
> > > up processes on the current CPU.
> >
> > There is an astounding lack of *why* in this changelog.
>
> I use them in the next patch to handle seccomp user notify requests
> faster.
>
> The seccomp notify mechanism allows less privileged processes to offload
> specific syscalls to more privileged processes. In many cases, the
> workflow is fully synchronous. It means a target process triggers a
> system call, the kernel stops it and wakes up a supervisor process that
> handles the system call and returns controls back to the target process.
> In this context, "synchronous" means that only one process is running
> and another one is waiting.
>
> New helpers advices the scheduler to move the wakee to the current CPU.
> For synchronous workflows like described above, these helpers makes
> context switches a few times faster.
Peter,
I've found that I don't understand why WF_SYNC doesn't work in this
case. The test from the last patch shows performance improvements in the
case of WF_CURRENT_CPU, but WF_SYNC doesn't make any difference. I
looked at the code and found that select_task_rq_fair calls
select_idle_sibling, but it doesn't take into account the sync flag.
Does it make sense to do something like this:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ebe7222664c..c29f758ccfe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7249,7 +7249,8 @@ select_task_rq_fair(struct task_struct *p, int
prev_cpu, int wake_flags)
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ if (!(sync && cpu == new_cpu && this_rq()->nr_running == 1))
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
}
rcu_read_unlock();
With this patch, the test shows the same numbers for WF_CURRENT_CPU and WF_SYNC.
Thanks,
Andrei
@@ -116,6 +116,7 @@ extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
+extern void complete_on_current_cpu(struct completion *x);
extern void complete_all(struct completion *);
#endif
@@ -147,6 +147,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
extern void swake_up_one(struct swait_queue_head *q);
extern void swake_up_all(struct swait_queue_head *q);
extern void swake_up_locked(struct swait_queue_head *q);
+extern void swake_up_locked_on_current_cpu(struct swait_queue_head *q);
extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
@@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
}
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
@@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
+#define wake_up_poll_on_current_cpu(x, m) \
+ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m) \
@@ -38,6 +38,18 @@ void complete(struct completion *x)
}
EXPORT_SYMBOL(complete);
+void complete_on_current_cpu(struct completion *x)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked_on_current_cpu(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
@@ -6822,7 +6822,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -31,6 +31,17 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
+void swake_up_locked_on_current_cpu(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ try_to_wake_up(curr->task, TASK_NORMAL, WF_CURRENT_CPU);
+ list_del_init(&curr->task_list);
+}
/*
* Wake up all waiters. This is an interface which is solely exposed for
* completions and not for general usage.
@@ -157,6 +157,11 @@ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/