[4/6] seccomp: add the synchronous mode for seccomp_unotify

Message ID 20230308073201.3102738-5-avagin@google.com
State New
Headers
Series seccomp: add the synchronous mode for seccomp_unotify |

Commit Message

Andrei Vagin March 8, 2023, 7:31 a.m. UTC
  seccomp_unotify allows more privileged processes do actions on behalf
of less privileged processes.

In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes controls to a supervisor
process that handles the system call and returns controls to the target
process. In this context, "synchronous" means that only one process is
running and another one is waiting.

There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
move the wakee to the current CPU. For such synchronous workflows, it
makes context switches a few times faster.

Right now, each interaction takes 12µs. With this patch, it takes about
3µs.

This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
it used to enable the sync mode.

Signed-off-by: Andrei Vagin <avagin@google.com>
---
 include/uapi/linux/seccomp.h |  4 ++++
 kernel/seccomp.c             | 31 +++++++++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)
  

Comments

Andy Lutomirski April 6, 2023, 3:42 a.m. UTC | #1
On Tue, Mar 7, 2023 at 11:32 PM Andrei Vagin <avagin@google.com> wrote:
>
> seccomp_unotify allows more privileged processes do actions on behalf
> of less privileged processes.
>
> In many cases, the workflow is fully synchronous. It means a target
> process triggers a system call and passes controls to a supervisor
> process that handles the system call and returns controls to the target
> process. In this context, "synchronous" means that only one process is
> running and another one is waiting.
>
> There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> move the wakee to the current CPU. For such synchronous workflows, it
> makes context switches a few times faster.
>
> Right now, each interaction takes 12µs. With this patch, it takes about
> 3µs.

This is great, but:

>
> This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> it used to enable the sync mode.

Other than being faster, what does this flag actually do in terms of
user-visible semantics?

>
> Signed-off-by: Andrei Vagin <avagin@google.com>
> ---
>  include/uapi/linux/seccomp.h |  4 ++++
>  kernel/seccomp.c             | 31 +++++++++++++++++++++++++++++--
>  2 files changed, 33 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 0fdc6ef02b94..dbfc9b37fcae 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
>         __u32 flags;
>  };
>
> +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
> +
>  /* valid flags for seccomp_notif_addfd */
>  #define SECCOMP_ADDFD_FLAG_SETFD       (1UL << 0) /* Specify remote fd */
>  #define SECCOMP_ADDFD_FLAG_SEND                (1UL << 1) /* Addfd and return it, atomically */
> @@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
>  #define SECCOMP_IOCTL_NOTIF_ADDFD      SECCOMP_IOW(3, \
>                                                 struct seccomp_notif_addfd)
>
> +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
> +
>  #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 9fca9345111c..d323edeae7da 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -143,9 +143,12 @@ struct seccomp_kaddfd {
>   *           filter->notify_lock.
>   * @next_id: The id of the next request.
>   * @notifications: A list of struct seccomp_knotif elements.
> + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
>   */
> +
>  struct notification {
>         atomic_t requests;
> +       u32 flags;
>         u64 next_id;
>         struct list_head notifications;
>  };
> @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
>         INIT_LIST_HEAD(&n.addfd);
>
>         atomic_inc(&match->notif->requests);
> -       wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> +       if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +               wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> +       else
> +               wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>
>         /*
>          * This is where we wait for a reply from userspace.
> @@ -1593,7 +1599,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
>         knotif->error = resp.error;
>         knotif->val = resp.val;
>         knotif->flags = resp.flags;
> -       complete(&knotif->ready);
> +       if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +               complete_on_current_cpu(&knotif->ready);
> +       else
> +               complete(&knotif->ready);
>  out:
>         mutex_unlock(&filter->notify_lock);
>         return ret;
> @@ -1623,6 +1632,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
>         return ret;
>  }
>
> +static long seccomp_notify_set_flags(struct seccomp_filter *filter,
> +                                   unsigned long flags)
> +{
> +       long ret;
> +
> +       if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> +               return -EINVAL;
> +
> +       ret = mutex_lock_interruptible(&filter->notify_lock);
> +       if (ret < 0)
> +               return ret;
> +       filter->notif->flags = flags;
> +       mutex_unlock(&filter->notify_lock);
> +       return 0;
> +}
> +
>  static long seccomp_notify_addfd(struct seccomp_filter *filter,
>                                  struct seccomp_notif_addfd __user *uaddfd,
>                                  unsigned int size)
> @@ -1752,6 +1777,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>         case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
>         case SECCOMP_IOCTL_NOTIF_ID_VALID:
>                 return seccomp_notify_id_valid(filter, buf);
> +       case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
> +               return seccomp_notify_set_flags(filter, arg);
>         }
>
>         /* Extensible Argument ioctls */
> --
> 2.40.0.rc0.216.gc4246ad0f0-goog
>
  
Andrei Vagin April 10, 2023, 6:59 a.m. UTC | #2
On Wed, Apr 5, 2023 at 8:42 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Tue, Mar 7, 2023 at 11:32 PM Andrei Vagin <avagin@google.com> wrote:
> >
> > seccomp_unotify allows more privileged processes to do actions on behalf
> > of less privileged processes.
> >
> > In many cases, the workflow is fully synchronous. It means a target
> > process triggers a system call and passes controls to a supervisor
> > process that handles the system call and returns controls to the target
> > process. In this context, "synchronous" means that only one process is
> > running and another one is waiting.
> >
> > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> > move the wakee to the current CPU. For such synchronous workflows, it
> > makes context switches a few times faster.
> >
> > Right now, each interaction takes 12µs. With this patch, it takes about
> > 3µs.
>
> This is great, but:
>
> >
> > This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> > it used to enable the sync mode.
>
> Other than being faster, what does this flag actually do in terms of
> user-visible semantics?

In short, the process handling an event wakes up on the same cpu where the
process that triggered the event has been running. Knowing this fact, the user
can understand when it is appropriate to use this flag.

Let's imagine that we have two processes where one calls syscalls (the
target) and another one handles these syscalls (the supervisor). In this case,
the user should see that both processes are running on the same cpu.

If we have one target process and one supervisor process, they synchronously
swap with each other and don't need to run on cpu concurrently.  But
it becomes more
complicated if one supervisor handles a group of target processes. In this
case, setting the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag depends on the
frequency of events. If the supervisor often has pending events (doesn't sleep
between events), it is better to unset the flag or add more supervisor
processes.

Thanks,
Andrei
  
Andy Lutomirski April 10, 2023, 8:53 p.m. UTC | #3
On Sun, Apr 9, 2023, at 11:59 PM, Andrei Vagin wrote:
> On Wed, Apr 5, 2023 at 8:42 PM Andy Lutomirski <luto@kernel.org> wrote:
>>
>> On Tue, Mar 7, 2023 at 11:32 PM Andrei Vagin <avagin@google.com> wrote:
>> >
>> > seccomp_unotify allows more privileged processes to do actions on behalf
>> > of less privileged processes.
>> >
>> > In many cases, the workflow is fully synchronous. It means a target
>> > process triggers a system call and passes controls to a supervisor
>> > process that handles the system call and returns controls to the target
>> > process. In this context, "synchronous" means that only one process is
>> > running and another one is waiting.
>> >
>> > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
>> > move the wakee to the current CPU. For such synchronous workflows, it
>> > makes context switches a few times faster.
>> >
>> > Right now, each interaction takes 12µs. With this patch, it takes about
>> > 3µs.
>>
>> This is great, but:
>>
>> >
>> > This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
>> > it used to enable the sync mode.
>>
>> Other than being faster, what does this flag actually do in terms of
>> user-visible semantics?
>
> In short, the process handling an event wakes up on the same cpu where the
> process that triggered the event has been running. Knowing this fact, the user
> can understand when it is appropriate to use this flag.
>
> Let's imagine that we have two processes where one calls syscalls (the
> target) and another one handles these syscalls (the supervisor). In this case,
> the user should see that both processes are running on the same cpu.

So I think you're saying it has no semantic effect.  It's a performance thing.

>
> If we have one target process and one supervisor process, they synchronously
> swap with each other and don't need to run on cpu concurrently.  But
> it becomes more
> complicated if one supervisor handles a group of target processes. In this
> case, setting the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag depends on the
> frequency of events. If the supervisor often has pending events (doesn't sleep
> between events), it is better to unset the flag or add more supervisor
> processes.

ISTM the kernel ought to be able to handle this much more automatically.  The scheduler knows whether the target is running and how busy it has been.

I'm not sure what the right heuristic is in the kernel, but I'm also not convinced that user code has any particular insight here.
  

Patch

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0fdc6ef02b94..dbfc9b37fcae 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -115,6 +115,8 @@  struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+
 /* valid flags for seccomp_notif_addfd */
 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
 #define SECCOMP_ADDFD_FLAG_SEND		(1UL << 1) /* Addfd and return it, atomically */
@@ -150,4 +152,6 @@  struct seccomp_notif_addfd {
 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3, \
 						struct seccomp_notif_addfd)
 
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS	SECCOMP_IOW(4, __u64)
+
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 9fca9345111c..d323edeae7da 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,9 +143,12 @@  struct seccomp_kaddfd {
  *           filter->notify_lock.
  * @next_id: The id of the next request.
  * @notifications: A list of struct seccomp_knotif elements.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
  */
+
 struct notification {
 	atomic_t requests;
+	u32 flags;
 	u64 next_id;
 	struct list_head notifications;
 };
@@ -1117,7 +1120,10 @@  static int seccomp_do_user_notification(int this_syscall,
 	INIT_LIST_HEAD(&n.addfd);
 
 	atomic_inc(&match->notif->requests);
-	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+	if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+	else
+		wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
 
 	/*
 	 * This is where we wait for a reply from userspace.
@@ -1593,7 +1599,10 @@  static long seccomp_notify_send(struct seccomp_filter *filter,
 	knotif->error = resp.error;
 	knotif->val = resp.val;
 	knotif->flags = resp.flags;
-	complete(&knotif->ready);
+	if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		complete_on_current_cpu(&knotif->ready);
+	else
+		complete(&knotif->ready);
 out:
 	mutex_unlock(&filter->notify_lock);
 	return ret;
@@ -1623,6 +1632,22 @@  static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+				    unsigned long flags)
+{
+	long ret;
+
+	if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+	filter->notif->flags = flags;
+	mutex_unlock(&filter->notify_lock);
+	return 0;
+}
+
 static long seccomp_notify_addfd(struct seccomp_filter *filter,
 				 struct seccomp_notif_addfd __user *uaddfd,
 				 unsigned int size)
@@ -1752,6 +1777,8 @@  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+		return seccomp_notify_set_flags(filter, arg);
 	}
 
 	/* Extensible Argument ioctls */