[net-next,v4,1/4] scm: add SO_PASSPIDFD and SCM_PIDFD

Message ID 20230413133355.350571-2-aleksandr.mikhalitsyn@canonical.com
State New
Headers
Series Add SCM_PIDFD and SO_PEERPIDFD |

Commit Message

Aleksandr Mikhalitsyn April 13, 2023, 1:33 p.m. UTC
  Implement SCM_PIDFD, a new CMSG type analogous to SCM_CREDENTIALS,
but it contains a pidfd instead of a plain pid, which allows programmers not
to care about the PID reuse problem.

Idea comes from UAPI kernel group:
https://uapi-group.org/kernel-features/

Big thanks to Christian Brauner and Lennart Poettering for productive
discussions about this.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: David Ahern <dsahern@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Luca Boccassi <bluca@debian.org>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Tested-by: Luca Boccassi <bluca@debian.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
---
v4:
	- fixed silent fd_install if writing of CMSG to the userspace fails (pointed out by Christian)
v2:
	According to review comments from Kuniyuki Iwashima and Christian Brauner:
	- use pidfd_create(..) retval as a result
	- whitespace change
---
 arch/alpha/include/uapi/asm/socket.h    |  2 ++
 arch/mips/include/uapi/asm/socket.h     |  2 ++
 arch/parisc/include/uapi/asm/socket.h   |  2 ++
 arch/sparc/include/uapi/asm/socket.h    |  2 ++
 include/linux/net.h                     |  1 +
 include/linux/socket.h                  |  1 +
 include/net/scm.h                       | 39 +++++++++++++++++++++++--
 include/uapi/asm-generic/socket.h       |  2 ++
 net/core/sock.c                         | 11 +++++++
 net/mptcp/sockopt.c                     |  1 +
 net/unix/af_unix.c                      | 18 ++++++++----
 tools/include/uapi/asm-generic/socket.h |  2 ++
 12 files changed, 76 insertions(+), 7 deletions(-)
  

Comments

Christian Brauner April 17, 2023, 3:18 p.m. UTC | #1
On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS,
> but it contains pidfd instead of plain pid, which allows programmers not
> to care about PID reuse problem.
> 
> Idea comes from UAPI kernel group:
> https://uapi-group.org/kernel-features/
> 
> Big thanks to Christian Brauner and Lennart Poettering for productive
> discussions about this.
> 
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Paolo Abeni <pabeni@redhat.com>
> Cc: Leon Romanovsky <leon@kernel.org>
> Cc: David Ahern <dsahern@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> Cc: Lennart Poettering <mzxreary@0pointer.de>
> Cc: Luca Boccassi <bluca@debian.org>
> Cc: linux-kernel@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Cc: linux-arch@vger.kernel.org
> Tested-by: Luca Boccassi <bluca@debian.org>
> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> ---
> v4:
> 	- fixed silent fd_install if writting of CMSG to the userspace fails (pointed by Christian)
> v2:
> 	According to review comments from Kuniyuki Iwashima and Christian Brauner:
> 	- use pidfd_create(..) retval as a result
> 	- whitespace change
> ---
>  arch/alpha/include/uapi/asm/socket.h    |  2 ++
>  arch/mips/include/uapi/asm/socket.h     |  2 ++
>  arch/parisc/include/uapi/asm/socket.h   |  2 ++
>  arch/sparc/include/uapi/asm/socket.h    |  2 ++
>  include/linux/net.h                     |  1 +
>  include/linux/socket.h                  |  1 +
>  include/net/scm.h                       | 39 +++++++++++++++++++++++--
>  include/uapi/asm-generic/socket.h       |  2 ++
>  net/core/sock.c                         | 11 +++++++
>  net/mptcp/sockopt.c                     |  1 +
>  net/unix/af_unix.c                      | 18 ++++++++----
>  tools/include/uapi/asm-generic/socket.h |  2 ++
>  12 files changed, 76 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index 739891b94136..ff310613ae64 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -137,6 +137,8 @@
>  
>  #define SO_RCVMARK		75
>  
> +#define SO_PASSPIDFD		76
> +
>  #if !defined(__KERNEL__)
>  
>  #if __BITS_PER_LONG == 64
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index 18f3d95ecfec..762dcb80e4ec 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -148,6 +148,8 @@
>  
>  #define SO_RCVMARK		75
>  
> +#define SO_PASSPIDFD		76
> +
>  #if !defined(__KERNEL__)
>  
>  #if __BITS_PER_LONG == 64
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index f486d3dfb6bb..df16a3e16d64 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -129,6 +129,8 @@
>  
>  #define SO_RCVMARK		0x4049
>  
> +#define SO_PASSPIDFD		0x404A
> +
>  #if !defined(__KERNEL__)
>  
>  #if __BITS_PER_LONG == 64
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 2fda57a3ea86..6e2847804fea 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -130,6 +130,8 @@
>  
>  #define SO_RCVMARK               0x0054
>  
> +#define SO_PASSPIDFD             0x0055
> +
>  #if !defined(__KERNEL__)
>  
>  
> diff --git a/include/linux/net.h b/include/linux/net.h
> index b73ad8e3c212..c234dfbe7a30 100644
> --- a/include/linux/net.h
> +++ b/include/linux/net.h
> @@ -43,6 +43,7 @@ struct net;
>  #define SOCK_PASSSEC		4
>  #define SOCK_SUPPORT_ZC		5
>  #define SOCK_CUSTOM_SOCKOPT	6
> +#define SOCK_PASSPIDFD		7
>  
>  #ifndef ARCH_HAS_SOCKET_TYPES
>  /**
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 13c3a237b9c9..6bf90f251910 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg)
>  #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
>  #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
>  #define SCM_SECURITY	0x03		/* rw: security label		*/
> +#define SCM_PIDFD	0x04		/* ro: pidfd (int)		*/
>  
>  struct ucred {
>  	__u32	pid;
> diff --git a/include/net/scm.h b/include/net/scm.h
> index 585adc1346bd..c67f765a165b 100644
> --- a/include/net/scm.h
> +++ b/include/net/scm.h
> @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock)
>  }
>  #endif /* CONFIG_SECURITY_NETWORK */
>  
> +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
> +{
> +	struct file *pidfd_file = NULL;
> +	int pidfd;
> +
> +	/*
> +	 * put_cmsg() doesn't return an error if CMSG is truncated,
> +	 * that's why we need to opencode these checks here.
> +	 */
> +	if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
> +	    (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
> +		msg->msg_flags |= MSG_CTRUNC;
> +		return;

Hm, curious about this: We mark the message as truncated for SCM_PIDFD
but if the same conditions were to apply for SCM_PASSCRED we don't mark
the message as truncated. Am I reading this correctly? And if so, could
you please briefly explain this difference?

> +	}
> +
> +	WARN_ON_ONCE(!scm->pid);
> +	pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
> +
> +	if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {

If the put_cmsg() of the pidfd fails userspace needs to be able to
detect this. Otherwise they can't distinguish between the SCM_PIDFD
value being zero because the put_cmsg() failed or put_cmsg() succeeded
and the allocated fd nr was 0.

Looking at put_cmsg() it looks to me that userspace will receive a
SCM_PIDFD message only if the put_cmsg() is completely successful. IIUC,
then this change is fine.
  
Aleksandr Mikhalitsyn April 17, 2023, 4:01 p.m. UTC | #2
On Mon, Apr 17, 2023 at 5:18 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> > Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS,
> > but it contains pidfd instead of plain pid, which allows programmers not
> > to care about PID reuse problem.
> >
> > Idea comes from UAPI kernel group:
> > https://uapi-group.org/kernel-features/
> >
> > Big thanks to Christian Brauner and Lennart Poettering for productive
> > discussions about this.
> >
> > Cc: "David S. Miller" <davem@davemloft.net>
> > Cc: Eric Dumazet <edumazet@google.com>
> > Cc: Jakub Kicinski <kuba@kernel.org>
> > Cc: Paolo Abeni <pabeni@redhat.com>
> > Cc: Leon Romanovsky <leon@kernel.org>
> > Cc: David Ahern <dsahern@kernel.org>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Christian Brauner <brauner@kernel.org>
> > Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> > Cc: Lennart Poettering <mzxreary@0pointer.de>
> > Cc: Luca Boccassi <bluca@debian.org>
> > Cc: linux-kernel@vger.kernel.org
> > Cc: netdev@vger.kernel.org
> > Cc: linux-arch@vger.kernel.org
> > Tested-by: Luca Boccassi <bluca@debian.org>
> > Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> > ---
> > v4:
> >       - fixed silent fd_install if writting of CMSG to the userspace fails (pointed by Christian)
> > v2:
> >       According to review comments from Kuniyuki Iwashima and Christian Brauner:
> >       - use pidfd_create(..) retval as a result
> >       - whitespace change
> > ---
> >  arch/alpha/include/uapi/asm/socket.h    |  2 ++
> >  arch/mips/include/uapi/asm/socket.h     |  2 ++
> >  arch/parisc/include/uapi/asm/socket.h   |  2 ++
> >  arch/sparc/include/uapi/asm/socket.h    |  2 ++
> >  include/linux/net.h                     |  1 +
> >  include/linux/socket.h                  |  1 +
> >  include/net/scm.h                       | 39 +++++++++++++++++++++++--
> >  include/uapi/asm-generic/socket.h       |  2 ++
> >  net/core/sock.c                         | 11 +++++++
> >  net/mptcp/sockopt.c                     |  1 +
> >  net/unix/af_unix.c                      | 18 ++++++++----
> >  tools/include/uapi/asm-generic/socket.h |  2 ++
> >  12 files changed, 76 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> > index 739891b94136..ff310613ae64 100644
> > --- a/arch/alpha/include/uapi/asm/socket.h
> > +++ b/arch/alpha/include/uapi/asm/socket.h
> > @@ -137,6 +137,8 @@
> >
> >  #define SO_RCVMARK           75
> >
> > +#define SO_PASSPIDFD         76
> > +
> >  #if !defined(__KERNEL__)
> >
> >  #if __BITS_PER_LONG == 64
> > diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> > index 18f3d95ecfec..762dcb80e4ec 100644
> > --- a/arch/mips/include/uapi/asm/socket.h
> > +++ b/arch/mips/include/uapi/asm/socket.h
> > @@ -148,6 +148,8 @@
> >
> >  #define SO_RCVMARK           75
> >
> > +#define SO_PASSPIDFD         76
> > +
> >  #if !defined(__KERNEL__)
> >
> >  #if __BITS_PER_LONG == 64
> > diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> > index f486d3dfb6bb..df16a3e16d64 100644
> > --- a/arch/parisc/include/uapi/asm/socket.h
> > +++ b/arch/parisc/include/uapi/asm/socket.h
> > @@ -129,6 +129,8 @@
> >
> >  #define SO_RCVMARK           0x4049
> >
> > +#define SO_PASSPIDFD         0x404A
> > +
> >  #if !defined(__KERNEL__)
> >
> >  #if __BITS_PER_LONG == 64
> > diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> > index 2fda57a3ea86..6e2847804fea 100644
> > --- a/arch/sparc/include/uapi/asm/socket.h
> > +++ b/arch/sparc/include/uapi/asm/socket.h
> > @@ -130,6 +130,8 @@
> >
> >  #define SO_RCVMARK               0x0054
> >
> > +#define SO_PASSPIDFD             0x0055
> > +
> >  #if !defined(__KERNEL__)
> >
> >
> > diff --git a/include/linux/net.h b/include/linux/net.h
> > index b73ad8e3c212..c234dfbe7a30 100644
> > --- a/include/linux/net.h
> > +++ b/include/linux/net.h
> > @@ -43,6 +43,7 @@ struct net;
> >  #define SOCK_PASSSEC         4
> >  #define SOCK_SUPPORT_ZC              5
> >  #define SOCK_CUSTOM_SOCKOPT  6
> > +#define SOCK_PASSPIDFD               7
> >
> >  #ifndef ARCH_HAS_SOCKET_TYPES
> >  /**
> > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > index 13c3a237b9c9..6bf90f251910 100644
> > --- a/include/linux/socket.h
> > +++ b/include/linux/socket.h
> > @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg)
> >  #define      SCM_RIGHTS      0x01            /* rw: access rights (array of int) */
> >  #define SCM_CREDENTIALS 0x02         /* rw: struct ucred             */
> >  #define SCM_SECURITY 0x03            /* rw: security label           */
> > +#define SCM_PIDFD    0x04            /* ro: pidfd (int)              */
> >
> >  struct ucred {
> >       __u32   pid;
> > diff --git a/include/net/scm.h b/include/net/scm.h
> > index 585adc1346bd..c67f765a165b 100644
> > --- a/include/net/scm.h
> > +++ b/include/net/scm.h
> > @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock)
> >  }
> >  #endif /* CONFIG_SECURITY_NETWORK */
> >
> > +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
> > +{
> > +     struct file *pidfd_file = NULL;
> > +     int pidfd;
> > +
> > +     /*
> > +      * put_cmsg() doesn't return an error if CMSG is truncated,
> > +      * that's why we need to opencode these checks here.
> > +      */
> > +     if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
> > +         (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
> > +             msg->msg_flags |= MSG_CTRUNC;
> > +             return;
>
> Hm, curious about this: We mark the message as truncated for SCM_PIDFD
> but if the same conditions were to apply for SCM_PASSCRED we don't mark
> the message as truncated. Am I reading this correct? And is so, you
> please briefly explain this difference?

Hi, Christian!

For SCM_CREDENTIALS we mark it too. Inside the put_cmsg function:
https://github.com/torvalds/linux/blob/6a8f57ae2eb07ab39a6f0ccad60c760743051026/net/core/scm.c#L225

The reason why I'm open-coding these checks is that I want to know
that the message doesn't fit into the userspace buffer before doing
pidfd_prepare and other stuff. put_cmsg does not return an error when
the message doesn't fit in the userspace buffer, so without these
checks we wouldn't be able to properly do the pidfd cleanup (put the
struct pid and the fd index).

>
> > +     }
> > +
> > +     WARN_ON_ONCE(!scm->pid);
> > +     pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
> > +
> > +     if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
>
> If the put_cmsg() of the pidfd fails userspace needs to be able to
> detect this. Otherwise they can't distinguish between the SCM_PIDFD
> value being zero because the put_cmsg() failed or put_cmsg() succeeded
> and the allocated fd nr was 0.

If pidfd_prepare fails then userspace will receive an SCM_PIDFD message
with a negative pidfd value.

>
> Looking at put_cmsg() it looks to me that userspace will receive a
> SCM_PIDFD message only if the put_cmsg() is completely successful. IIUC,
> then this change is fine.

Kind regards,
Alex
  
Christian Brauner April 17, 2023, 5:16 p.m. UTC | #3
On Mon, Apr 17, 2023 at 06:01:16PM +0200, Aleksandr Mikhalitsyn wrote:
> On Mon, Apr 17, 2023 at 5:18 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> > > Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS,
> > > but it contains pidfd instead of plain pid, which allows programmers not
> > > to care about PID reuse problem.
> > >
> > > Idea comes from UAPI kernel group:
> > > https://uapi-group.org/kernel-features/
> > >
> > > Big thanks to Christian Brauner and Lennart Poettering for productive
> > > discussions about this.
> > >
> > > Cc: "David S. Miller" <davem@davemloft.net>
> > > Cc: Eric Dumazet <edumazet@google.com>
> > > Cc: Jakub Kicinski <kuba@kernel.org>
> > > Cc: Paolo Abeni <pabeni@redhat.com>
> > > Cc: Leon Romanovsky <leon@kernel.org>
> > > Cc: David Ahern <dsahern@kernel.org>
> > > Cc: Arnd Bergmann <arnd@arndb.de>
> > > Cc: Kees Cook <keescook@chromium.org>
> > > Cc: Christian Brauner <brauner@kernel.org>
> > > Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > Cc: Lennart Poettering <mzxreary@0pointer.de>
> > > Cc: Luca Boccassi <bluca@debian.org>
> > > Cc: linux-kernel@vger.kernel.org
> > > Cc: netdev@vger.kernel.org
> > > Cc: linux-arch@vger.kernel.org
> > > Tested-by: Luca Boccassi <bluca@debian.org>
> > > Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> > > ---
> > > v4:
> > >       - fixed silent fd_install if writting of CMSG to the userspace fails (pointed by Christian)
> > > v2:
> > >       According to review comments from Kuniyuki Iwashima and Christian Brauner:
> > >       - use pidfd_create(..) retval as a result
> > >       - whitespace change
> > > ---
> > >  arch/alpha/include/uapi/asm/socket.h    |  2 ++
> > >  arch/mips/include/uapi/asm/socket.h     |  2 ++
> > >  arch/parisc/include/uapi/asm/socket.h   |  2 ++
> > >  arch/sparc/include/uapi/asm/socket.h    |  2 ++
> > >  include/linux/net.h                     |  1 +
> > >  include/linux/socket.h                  |  1 +
> > >  include/net/scm.h                       | 39 +++++++++++++++++++++++--
> > >  include/uapi/asm-generic/socket.h       |  2 ++
> > >  net/core/sock.c                         | 11 +++++++
> > >  net/mptcp/sockopt.c                     |  1 +
> > >  net/unix/af_unix.c                      | 18 ++++++++----
> > >  tools/include/uapi/asm-generic/socket.h |  2 ++
> > >  12 files changed, 76 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> > > index 739891b94136..ff310613ae64 100644
> > > --- a/arch/alpha/include/uapi/asm/socket.h
> > > +++ b/arch/alpha/include/uapi/asm/socket.h
> > > @@ -137,6 +137,8 @@
> > >
> > >  #define SO_RCVMARK           75
> > >
> > > +#define SO_PASSPIDFD         76
> > > +
> > >  #if !defined(__KERNEL__)
> > >
> > >  #if __BITS_PER_LONG == 64
> > > diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> > > index 18f3d95ecfec..762dcb80e4ec 100644
> > > --- a/arch/mips/include/uapi/asm/socket.h
> > > +++ b/arch/mips/include/uapi/asm/socket.h
> > > @@ -148,6 +148,8 @@
> > >
> > >  #define SO_RCVMARK           75
> > >
> > > +#define SO_PASSPIDFD         76
> > > +
> > >  #if !defined(__KERNEL__)
> > >
> > >  #if __BITS_PER_LONG == 64
> > > diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> > > index f486d3dfb6bb..df16a3e16d64 100644
> > > --- a/arch/parisc/include/uapi/asm/socket.h
> > > +++ b/arch/parisc/include/uapi/asm/socket.h
> > > @@ -129,6 +129,8 @@
> > >
> > >  #define SO_RCVMARK           0x4049
> > >
> > > +#define SO_PASSPIDFD         0x404A
> > > +
> > >  #if !defined(__KERNEL__)
> > >
> > >  #if __BITS_PER_LONG == 64
> > > diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> > > index 2fda57a3ea86..6e2847804fea 100644
> > > --- a/arch/sparc/include/uapi/asm/socket.h
> > > +++ b/arch/sparc/include/uapi/asm/socket.h
> > > @@ -130,6 +130,8 @@
> > >
> > >  #define SO_RCVMARK               0x0054
> > >
> > > +#define SO_PASSPIDFD             0x0055
> > > +
> > >  #if !defined(__KERNEL__)
> > >
> > >
> > > diff --git a/include/linux/net.h b/include/linux/net.h
> > > index b73ad8e3c212..c234dfbe7a30 100644
> > > --- a/include/linux/net.h
> > > +++ b/include/linux/net.h
> > > @@ -43,6 +43,7 @@ struct net;
> > >  #define SOCK_PASSSEC         4
> > >  #define SOCK_SUPPORT_ZC              5
> > >  #define SOCK_CUSTOM_SOCKOPT  6
> > > +#define SOCK_PASSPIDFD               7
> > >
> > >  #ifndef ARCH_HAS_SOCKET_TYPES
> > >  /**
> > > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > > index 13c3a237b9c9..6bf90f251910 100644
> > > --- a/include/linux/socket.h
> > > +++ b/include/linux/socket.h
> > > @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg)
> > >  #define      SCM_RIGHTS      0x01            /* rw: access rights (array of int) */
> > >  #define SCM_CREDENTIALS 0x02         /* rw: struct ucred             */
> > >  #define SCM_SECURITY 0x03            /* rw: security label           */
> > > +#define SCM_PIDFD    0x04            /* ro: pidfd (int)              */
> > >
> > >  struct ucred {
> > >       __u32   pid;
> > > diff --git a/include/net/scm.h b/include/net/scm.h
> > > index 585adc1346bd..c67f765a165b 100644
> > > --- a/include/net/scm.h
> > > +++ b/include/net/scm.h
> > > @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock)
> > >  }
> > >  #endif /* CONFIG_SECURITY_NETWORK */
> > >
> > > +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
> > > +{
> > > +     struct file *pidfd_file = NULL;
> > > +     int pidfd;
> > > +
> > > +     /*
> > > +      * put_cmsg() doesn't return an error if CMSG is truncated,
> > > +      * that's why we need to opencode these checks here.
> > > +      */
> > > +     if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
> > > +         (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
> > > +             msg->msg_flags |= MSG_CTRUNC;
> > > +             return;
> >
> > Hm, curious about this: We mark the message as truncated for SCM_PIDFD
> > but if the same conditions were to apply for SCM_PASSCRED we don't mark
> > the message as truncated. Am I reading this correct? And is so, you
> > please briefly explain this difference?
> 
> Hi, Christian!
> 
> For SCM_CREDENTIALS we mark it too. Inside the put_cmsg function:
> https://github.com/torvalds/linux/blob/6a8f57ae2eb07ab39a6f0ccad60c760743051026/net/core/scm.c#L225
> 
> The reason why I'm open-coding these checks is that I want to know
> that the message
> doesn't fit into the userspace buffer before doing pidfd_prepare and
> other stuff and because
> put_cmsg is not returning an error when message doesn't fit in the
> userspace buffer and
> we won't be able to properly do pidfd cleanup (put struct pid and fd index).
> 
> >
> > > +     }
> > > +
> > > +     WARN_ON_ONCE(!scm->pid);
> > > +     pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
> > > +
> > > +     if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
> >
> > If the put_cmsg() of the pidfd fails userspace needs to be able to
> > detect this. Otherwise they can't distinguish between the SCM_PIDFD
> > value being zero because the put_cmsg() failed or put_cmsg() succeeded
> > and the allocated fd nr was 0.
> 
> If pidfd_prepare fails then userspace will receive SCM_PIDFD message
> with negative pidfd value.

So we discussed this a bit offline and I think there's still an issue.
If put_cmsg() fails

          if (msg->msg_control_is_user) {
                  struct cmsghdr __user *cm = msg->msg_control_user;

                  check_object_size(data, cmlen - sizeof(*cm), true);

                  if (!user_write_access_begin(cm, cmlen))
                          goto efault;

		  // This succeeds so cm->cmsg_len == CMSG_LEN(sizeof(int))
                  unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);

		  // This succeeds so cm->cmsg_level == SOL_SOCKET
                  unsafe_put_user(level, &cm->cmsg_level, efault_end);

		  // This succeeds so cm->cmsg_type == SCM_PIDFD
                  unsafe_put_user(type, &cm->cmsg_type, efault_end);

		  // This fails and leaves all bits set to 0
                  unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
                                      cmlen - sizeof(*cm), efault_end);
                  user_write_access_end();

so now we hit

          if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
                  if (pidfd_file) {
                          put_unused_fd(pidfd);
                          fput(pidfd_file);
                  }

                  return;
          }

and return early. Afaict, userspace would now receive:

	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_PIDFD) {
		memcpy(&pidfd, CMSG_DATA(cmsg), sizeof(int));

		// pidfd is now 0 which is a valid fd number
		// it'll likely refer to /dev/stdin or whatever and so
		// will fail or, worst case, 0 refers to another pidfd :)
		pidfd_send_signal(pidfd, SIGKILL);

So we need to address this. One way that I think would solve this is:

diff --git a/net/core/scm.c b/net/core/scm.c
index 3cd7dd377e53..d1f4cd135c5a 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -236,9 +236,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)

                unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
                unsafe_put_user(level, &cm->cmsg_level, efault_end);
-               unsafe_put_user(type, &cm->cmsg_type, efault_end);
                unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
                                    cmlen - sizeof(*cm), efault_end);
+               unsafe_put_user(type, &cm->cmsg_type, efault_end);
                user_write_access_end();
        } else {
                struct cmsghdr *cm = msg->msg_control;

such that we only copy cm->cmsg_type after we transferred the data.
  
Eric Dumazet April 17, 2023, 5:43 p.m. UTC | #4
On Mon, Apr 17, 2023 at 7:16 PM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Apr 17, 2023 at 06:01:16PM +0200, Aleksandr Mikhalitsyn wrote:
> > On Mon, Apr 17, 2023 at 5:18 PM Christian Brauner <brauner@kernel.org> wrote:
> > >
> > > On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> > > > Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS,
> > > > but it contains pidfd instead of plain pid, which allows programmers not
> > > > to care about PID reuse problem.
> > > >
> > > > Idea comes from UAPI kernel group:
> > > > https://uapi-group.org/kernel-features/
> > > >
> > > > Big thanks to Christian Brauner and Lennart Poettering for productive
> > > > discussions about this.
> > > >
> > > > Cc: "David S. Miller" <davem@davemloft.net>
> > > > Cc: Eric Dumazet <edumazet@google.com>
> > > > Cc: Jakub Kicinski <kuba@kernel.org>
> > > > Cc: Paolo Abeni <pabeni@redhat.com>
> > > > Cc: Leon Romanovsky <leon@kernel.org>
> > > > Cc: David Ahern <dsahern@kernel.org>
> > > > Cc: Arnd Bergmann <arnd@arndb.de>
> > > > Cc: Kees Cook <keescook@chromium.org>
> > > > Cc: Christian Brauner <brauner@kernel.org>
> > > > Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > Cc: Lennart Poettering <mzxreary@0pointer.de>
> > > > Cc: Luca Boccassi <bluca@debian.org>
> > > > Cc: linux-kernel@vger.kernel.org
> > > > Cc: netdev@vger.kernel.org
> > > > Cc: linux-arch@vger.kernel.org
> > > > Tested-by: Luca Boccassi <bluca@debian.org>
> > > > Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> > > > ---
> > > > v4:
> > > >       - fixed silent fd_install if writting of CMSG to the userspace fails (pointed by Christian)
> > > > v2:
> > > >       According to review comments from Kuniyuki Iwashima and Christian Brauner:
> > > >       - use pidfd_create(..) retval as a result
> > > >       - whitespace change
> > > > ---
> > > >  arch/alpha/include/uapi/asm/socket.h    |  2 ++
> > > >  arch/mips/include/uapi/asm/socket.h     |  2 ++
> > > >  arch/parisc/include/uapi/asm/socket.h   |  2 ++
> > > >  arch/sparc/include/uapi/asm/socket.h    |  2 ++
> > > >  include/linux/net.h                     |  1 +
> > > >  include/linux/socket.h                  |  1 +
> > > >  include/net/scm.h                       | 39 +++++++++++++++++++++++--
> > > >  include/uapi/asm-generic/socket.h       |  2 ++
> > > >  net/core/sock.c                         | 11 +++++++
> > > >  net/mptcp/sockopt.c                     |  1 +
> > > >  net/unix/af_unix.c                      | 18 ++++++++----
> > > >  tools/include/uapi/asm-generic/socket.h |  2 ++
> > > >  12 files changed, 76 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> > > > index 739891b94136..ff310613ae64 100644
> > > > --- a/arch/alpha/include/uapi/asm/socket.h
> > > > +++ b/arch/alpha/include/uapi/asm/socket.h
> > > > @@ -137,6 +137,8 @@
> > > >
> > > >  #define SO_RCVMARK           75
> > > >
> > > > +#define SO_PASSPIDFD         76
> > > > +
> > > >  #if !defined(__KERNEL__)
> > > >
> > > >  #if __BITS_PER_LONG == 64
> > > > diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> > > > index 18f3d95ecfec..762dcb80e4ec 100644
> > > > --- a/arch/mips/include/uapi/asm/socket.h
> > > > +++ b/arch/mips/include/uapi/asm/socket.h
> > > > @@ -148,6 +148,8 @@
> > > >
> > > >  #define SO_RCVMARK           75
> > > >
> > > > +#define SO_PASSPIDFD         76
> > > > +
> > > >  #if !defined(__KERNEL__)
> > > >
> > > >  #if __BITS_PER_LONG == 64
> > > > diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> > > > index f486d3dfb6bb..df16a3e16d64 100644
> > > > --- a/arch/parisc/include/uapi/asm/socket.h
> > > > +++ b/arch/parisc/include/uapi/asm/socket.h
> > > > @@ -129,6 +129,8 @@
> > > >
> > > >  #define SO_RCVMARK           0x4049
> > > >
> > > > +#define SO_PASSPIDFD         0x404A
> > > > +
> > > >  #if !defined(__KERNEL__)
> > > >
> > > >  #if __BITS_PER_LONG == 64
> > > > diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> > > > index 2fda57a3ea86..6e2847804fea 100644
> > > > --- a/arch/sparc/include/uapi/asm/socket.h
> > > > +++ b/arch/sparc/include/uapi/asm/socket.h
> > > > @@ -130,6 +130,8 @@
> > > >
> > > >  #define SO_RCVMARK               0x0054
> > > >
> > > > +#define SO_PASSPIDFD             0x0055
> > > > +
> > > >  #if !defined(__KERNEL__)
> > > >
> > > >
> > > > diff --git a/include/linux/net.h b/include/linux/net.h
> > > > index b73ad8e3c212..c234dfbe7a30 100644
> > > > --- a/include/linux/net.h
> > > > +++ b/include/linux/net.h
> > > > @@ -43,6 +43,7 @@ struct net;
> > > >  #define SOCK_PASSSEC         4
> > > >  #define SOCK_SUPPORT_ZC              5
> > > >  #define SOCK_CUSTOM_SOCKOPT  6
> > > > +#define SOCK_PASSPIDFD               7
> > > >
> > > >  #ifndef ARCH_HAS_SOCKET_TYPES
> > > >  /**
> > > > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > > > index 13c3a237b9c9..6bf90f251910 100644
> > > > --- a/include/linux/socket.h
> > > > +++ b/include/linux/socket.h
> > > > @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg)
> > > >  #define      SCM_RIGHTS      0x01            /* rw: access rights (array of int) */
> > > >  #define SCM_CREDENTIALS 0x02         /* rw: struct ucred             */
> > > >  #define SCM_SECURITY 0x03            /* rw: security label           */
> > > > +#define SCM_PIDFD    0x04            /* ro: pidfd (int)              */
> > > >
> > > >  struct ucred {
> > > >       __u32   pid;
> > > > diff --git a/include/net/scm.h b/include/net/scm.h
> > > > index 585adc1346bd..c67f765a165b 100644
> > > > --- a/include/net/scm.h
> > > > +++ b/include/net/scm.h
> > > > @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock)
> > > >  }
> > > >  #endif /* CONFIG_SECURITY_NETWORK */
> > > >
> > > > +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
> > > > +{
> > > > +     struct file *pidfd_file = NULL;
> > > > +     int pidfd;
> > > > +
> > > > +     /*
> > > > +      * put_cmsg() doesn't return an error if CMSG is truncated,
> > > > +      * that's why we need to opencode these checks here.
> > > > +      */
> > > > +     if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
> > > > +         (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
> > > > +             msg->msg_flags |= MSG_CTRUNC;
> > > > +             return;
> > >
> > > Hm, curious about this: We mark the message as truncated for SCM_PIDFD
> > > but if the same conditions were to apply for SCM_CREDENTIALS we don't mark
> > > the message as truncated. Am I reading this correctly? And if so, could you
> > > please briefly explain this difference?
> >
> > Hi, Christian!
> >
> > For SCM_CREDENTIALS we mark it too. Inside the put_cmsg function:
> > https://github.com/torvalds/linux/blob/6a8f57ae2eb07ab39a6f0ccad60c760743051026/net/core/scm.c#L225
> >
> > The reason I'm open-coding these checks is that I want to know whether
> > the message fits into the userspace buffer before calling pidfd_prepare()
> > and doing the other work. put_cmsg() does not return an error when the
> > message doesn't fit in the userspace buffer, so without these checks
> > we wouldn't be able to properly do the pidfd cleanup (put struct pid and
> > fd index).
> >
> > >
> > > > +     }
> > > > +
> > > > +     WARN_ON_ONCE(!scm->pid);
> > > > +     pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
> > > > +
> > > > +     if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
> > >
> > > If the put_cmsg() of the pidfd fails userspace needs to be able to
> > > detect this. Otherwise they can't distinguish between the SCM_PIDFD
> > > value being zero because the put_cmsg() failed or put_cmsg() succeeded
> > > and the allocated fd nr was 0.
> >
> > If pidfd_prepare fails then userspace will receive SCM_PIDFD message
> > with negative pidfd value.
>
> So we discussed this a bit offline and I think there's still an issue.
> If put_cmsg() fails
>
>           if (msg->msg_control_is_user) {
>                   struct cmsghdr __user *cm = msg->msg_control_user;
>
>                   check_object_size(data, cmlen - sizeof(*cm), true);
>
>                   if (!user_write_access_begin(cm, cmlen))
>                           goto efault;
>
>                   // This succeeds so cm->cmsg_len == CMSG_LEN(sizeof(int))
>                   unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
>
>                   // This succeeds so cm->cmsg_level == SOL_SOCKET
>                   unsafe_put_user(level, &cm->cmsg_level, efault_end);
>
>                   // This succeeds so cm->cmsg_type == SCM_PIDFD
>                   unsafe_put_user(type, &cm->cmsg_type, efault_end);
>
>                   // This fails and leaves all bits set to 0
>                   unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
>                                       cmlen - sizeof(*cm), efault_end);
>                   user_write_access_end();
>
> so now we hit
>
>           if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
>                   if (pidfd_file) {
>                           put_unused_fd(pidfd);
>                           fput(pidfd_file);
>                   }
>
>                   return;
>           }
>
> and return early. Afaict, userspace would now receive:
>
>         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
>             cmsg->cmsg_level == SOL_SOCKET &&
>             cmsg->cmsg_type == SCM_PIDFD) {
>                 memcpy(&pidfd, CMSG_DATA(cmsg), sizeof(int));
>
>                 // pidfd is now 0 which is a valid fd number
>                 // it'll likely refer to /dev/stdin or whatever and so
>                 // will fail or, worst case, 0 refers to another pidfd :)
>                 pidfd_send_signal(pidfd, SIGKILL);
>
> so we need to address this. So one way I think that would solve this is:
>
> diff --git a/net/core/scm.c b/net/core/scm.c
> index 3cd7dd377e53..d1f4cd135c5a 100644
> --- a/net/core/scm.c
> +++ b/net/core/scm.c
> @@ -236,9 +236,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
>
>                 unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
>                 unsafe_put_user(level, &cm->cmsg_level, efault_end);
> -               unsafe_put_user(type, &cm->cmsg_type, efault_end);
>                 unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
>                                     cmlen - sizeof(*cm), efault_end);
> +               unsafe_put_user(type, &cm->cmsg_type, efault_end);
>                 user_write_access_end();
>         } else {
>                 struct cmsghdr *cm = msg->msg_control;
>
> such that we only copy cm->cmsg_type after we transferred the data.

This looks wrong to me.

If put_cmsg() returns -EFAULT, then msg->msg_control and
msg->msg_controllen were not changed.

So the user application should not attempt to read this part of the
control buffer; it could contain garbage.
  
Christian Brauner April 18, 2023, 8:16 a.m. UTC | #5
On Mon, Apr 17, 2023 at 07:43:19PM +0200, Eric Dumazet wrote:
> On Mon, Apr 17, 2023 at 7:16 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Mon, Apr 17, 2023 at 06:01:16PM +0200, Aleksandr Mikhalitsyn wrote:
> > > On Mon, Apr 17, 2023 at 5:18 PM Christian Brauner <brauner@kernel.org> wrote:
> > > >
> > > > On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> > > > > Implement SCM_PIDFD, a new type of CMSG type analogical to SCM_CREDENTIALS,
> > > > > but it contains pidfd instead of plain pid, which allows programmers not
> > > > > to care about PID reuse problem.
> > > > >
> > > > > Idea comes from UAPI kernel group:
> > > > > https://uapi-group.org/kernel-features/
> > > > >
> > > > > Big thanks to Christian Brauner and Lennart Poettering for productive
> > > > > discussions about this.
> > > > >
> > > > > Cc: "David S. Miller" <davem@davemloft.net>
> > > > > Cc: Eric Dumazet <edumazet@google.com>
> > > > > Cc: Jakub Kicinski <kuba@kernel.org>
> > > > > Cc: Paolo Abeni <pabeni@redhat.com>
> > > > > Cc: Leon Romanovsky <leon@kernel.org>
> > > > > Cc: David Ahern <dsahern@kernel.org>
> > > > > Cc: Arnd Bergmann <arnd@arndb.de>
> > > > > Cc: Kees Cook <keescook@chromium.org>
> > > > > Cc: Christian Brauner <brauner@kernel.org>
> > > > > Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > Cc: Lennart Poettering <mzxreary@0pointer.de>
> > > > > Cc: Luca Boccassi <bluca@debian.org>
> > > > > Cc: linux-kernel@vger.kernel.org
> > > > > Cc: netdev@vger.kernel.org
> > > > > Cc: linux-arch@vger.kernel.org
> > > > > Tested-by: Luca Boccassi <bluca@debian.org>
> > > > > Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> > > > > Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> > > > > ---
> > > > > v4:
> > > > >       - fixed silent fd_install if writting of CMSG to the userspace fails (pointed by Christian)
> > > > > v2:
> > > > >       According to review comments from Kuniyuki Iwashima and Christian Brauner:
> > > > >       - use pidfd_create(..) retval as a result
> > > > >       - whitespace change
> > > > > ---
> > > > >  arch/alpha/include/uapi/asm/socket.h    |  2 ++
> > > > >  arch/mips/include/uapi/asm/socket.h     |  2 ++
> > > > >  arch/parisc/include/uapi/asm/socket.h   |  2 ++
> > > > >  arch/sparc/include/uapi/asm/socket.h    |  2 ++
> > > > >  include/linux/net.h                     |  1 +
> > > > >  include/linux/socket.h                  |  1 +
> > > > >  include/net/scm.h                       | 39 +++++++++++++++++++++++--
> > > > >  include/uapi/asm-generic/socket.h       |  2 ++
> > > > >  net/core/sock.c                         | 11 +++++++
> > > > >  net/mptcp/sockopt.c                     |  1 +
> > > > >  net/unix/af_unix.c                      | 18 ++++++++----
> > > > >  tools/include/uapi/asm-generic/socket.h |  2 ++
> > > > >  12 files changed, 76 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> > > > > index 739891b94136..ff310613ae64 100644
> > > > > --- a/arch/alpha/include/uapi/asm/socket.h
> > > > > +++ b/arch/alpha/include/uapi/asm/socket.h
> > > > > @@ -137,6 +137,8 @@
> > > > >
> > > > >  #define SO_RCVMARK           75
> > > > >
> > > > > +#define SO_PASSPIDFD         76
> > > > > +
> > > > >  #if !defined(__KERNEL__)
> > > > >
> > > > >  #if __BITS_PER_LONG == 64
> > > > > diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> > > > > index 18f3d95ecfec..762dcb80e4ec 100644
> > > > > --- a/arch/mips/include/uapi/asm/socket.h
> > > > > +++ b/arch/mips/include/uapi/asm/socket.h
> > > > > @@ -148,6 +148,8 @@
> > > > >
> > > > >  #define SO_RCVMARK           75
> > > > >
> > > > > +#define SO_PASSPIDFD         76
> > > > > +
> > > > >  #if !defined(__KERNEL__)
> > > > >
> > > > >  #if __BITS_PER_LONG == 64
> > > > > diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> > > > > index f486d3dfb6bb..df16a3e16d64 100644
> > > > > --- a/arch/parisc/include/uapi/asm/socket.h
> > > > > +++ b/arch/parisc/include/uapi/asm/socket.h
> > > > > @@ -129,6 +129,8 @@
> > > > >
> > > > >  #define SO_RCVMARK           0x4049
> > > > >
> > > > > +#define SO_PASSPIDFD         0x404A
> > > > > +
> > > > >  #if !defined(__KERNEL__)
> > > > >
> > > > >  #if __BITS_PER_LONG == 64
> > > > > diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> > > > > index 2fda57a3ea86..6e2847804fea 100644
> > > > > --- a/arch/sparc/include/uapi/asm/socket.h
> > > > > +++ b/arch/sparc/include/uapi/asm/socket.h
> > > > > @@ -130,6 +130,8 @@
> > > > >
> > > > >  #define SO_RCVMARK               0x0054
> > > > >
> > > > > +#define SO_PASSPIDFD             0x0055
> > > > > +
> > > > >  #if !defined(__KERNEL__)
> > > > >
> > > > >
> > > > > diff --git a/include/linux/net.h b/include/linux/net.h
> > > > > index b73ad8e3c212..c234dfbe7a30 100644
> > > > > --- a/include/linux/net.h
> > > > > +++ b/include/linux/net.h
> > > > > @@ -43,6 +43,7 @@ struct net;
> > > > >  #define SOCK_PASSSEC         4
> > > > >  #define SOCK_SUPPORT_ZC              5
> > > > >  #define SOCK_CUSTOM_SOCKOPT  6
> > > > > +#define SOCK_PASSPIDFD               7
> > > > >
> > > > >  #ifndef ARCH_HAS_SOCKET_TYPES
> > > > >  /**
> > > > > diff --git a/include/linux/socket.h b/include/linux/socket.h
> > > > > index 13c3a237b9c9..6bf90f251910 100644
> > > > > --- a/include/linux/socket.h
> > > > > +++ b/include/linux/socket.h
> > > > > @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg)
> > > > >  #define      SCM_RIGHTS      0x01            /* rw: access rights (array of int) */
> > > > >  #define SCM_CREDENTIALS 0x02         /* rw: struct ucred             */
> > > > >  #define SCM_SECURITY 0x03            /* rw: security label           */
> > > > > +#define SCM_PIDFD    0x04            /* ro: pidfd (int)              */
> > > > >
> > > > >  struct ucred {
> > > > >       __u32   pid;
> > > > > diff --git a/include/net/scm.h b/include/net/scm.h
> > > > > index 585adc1346bd..c67f765a165b 100644
> > > > > --- a/include/net/scm.h
> > > > > +++ b/include/net/scm.h
> > > > > @@ -120,12 +120,44 @@ static inline bool scm_has_secdata(struct socket *sock)
> > > > >  }
> > > > >  #endif /* CONFIG_SECURITY_NETWORK */
> > > > >
> > > > > +static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
> > > > > +{
> > > > > +     struct file *pidfd_file = NULL;
> > > > > +     int pidfd;
> > > > > +
> > > > > +     /*
> > > > > +      * put_cmsg() doesn't return an error if CMSG is truncated,
> > > > > +      * that's why we need to opencode these checks here.
> > > > > +      */
> > > > > +     if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
> > > > > +         (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
> > > > > +             msg->msg_flags |= MSG_CTRUNC;
> > > > > +             return;
> > > >
> > > > Hm, curious about this: We mark the message as truncated for SCM_PIDFD
> > > > but if the same conditions were to apply for SCM_CREDENTIALS we don't mark
> > > > the message as truncated. Am I reading this correctly? And if so, could you
> > > > please briefly explain this difference?
> > >
> > > Hi, Christian!
> > >
> > > For SCM_CREDENTIALS we mark it too. Inside the put_cmsg function:
> > > https://github.com/torvalds/linux/blob/6a8f57ae2eb07ab39a6f0ccad60c760743051026/net/core/scm.c#L225
> > >
> > > The reason why I'm open-coding these checks is that I want to know
> > > that the message
> > > doesn't fit into the userspace buffer before doing pidfd_prepare and
> > > other stuff and because
> > > put_cmsg is not returning an error when message doesn't fit in the
> > > userspace buffer and
> > > we won't be able to properly do pidfd cleanup (put struct pid and fd index).
> > >
> > > >
> > > > > +     }
> > > > > +
> > > > > +     WARN_ON_ONCE(!scm->pid);
> > > > > +     pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
> > > > > +
> > > > > +     if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
> > > >
> > > > If the put_cmsg() of the pidfd fails userspace needs to be able to
> > > > detect this. Otherwise they can't distinguish between the SCM_PIDFD
> > > > value being zero because the put_cmsg() failed or put_cmsg() succeeded
> > > > and the allocated fd nr was 0.
> > >
> > > If pidfd_prepare fails then userspace will receive SCM_PIDFD message
> > > with negative pidfd value.
> >
> > So we discussed this a bit offline and I think there's still an issue.
> > If put_cmsg() fails
> >
> >           if (msg->msg_control_is_user) {
> >                   struct cmsghdr __user *cm = msg->msg_control_user;
> >
> >                   check_object_size(data, cmlen - sizeof(*cm), true);
> >
> >                   if (!user_write_access_begin(cm, cmlen))
> >                           goto efault;
> >
> >                   // This succeeds so cm->cmsg_len == CMSG_LEN(sizeof(int))
> >                   unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
> >
> >                   // This succeeds so cm->cmsg_level == SOL_SOCKET
> >                   unsafe_put_user(level, &cm->cmsg_level, efault_end);
> >
> >                   // This succeeds so cm->cmsg_type == SCM_PIDFD
> >                   unsafe_put_user(type, &cm->cmsg_type, efault_end);
> >
> >                   // This fails and leaves all bits set to 0
> >                   unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
> >                                       cmlen - sizeof(*cm), efault_end);
> >                   user_write_access_end();
> >
> > so now we hit
> >
> >           if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
> >                   if (pidfd_file) {
> >                           put_unused_fd(pidfd);
> >                           fput(pidfd_file);
> >                   }
> >
> >                   return;
> >           }
> >
> > and return early. Afaict, userspace would now receive:
> >
> >         if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
> >             cmsg->cmsg_level == SOL_SOCKET &&
> >             cmsg->cmsg_type == SCM_PIDFD) {
> >                 memcpy(&pidfd, CMSG_DATA(cmsg), sizeof(int));
> >
> >                 // pidfd is now 0 which is a valid fd number
> >                 // it'll likely refer to /dev/stdin or whatever and so
> >                 // will fail or, worst case, 0 refers to another pidfd :)
> >                 pidfd_send_signal(pidfd, SIGKILL);
> >
> > so we need to address this. So one way I think that would solve this is:
> >
> > diff --git a/net/core/scm.c b/net/core/scm.c
> > index 3cd7dd377e53..d1f4cd135c5a 100644
> > --- a/net/core/scm.c
> > +++ b/net/core/scm.c
> > @@ -236,9 +236,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
> >
> >                 unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
> >                 unsafe_put_user(level, &cm->cmsg_level, efault_end);
> > -               unsafe_put_user(type, &cm->cmsg_type, efault_end);
> >                 unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
> >                                     cmlen - sizeof(*cm), efault_end);
> > +               unsafe_put_user(type, &cm->cmsg_type, efault_end);
> >                 user_write_access_end();
> >         } else {
> >                 struct cmsghdr *cm = msg->msg_control;
> >
> > such that we only copy cm->cmsg_type after we transferred the data.
> 
> This looks wrong to me.
> 
> if put_cmsg() returns -EFAULT, then msg->msg_control and
> msg->msg_controllen were not changed.
> 
> So the user application should not attempt to read this part of the
> control buffer, this could contain garbage.

Thanks for the review, Eric. That's reassuring.

I've done a bit of container related networking before but I'm fumbling
my way through the reviews here. So any additional reviews here would be
very helpful.
  
Christian Brauner April 18, 2023, 1:07 p.m. UTC | #6
On Thu, Apr 13, 2023 at 03:33:52PM +0200, Alexander Mikhalitsyn wrote:
> Implement SCM_PIDFD, a new CMSG type analogous to SCM_CREDENTIALS,
> but it contains a pidfd instead of a plain pid, which allows programmers not
> to care about the PID reuse problem.
> 
> Idea comes from UAPI kernel group:
> https://uapi-group.org/kernel-features/
> 
> Big thanks to Christian Brauner and Lennart Poettering for productive
> discussions about this.
> 
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Paolo Abeni <pabeni@redhat.com>
> Cc: Leon Romanovsky <leon@kernel.org>
> Cc: David Ahern <dsahern@kernel.org>
> Cc: Arnd Bergmann <arnd@arndb.de>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Christian Brauner <brauner@kernel.org>
> Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
> Cc: Lennart Poettering <mzxreary@0pointer.de>
> Cc: Luca Boccassi <bluca@debian.org>
> Cc: linux-kernel@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Cc: linux-arch@vger.kernel.org
> Tested-by: Luca Boccassi <bluca@debian.org>
> Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
> ---
> v4:
> 	- fixed silent fd_install if writing of CMSG to userspace fails (pointed out by Christian)

I don't have a lot more to add to this,
Reviewed-by: Christian Brauner <brauner@kernel.org>
  

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 739891b94136..ff310613ae64 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -137,6 +137,8 @@ 
 
 #define SO_RCVMARK		75
 
+#define SO_PASSPIDFD		76
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 18f3d95ecfec..762dcb80e4ec 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -148,6 +148,8 @@ 
 
 #define SO_RCVMARK		75
 
+#define SO_PASSPIDFD		76
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index f486d3dfb6bb..df16a3e16d64 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -129,6 +129,8 @@ 
 
 #define SO_RCVMARK		0x4049
 
+#define SO_PASSPIDFD		0x404A
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 2fda57a3ea86..6e2847804fea 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -130,6 +130,8 @@ 
 
 #define SO_RCVMARK               0x0054
 
+#define SO_PASSPIDFD             0x0055
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/linux/net.h b/include/linux/net.h
index b73ad8e3c212..c234dfbe7a30 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -43,6 +43,7 @@  struct net;
 #define SOCK_PASSSEC		4
 #define SOCK_SUPPORT_ZC		5
 #define SOCK_CUSTOM_SOCKOPT	6
+#define SOCK_PASSPIDFD		7
 
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 13c3a237b9c9..6bf90f251910 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -177,6 +177,7 @@  static inline size_t msg_data_left(struct msghdr *msg)
 #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
 #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
 #define SCM_SECURITY	0x03		/* rw: security label		*/
+#define SCM_PIDFD	0x04		/* ro: pidfd (int)		*/
 
 struct ucred {
 	__u32	pid;
diff --git a/include/net/scm.h b/include/net/scm.h
index 585adc1346bd..c67f765a165b 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -120,12 +120,44 @@  static inline bool scm_has_secdata(struct socket *sock)
 }
 #endif /* CONFIG_SECURITY_NETWORK */
 
+static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+	struct file *pidfd_file = NULL;
+	int pidfd;
+
+	/*
+	 * put_cmsg() doesn't return an error if CMSG is truncated,
+	 * that's why we need to opencode these checks here.
+	 */
+	if ((msg->msg_controllen <= sizeof(struct cmsghdr)) ||
+	    (msg->msg_controllen - sizeof(struct cmsghdr)) < sizeof(int)) {
+		msg->msg_flags |= MSG_CTRUNC;
+		return;
+	}
+
+	WARN_ON_ONCE(!scm->pid);
+	pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
+
+	if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+		if (pidfd_file) {
+			put_unused_fd(pidfd);
+			fput(pidfd_file);
+		}
+
+		return;
+	}
+
+	if (pidfd_file)
+		fd_install(pidfd, pidfd_file);
+}
+
 static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 				struct scm_cookie *scm, int flags)
 {
 	if (!msg->msg_control) {
-		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp ||
-		    scm_has_secdata(sock))
+		if (test_bit(SOCK_PASSCRED, &sock->flags) ||
+		    test_bit(SOCK_PASSPIDFD, &sock->flags) ||
+		    scm->fp || scm_has_secdata(sock))
 			msg->msg_flags |= MSG_CTRUNC;
 		scm_destroy(scm);
 		return;
@@ -141,6 +173,9 @@  static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
 	}
 
+	if (test_bit(SOCK_PASSPIDFD, &sock->flags))
+		scm_pidfd_recv(msg, scm);
+
 	scm_destroy_cred(scm);
 
 	scm_passec(sock, msg, scm);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 638230899e98..b76169fdb80b 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -132,6 +132,8 @@ 
 
 #define SO_RCVMARK		75
 
+#define SO_PASSPIDFD		76
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index c25888795390..3f974246ba3e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1246,6 +1246,13 @@  int sk_setsockopt(struct sock *sk, int level, int optname,
 			clear_bit(SOCK_PASSCRED, &sock->flags);
 		break;
 
+	case SO_PASSPIDFD:
+		if (valbool)
+			set_bit(SOCK_PASSPIDFD, &sock->flags);
+		else
+			clear_bit(SOCK_PASSPIDFD, &sock->flags);
+		break;
+
 	case SO_TIMESTAMP_OLD:
 	case SO_TIMESTAMP_NEW:
 	case SO_TIMESTAMPNS_OLD:
@@ -1737,6 +1744,10 @@  int sk_getsockopt(struct sock *sk, int level, int optname,
 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
 		break;
 
+	case SO_PASSPIDFD:
+		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
+		break;
+
 	case SO_PEERCRED:
 	{
 		struct ucred peercred;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index b655cebda0f3..67be0558862f 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -355,6 +355,7 @@  static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
 	case SO_BROADCAST:
 	case SO_BSDCOMPAT:
 	case SO_PASSCRED:
+	case SO_PASSPIDFD:
 	case SO_PASSSEC:
 	case SO_RXQ_OVFL:
 	case SO_WIFI_STATUS:
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fb31e8a4409e..6d5dff4dfe83 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1361,7 +1361,8 @@  static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		if (err)
 			goto out;
 
-		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
+		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
+		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
 		    !unix_sk(sk)->addr) {
 			err = unix_autobind(sk);
 			if (err)
@@ -1469,7 +1470,8 @@  static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (err)
 		goto out;
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
+	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
 		err = unix_autobind(sk);
 		if (err)
 			goto out;
@@ -1670,6 +1672,8 @@  static void unix_sock_inherit_flags(const struct socket *old,
 {
 	if (test_bit(SOCK_PASSCRED, &old->flags))
 		set_bit(SOCK_PASSCRED, &new->flags);
+	if (test_bit(SOCK_PASSPIDFD, &old->flags))
+		set_bit(SOCK_PASSPIDFD, &new->flags);
 	if (test_bit(SOCK_PASSSEC, &old->flags))
 		set_bit(SOCK_PASSSEC, &new->flags);
 }
@@ -1819,8 +1823,10 @@  static bool unix_passcred_enabled(const struct socket *sock,
 				  const struct sock *other)
 {
 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
+	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
 	       !other->sk_socket ||
-	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
+	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
+	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
 }
 
 /*
@@ -1922,7 +1928,8 @@  static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			goto out;
 	}
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
+	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
+	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
 		err = unix_autobind(sk);
 		if (err)
 			goto out;
@@ -2824,7 +2831,8 @@  static int unix_stream_read_generic(struct unix_stream_read_state *state,
 			/* Never glue messages from different writers */
 			if (!unix_skb_scm_eq(skb, &scm))
 				break;
-		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
+		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
+			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
 			/* Copy credentials */
 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
 			unix_set_secdata(&scm, skb);
diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h
index 8756df13be50..fbbc4bf53ee3 100644
--- a/tools/include/uapi/asm-generic/socket.h
+++ b/tools/include/uapi/asm-generic/socket.h
@@ -121,6 +121,8 @@ 
 
 #define SO_RCVMARK		75
 
+#define SO_PASSPIDFD		76
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))