[v2] pidfd: implement PIDFD_THREAD flag for pidfd_open()

Message ID 20240130112638.GA29773@redhat.com
State New
Headers
Series [v2] pidfd: implement PIDFD_THREAD flag for pidfd_open() |

Commit Message

Oleg Nesterov Jan. 30, 2024, 11:26 a.m. UTC
  With this flag:

	- pidfd_open() doesn't require that the target task must be
	  a thread-group leader

	- pidfd_poll() succeeds when the task exits and becomes a
	  zombie (iow, passes exit_notify()), even if it is a leader
	  and thread-group is not empty.

thread_group_exited() is no longer used, perhaps it can die.

Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/uapi/linux/pidfd.h |  3 ++-
 kernel/exit.c              |  7 +++++++
 kernel/fork.c              | 38 +++++++++++++++++++++++++++++++-------
 kernel/pid.c               | 14 +++-----------
 kernel/signal.c            |  4 +++-
 5 files changed, 46 insertions(+), 20 deletions(-)
  

Comments

Oleg Nesterov Jan. 30, 2024, 11:34 a.m. UTC | #1
Damn. Self-NACK.

I forgot (we all ;) about mt-exec, and there are 2 problems.

1. The "if (!thread_group_leader(tsk))" block in de_thread() needs
   do_notify_pidfd() too, the execing non-leader thread looses its
   old pid, pidfd_poll(PIDFD_THREAD, pid-of-execing-sub-thread)
   should succeed. Must be fixed, I think.

2. pidfd_poll(PIDFD_THREAD, pid-of-group-leader) should not succeed
   when its sub-thread execs, the execing thread inherits the leader's
   pid. Perhaps pidfd_task_exited() can check sig->group_exec_task,
   but:

   OTOH. do we really care? Group leader can exit and become a
   zombie. Then another thread can exec. Perhaps we can simply
   emphasize that if you use pidfd_open(leader-pid, PIDFD_THREAD)
   then you shouldn't expect that after poll() this pid can't be
   "vivified" again? I don't think this can be "fixed".

I'll try to think more. We don't want to take tasklist in poll...

Oleg.

On 01/30, Oleg Nesterov wrote:
>
> With this flag:
> 
> 	- pidfd_open() doesn't require that the target task must be
> 	  a thread-group leader
> 
> 	- pidfd_poll() succeeds when the task exits and becomes a
> 	  zombie (iow, passes exit_notify()), even if it is a leader
> 	  and thread-group is not empty.
> 
> thread_group_exited() is no longer used, perhaps it can die.
> 
> Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
>  include/uapi/linux/pidfd.h |  3 ++-
>  kernel/exit.c              |  7 +++++++
>  kernel/fork.c              | 38 +++++++++++++++++++++++++++++++-------
>  kernel/pid.c               | 14 +++-----------
>  kernel/signal.c            |  4 +++-
>  5 files changed, 46 insertions(+), 20 deletions(-)
> 
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 5406fbc13074..2e6461459877 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -7,6 +7,7 @@
>  #include <linux/fcntl.h>
>  
>  /* Flags for pidfd_open().  */
> -#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_NONBLOCK	O_NONBLOCK
> +#define PIDFD_THREAD	O_EXCL
>  
>  #endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index dfb963d2f862..493647fd7c07 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
>  		kill_orphaned_pgrp(tsk->group_leader, NULL);
>  
>  	tsk->exit_state = EXIT_ZOMBIE;
> +	/*
> +	 * sub-thread or delay_group_leader(), wake up the
> +	 * PIDFD_THREAD waiters.
> +	 */
> +	if (!thread_group_empty(tsk))
> +		do_notify_pidfd(tsk);
> +
>  	if (unlikely(tsk->ptrace)) {
>  		int sig = thread_group_leader(tsk) &&
>  				thread_group_empty(tsk) &&
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 347641398f9d..bea32071fff2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -101,6 +101,7 @@
>  #include <linux/user_events.h>
>  #include <linux/iommu.h>
>  #include <linux/rseq.h>
> +#include <uapi/linux/pidfd.h>
>  
>  #include <asm/pgalloc.h>
>  #include <linux/uaccess.h>
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>  
>  	seq_put_decimal_ll(m, "Pid:\t", nr);
>  
> +	/* TODO: report PIDFD_THREAD */
> +
>  #ifdef CONFIG_PID_NS
>  	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
>  	if (nr > 0) {
> @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>  }
>  #endif
>  
> +static bool pidfd_task_exited(struct pid *pid, bool thread)
> +{
> +	struct task_struct *task;
> +	bool exited;
> +
> +	rcu_read_lock();
> +	task = pid_task(pid, PIDTYPE_PID);
> +	exited = !task ||
> +		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
> +	rcu_read_unlock();
> +
> +	return exited;
> +}
> +
>  /*
>   * Poll support for process exit notification.
>   */
>  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  {
>  	struct pid *pid = file->private_data;
> +	bool thread = file->f_flags & PIDFD_THREAD;
>  	__poll_t poll_flags = 0;
>  
>  	poll_wait(file, &pid->wait_pidfd, pts);
> -
>  	/*
> -	 * Inform pollers only when the whole thread group exits.
> -	 * If the thread group leader exits before all other threads in the
> -	 * group, then poll(2) should block, similar to the wait(2) family.
> +	 * Depending on PIDFD_THREAD, inform pollers when the thread
> +	 * or the whole thread-group exits.
>  	 */
> -	if (thread_group_exited(pid))
> +	if (pidfd_task_exited(pid, thread))
>  		poll_flags = EPOLLIN | EPOLLRDNORM;
>  
>  	return poll_flags;
> @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
>  		return PTR_ERR(pidfd_file);
>  	}
>  	get_pid(pid); /* held by pidfd_file now */
> +	/*
> +	 * anon_inode_getfile() ignores everything outside of the
> +	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
> +	 */
> +	pidfd_file->f_flags |= (flags & PIDFD_THREAD);
>  	*ret = pidfd_file;
>  	return pidfd;
>  }
> @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
>   * Allocate a new file that stashes @pid and reserve a new pidfd number in the
>   * caller's file descriptor table. The pidfd is reserved but not installed yet.
>   *
> - * The helper verifies that @pid is used as a thread group leader.
> + * The helper verifies that @pid is still in use, without PIDFD_THREAD the
> + * task identified by @pid must be a thread-group leader.
>   *
>   * If this function returns successfully the caller is responsible to either
>   * call fd_install() passing the returned pidfd and pidfd file as arguments in
> @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
>   */
>  int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
>  {
> -	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> +	bool thread = flags & PIDFD_THREAD;
> +
> +	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
>  		return -EINVAL;
>  
>  	return __pidfd_prepare(pid, flags, ret);
> diff --git a/kernel/pid.c b/kernel/pid.c
> index c7a3e359f8f5..e11144466828 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
>   * Return the task associated with @pidfd. The function takes a reference on
>   * the returned task. The caller is responsible for releasing that reference.
>   *
> - * Currently, the process identified by @pidfd is always a thread-group leader.
> - * This restriction currently exists for all aspects of pidfds including pidfd
> - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
> - * (only supports thread group leaders).
> - *
>   * Return: On success, the task_struct associated with the pidfd.
>   *	   On error, a negative errno number will be returned.
>   */
> @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
>   * @flags: flags to pass
>   *
>   * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> - * the process identified by @pid. Currently, the process identified by
> - * @pid must be a thread-group leader. This restriction currently exists
> - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> - * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> - * leaders).
> + * the task identified by @pid. Without PIDFD_THREAD flag the target task
> + * must be a thread-group leader.
>   *
>   * Return: On success, a cloexec pidfd is returned.
>   *         On error, a negative errno number will be returned.
> @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
>  	int fd;
>  	struct pid *p;
>  
> -	if (flags & ~PIDFD_NONBLOCK)
> +	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
>  		return -EINVAL;
>  
>  	if (pid <= 0)
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9561a3962ca6..5f48d2c4b409 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
>  	WARN_ON_ONCE(!tsk->ptrace &&
>  	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
>  	/*
> -	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
> +	 * tsk is a group leader and has no threads, wake up the
> +	 * non-PIDFD_THREAD waiters.
>  	 */
>  	if (thread_group_empty(tsk))
>  		do_notify_pidfd(tsk);
> @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
>  		prepare_kill_siginfo(sig, &kinfo);
>  	}
>  
> +	/* TODO: respect PIDFD_THREAD */
>  	ret = kill_pid_info(sig, &kinfo, pid);
>  
>  err:
> -- 
> 2.25.1.362.g51ebf55
>
  
Tycho Andersen Jan. 31, 2024, 2:27 a.m. UTC | #2
On Tue, Jan 30, 2024 at 12:34:09PM +0100, Oleg Nesterov wrote:
> Damn. Self-NACK.
> 
> I forgot (we all ;) about mt-exec, and there are 2 problems.
> 
> 1. The "if (!thread_group_leader(tsk))" block in de_thread() needs
>    do_notify_pidfd() too, the execing non-leader thread looses its
>    old pid, pidfd_poll(PIDFD_THREAD, pid-of-execing-sub-thread)
>    should succeed. Must be fixed, I think.

I think the `test_non_tgl_exec` from my tests exercises the scenario
you're describing, and it works.

> 2. pidfd_poll(PIDFD_THREAD, pid-of-group-leader) should not succeed
>    when its sub-thread execs, the execing thread inherits the leader's
>    pid. Perhaps pidfd_task_exited() can check sig->group_exec_task,

I didn't have an explicit test for this, but I hacked one up, and
pidfd_poll(PIDFD_THREAD, pid-of-group-leader) doesn't return after
exec.

I think it's ok? But I must be missing something.

Tycho
  
Oleg Nesterov Jan. 31, 2024, 1:52 p.m. UTC | #3
On 01/30, Tycho Andersen wrote:
>
> On Tue, Jan 30, 2024 at 12:34:09PM +0100, Oleg Nesterov wrote:
> > Damn. Self-NACK.
> >
> > I forgot (we all ;) about mt-exec, and there are 2 problems.
> >
> > 1. The "if (!thread_group_leader(tsk))" block in de_thread() needs
> >    do_notify_pidfd() too, the execing non-leader thread looses its
> >    old pid, pidfd_poll(PIDFD_THREAD, pid-of-execing-sub-thread)
> >    should succeed. Must be fixed, I think.
>
> I think the `test_non_tgl_exec` from my tests exercises the scenario
> you're describing, and it works.

This means your test is racy, I guess.

Look. We have a leader L, its sub-thtread T with the pid TPID, and
another process X which sleeps in pidfd_poll(PIDFD_THREAD, TPID).

T starts de_thread and kills the leader L. The leader exits and wakes
X up.

Then T does de_thread() -> exchange_tids() so we have

	// BEFORE:
	// pid_task(TPID, PIDTYPE_PID) == T
	exchange_tids(tsk, leader);
	// AFTER:
	// pid_task(TPID, PIDTYPE_PID) == L

Now. If X calls pidfd_task_exited(TPID, true) "AFTER" then we are
fine, pidfd_task_exited() will return true. OK, this is not exactly
true, leader->exit_state == 0 right after exchange_tids(), but lets
ignore.

However. If X calls pidfd_task_exited(TPID, true) "BEFORE" it will
return false: pid_task(TPID) == T and T is not going to die. So
pidfd_poll() will block again forever, TPID is going to die.

See?

Fixed in v3.

> > 2. pidfd_poll(PIDFD_THREAD, pid-of-group-leader) should not succeed
> >    when its sub-thread execs, the execing thread inherits the leader's
> >    pid. Perhaps pidfd_task_exited() can check sig->group_exec_task,
>
> I didn't have an explicit test for this, but I hacked one up, and
> pidfd_poll(PIDFD_THREAD, pid-of-group-leader) doesn't return after
> exec.

See above, this depends on timing.

See also v3 I've sent, I tried to document the problems with mt-exec.

Oleg.
  

Patch

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 5406fbc13074..2e6461459877 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -7,6 +7,7 @@ 
 #include <linux/fcntl.h>
 
 /* Flags for pidfd_open().  */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK	O_NONBLOCK
+#define PIDFD_THREAD	O_EXCL
 
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index dfb963d2f862..493647fd7c07 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@  static void exit_notify(struct task_struct *tsk, int group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	tsk->exit_state = EXIT_ZOMBIE;
+	/*
+	 * sub-thread or delay_group_leader(), wake up the
+	 * PIDFD_THREAD waiters.
+	 */
+	if (!thread_group_empty(tsk))
+		do_notify_pidfd(tsk);
+
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 347641398f9d..bea32071fff2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,7 @@ 
 #include <linux/user_events.h>
 #include <linux/iommu.h>
 #include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 
 	seq_put_decimal_ll(m, "Pid:\t", nr);
 
+	/* TODO: report PIDFD_THREAD */
+
 #ifdef CONFIG_PID_NS
 	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
 	if (nr > 0) {
@@ -2068,22 +2071,35 @@  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+static bool pidfd_task_exited(struct pid *pid, bool thread)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
+	rcu_read_unlock();
+
+	return exited;
+}
+
 /*
  * Poll support for process exit notification.
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct pid *pid = file->private_data;
+	bool thread = file->f_flags & PIDFD_THREAD;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
-
 	/*
-	 * Inform pollers only when the whole thread group exits.
-	 * If the thread group leader exits before all other threads in the
-	 * group, then poll(2) should block, similar to the wait(2) family.
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
 	 */
-	if (thread_group_exited(pid))
+	if (pidfd_task_exited(pid, thread))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
 
 	return poll_flags;
@@ -2141,6 +2157,11 @@  static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 		return PTR_ERR(pidfd_file);
 	}
 	get_pid(pid); /* held by pidfd_file now */
+	/*
+	 * anon_inode_getfile() ignores everything outside of the
+	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+	 */
+	pidfd_file->f_flags |= (flags & PIDFD_THREAD);
 	*ret = pidfd_file;
 	return pidfd;
 }
@@ -2154,7 +2175,8 @@  static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
  *
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
  *
  * If this function returns successfully the caller is responsible to either
  * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@  static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  */
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 {
-	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+	bool thread = flags & PIDFD_THREAD;
+
+	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
 		return -EINVAL;
 
 	return __pidfd_prepare(pid, flags, ret);
diff --git a/kernel/pid.c b/kernel/pid.c
index c7a3e359f8f5..e11144466828 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -552,11 +552,6 @@  struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
  * Return the task associated with @pidfd. The function takes a reference on
  * the returned task. The caller is responsible for releasing that reference.
  *
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
  * Return: On success, the task_struct associated with the pidfd.
  *	   On error, a negative errno number will be returned.
  */
@@ -615,11 +610,8 @@  static int pidfd_create(struct pid *pid, unsigned int flags)
  * @flags: flags to pass
  *
  * This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
  *
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
@@ -629,7 +621,7 @@  SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	int fd;
 	struct pid *p;
 
-	if (flags & ~PIDFD_NONBLOCK)
+	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
 		return -EINVAL;
 
 	if (pid <= 0)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9561a3962ca6..5f48d2c4b409 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2051,7 +2051,8 @@  bool do_notify_parent(struct task_struct *tsk, int sig)
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 	/*
-	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
+	 * tsk is a group leader and has no threads, wake up the
+	 * non-PIDFD_THREAD waiters.
 	 */
 	if (thread_group_empty(tsk))
 		do_notify_pidfd(tsk);
@@ -3926,6 +3927,7 @@  SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		prepare_kill_siginfo(sig, &kinfo);
 	}
 
+	/* TODO: respect PIDFD_THREAD */
 	ret = kill_pid_info(sig, &kinfo, pid);
 
 err: