[v3,1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()
Commit Message
With this flag:
- pidfd_open() doesn't require that the target task must be
a thread-group leader
- pidfd_poll() succeeds when the task exits and becomes a
zombie (iow, passes exit_notify()), even if it is a leader
and thread-group is not empty.
This means that the behaviour of pidfd_poll(PIDFD_THREAD,
pid-of-group-leader) is not well defined if it races with
exec() from its sub-thread; pidfd_poll() can succeed or not
depending on whether pidfd_task_exited() is called before
or after exchange_tids().
Perhaps we can improve this behaviour later, pidfd_poll()
can probably take sig->group_exec_task into account. But
this doesn't really differ from the case when the leader
exits before other threads (so pidfd_poll() succeeds) and
then another thread execs and pidfd_poll() will block again.
thread_group_exited() is no longer used, perhaps it can die.
Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
fs/exec.c | 6 +++++-
include/linux/pid.h | 3 ++-
include/uapi/linux/pidfd.h | 3 ++-
kernel/exit.c | 7 +++++++
kernel/fork.c | 38 +++++++++++++++++++++++++++++++-------
kernel/pid.c | 14 +++-----------
kernel/signal.c | 6 ++++--
7 files changed, 54 insertions(+), 23 deletions(-)
Comments
On Wed, Jan 31, 2024 at 02:26:02PM +0100, Oleg Nesterov wrote:
> With this flag:
>
> - pidfd_open() doesn't require that the target task must be
> a thread-group leader
>
> - pidfd_poll() succeeds when the task exits and becomes a
> zombie (iow, passes exit_notify()), even if it is a leader
> and thread-group is not empty.
>
> This means that the behaviour of pidfd_poll(PIDFD_THREAD,
> pid-of-group-leader) is not well defined if it races with
> exec() from its sub-thread; pidfd_poll() can succeed or not
> depending on whether pidfd_task_exited() is called before
> or after exchange_tids().
I think that we can live with that for now. If it doesn't hinder Tycho's
use-case it's fine.
>
> Perhaps we can improve this behaviour later, pidfd_poll()
> can probably take sig->group_exec_task into account. But
> this doesn't really differ from the case when the leader
> exits before other threads (so pidfd_poll() succeeds) and
> then another thread execs and pidfd_poll() will block again.
.. because it assumes the old thread-group leader's struct pid, right.
One would hope that implementations don't poll on the same fd again
after they saw an exit event.
(I often wish that we could report custom data from poll similar to
kqueue on bsd then we could attach file specific information to the
notification. That would enable userspace to handle this the way they
want.)
>
> thread_group_exited() is no longer used, perhaps it can die.
Feel free to add a patch for that on top of it.
>
> Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> fs/exec.c | 6 +++++-
> include/linux/pid.h | 3 ++-
> include/uapi/linux/pidfd.h | 3 ++-
> kernel/exit.c | 7 +++++++
> kernel/fork.c | 38 +++++++++++++++++++++++++++++++-------
> kernel/pid.c | 14 +++-----------
> kernel/signal.c | 6 ++++--
> 7 files changed, 54 insertions(+), 23 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 73e4045df271..0fd7e668c477 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
>
> BUG_ON(leader->exit_state != EXIT_ZOMBIE);
> leader->exit_state = EXIT_DEAD;
> -
> + /*
> + * leader and tsk exhanged their pids, the old pid dies,
> + * wake up the PIDFD_THREAD waiters.
> + */
> + do_notify_pidfd(leader);
> /*
> * We are going to release_task()->ptrace_unlink() silently,
> * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index e6a041cb8bac..8124d57752b9 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
>
> struct file;
>
> -extern struct pid *pidfd_pid(const struct file *file);
> +struct pid *pidfd_pid(const struct file *file);
> struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
> struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
> +void do_notify_pidfd(struct task_struct *task);
>
> static inline struct pid *get_pid(struct pid *pid)
> {
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 5406fbc13074..2e6461459877 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -7,6 +7,7 @@
> #include <linux/fcntl.h>
>
> /* Flags for pidfd_open(). */
> -#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_THREAD O_EXCL
>
> #endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index dfb963d2f862..493647fd7c07 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> kill_orphaned_pgrp(tsk->group_leader, NULL);
>
> tsk->exit_state = EXIT_ZOMBIE;
> + /*
> + * sub-thread or delay_group_leader(), wake up the
> + * PIDFD_THREAD waiters.
> + */
> + if (!thread_group_empty(tsk))
> + do_notify_pidfd(tsk);
> +
> if (unlikely(tsk->ptrace)) {
> int sig = thread_group_leader(tsk) &&
> thread_group_empty(tsk) &&
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 347641398f9d..bea32071fff2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -101,6 +101,7 @@
> #include <linux/user_events.h>
> #include <linux/iommu.h>
> #include <linux/rseq.h>
> +#include <uapi/linux/pidfd.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>
> seq_put_decimal_ll(m, "Pid:\t", nr);
>
> + /* TODO: report PIDFD_THREAD */
> +
> #ifdef CONFIG_PID_NS
> seq_put_decimal_ll(m, "\nNSpid:\t", nr);
> if (nr > 0) {
> @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
> }
> #endif
>
> +static bool pidfd_task_exited(struct pid *pid, bool thread)
> +{
> + struct task_struct *task;
> + bool exited;
> +
> + rcu_read_lock();
> + task = pid_task(pid, PIDTYPE_PID);
> + exited = !task ||
> + (READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
> + rcu_read_unlock();
> +
> + return exited;
> +}
> +
> /*
> * Poll support for process exit notification.
> */
> static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> {
> struct pid *pid = file->private_data;
> + bool thread = file->f_flags & PIDFD_THREAD;
> __poll_t poll_flags = 0;
>
> poll_wait(file, &pid->wait_pidfd, pts);
> -
> /*
> - * Inform pollers only when the whole thread group exits.
> - * If the thread group leader exits before all other threads in the
> - * group, then poll(2) should block, similar to the wait(2) family.
> + * Depending on PIDFD_THREAD, inform pollers when the thread
> + * or the whole thread-group exits.
> */
> - if (thread_group_exited(pid))
> + if (pidfd_task_exited(pid, thread))
> poll_flags = EPOLLIN | EPOLLRDNORM;
>
> return poll_flags;
> @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> return PTR_ERR(pidfd_file);
> }
> get_pid(pid); /* held by pidfd_file now */
> + /*
> + * anon_inode_getfile() ignores everything outside of the
> + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
> + */
> + pidfd_file->f_flags |= (flags & PIDFD_THREAD);
> *ret = pidfd_file;
> return pidfd;
> }
> @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> * Allocate a new file that stashes @pid and reserve a new pidfd number in the
> * caller's file descriptor table. The pidfd is reserved but not installed yet.
> *
> - * The helper verifies that @pid is used as a thread group leader.
> + * The helper verifies that @pid is still in use, without PIDFD_THREAD the
> + * task identified by @pid must be a thread-group leader.
> *
> * If this function returns successfully the caller is responsible to either
> * call fd_install() passing the returned pidfd and pidfd file as arguments in
> @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> */
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> {
> - if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> + bool thread = flags & PIDFD_THREAD;
> +
> + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
> return -EINVAL;
>
> return __pidfd_prepare(pid, flags, ret);
> diff --git a/kernel/pid.c b/kernel/pid.c
> index c7a3e359f8f5..e11144466828 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
> * Return the task associated with @pidfd. The function takes a reference on
> * the returned task. The caller is responsible for releasing that reference.
> *
> - * Currently, the process identified by @pidfd is always a thread-group leader.
> - * This restriction currently exists for all aspects of pidfds including pidfd
> - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
> - * (only supports thread group leaders).
> - *
> * Return: On success, the task_struct associated with the pidfd.
> * On error, a negative errno number will be returned.
> */
> @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
> * @flags: flags to pass
> *
> * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> - * the process identified by @pid. Currently, the process identified by
> - * @pid must be a thread-group leader. This restriction currently exists
> - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> - * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> - * leaders).
> + * the task identified by @pid. Without PIDFD_THREAD flag the target task
> + * must be a thread-group leader.
> *
> * Return: On success, a cloexec pidfd is returned.
> * On error, a negative errno number will be returned.
> @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> int fd;
> struct pid *p;
>
> - if (flags & ~PIDFD_NONBLOCK)
> + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
> return -EINVAL;
>
> if (pid <= 0)
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9561a3962ca6..9b40109f0c56 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
> return ret;
> }
>
> -static void do_notify_pidfd(struct task_struct *task)
> +void do_notify_pidfd(struct task_struct *task)
> {
> struct pid *pid;
>
> @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
> WARN_ON_ONCE(!tsk->ptrace &&
> (tsk->group_leader != tsk || !thread_group_empty(tsk)));
> /*
> - * tsk is a group leader and has no threads, wake up the pidfd waiters.
> + * tsk is a group leader and has no threads, wake up the
> + * non-PIDFD_THREAD waiters.
> */
> if (thread_group_empty(tsk))
> do_notify_pidfd(tsk);
> @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> prepare_kill_siginfo(sig, &kinfo);
> }
>
> + /* TODO: respect PIDFD_THREAD */
> ret = kill_pid_info(sig, &kinfo, pid);
>
> err:
> --
> 2.25.1.362.g51ebf55
>
>
On 01/31, Christian Brauner wrote:
>
> On Wed, Jan 31, 2024 at 02:26:02PM +0100, Oleg Nesterov wrote:
> > With this flag:
> >
> > - pidfd_open() doesn't require that the target task must be
> > a thread-group leader
> >
> > - pidfd_poll() succeeds when the task exits and becomes a
> > zombie (iow, passes exit_notify()), even if it is a leader
> > and thread-group is not empty.
> >
> > This means that the behaviour of pidfd_poll(PIDFD_THREAD,
> > pid-of-group-leader) is not well defined if it races with
> > exec() from its sub-thread; pidfd_poll() can succeed or not
> > depending on whether pidfd_task_exited() is called before
> > or after exchange_tids().
>
> I think that we can live with that for now. If it doesn't hinder Tycho's
> use-case it's fine.
OK, good.
> (I often wish that we could report custom data from poll similar to
> kqueue on bsd then we could attach file specific information to the
> notification. That would enable userspace to handle this the way they
> want.)
Or at least perhaps we can change do_notify_pidfd() to use the keyed
wakeups... I'll try to take a look later.
> > thread_group_exited() is no longer used, perhaps it can die.
>
> Feel free to add a patch for that on top of it.
Yes, will do.
Oleg.
Hi Oleg,
On Wed, Jan 31, 2024 at 02:26:02PM +0100, Oleg Nesterov wrote:
> With this flag:
>
> - pidfd_open() doesn't require that the target task must be
> a thread-group leader
>
> - pidfd_poll() succeeds when the task exits and becomes a
> zombie (iow, passes exit_notify()), even if it is a leader
> and thread-group is not empty.
>
> This means that the behaviour of pidfd_poll(PIDFD_THREAD,
> pid-of-group-leader) is not well defined if it races with
> exec() from its sub-thread; pidfd_poll() can succeed or not
> depending on whether pidfd_task_exited() is called before
> or after exchange_tids().
>
> Perhaps we can improve this behaviour later, pidfd_poll()
> can probably take sig->group_exec_task into account. But
> this doesn't really differ from the case when the leader
> exits before other threads (so pidfd_poll() succeeds) and
> then another thread execs and pidfd_poll() will block again.
I don't have a strong opinion here; leaving it "undefined" for now is
fine with me.
> @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
> */
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> {
> - if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> + bool thread = flags & PIDFD_THREAD;
> +
> + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
Small typo here, trailing ;. When I fix that, tests pass for me.
Assuming that's fixed up:
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Tested-by: Tycho Andersen <tandersen@netflix.com>
Thanks for your help!
Tycho
On 01/31, Tycho Andersen wrote:
>
> > int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> > {
> > - if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> > + bool thread = flags & PIDFD_THREAD;
> > +
> > + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
>
> Small typo here, trailing ;. When I fix that, tests pass for me.
OOPS.
Christian, should I spam lkml with v4, or will you add this fix yourself?
> Assuming that's fixed up:
>
> Reviewed-by: Tycho Andersen <tandersen@netflix.com>
> Tested-by: Tycho Andersen <tandersen@netflix.com>
Thanks Tycho!!!
Oleg.
On Wed, Jan 31, 2024 at 07:01:47PM +0100, Oleg Nesterov wrote:
> On 01/31, Tycho Andersen wrote:
> >
> > > int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> > > {
> > > - if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
> > > + bool thread = flags & PIDFD_THREAD;
> > > +
> > > + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
> >
> > Small typo here, trailing ;. When I fix that, tests pass for me.
>
> OOPS.
>
> Christian, should I spam lkml with v4, or will you add this fix yourself?
I fixed that up. No need to resend!
On Wed, Jan 31, 2024 at 02:26:02PM +0100, Oleg Nesterov wrote:
> With this flag:
>
> - pidfd_open() doesn't require that the target task must be
> a thread-group leader
>
> - pidfd_poll() succeeds when the task exits and becomes a
> zombie (iow, passes exit_notify()), even if it is a leader
> and thread-group is not empty.
>
> This means that the behaviour of pidfd_poll(PIDFD_THREAD,
> pid-of-group-leader) is not well defined if it races with
> exec() from its sub-thread; pidfd_poll() can succeed or not
> depending on whether pidfd_task_exited() is called before
> or after exchange_tids().
>
> Perhaps we can improve this behaviour later, pidfd_poll()
> can probably take sig->group_exec_task into account. But
> this doesn't really differ from the case when the leader
> exits before other threads (so pidfd_poll() succeeds) and
> then another thread execs and pidfd_poll() will block again.
>
> thread_group_exited() is no longer used, perhaps it can die.
>
> Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> ---
> fs/exec.c | 6 +++++-
> include/linux/pid.h | 3 ++-
> include/uapi/linux/pidfd.h | 3 ++-
> kernel/exit.c | 7 +++++++
> kernel/fork.c | 38 +++++++++++++++++++++++++++++++-------
> kernel/pid.c | 14 +++-----------
> kernel/signal.c | 6 ++++--
> 7 files changed, 54 insertions(+), 23 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 73e4045df271..0fd7e668c477 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
>
> BUG_ON(leader->exit_state != EXIT_ZOMBIE);
> leader->exit_state = EXIT_DEAD;
> -
> + /*
> + * leader and tsk exhanged their pids, the old pid dies,
> + * wake up the PIDFD_THREAD waiters.
> + */
> + do_notify_pidfd(leader);
> /*
> * We are going to release_task()->ptrace_unlink() silently,
> * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index e6a041cb8bac..8124d57752b9 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
>
> struct file;
>
> -extern struct pid *pidfd_pid(const struct file *file);
> +struct pid *pidfd_pid(const struct file *file);
> struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
> struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
> int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
> +void do_notify_pidfd(struct task_struct *task);
>
> static inline struct pid *get_pid(struct pid *pid)
> {
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index 5406fbc13074..2e6461459877 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -7,6 +7,7 @@
> #include <linux/fcntl.h>
>
> /* Flags for pidfd_open(). */
> -#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_NONBLOCK O_NONBLOCK
> +#define PIDFD_THREAD O_EXCL
>
> #endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index dfb963d2f862..493647fd7c07 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> kill_orphaned_pgrp(tsk->group_leader, NULL);
>
> tsk->exit_state = EXIT_ZOMBIE;
> + /*
> + * sub-thread or delay_group_leader(), wake up the
> + * PIDFD_THREAD waiters.
> + */
> + if (!thread_group_empty(tsk))
> + do_notify_pidfd(tsk);
> +
> if (unlikely(tsk->ptrace)) {
> int sig = thread_group_leader(tsk) &&
> thread_group_empty(tsk) &&
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 347641398f9d..bea32071fff2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -101,6 +101,7 @@
> #include <linux/user_events.h>
> #include <linux/iommu.h>
> #include <linux/rseq.h>
> +#include <uapi/linux/pidfd.h>
>
> #include <asm/pgalloc.h>
> #include <linux/uaccess.h>
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>
> seq_put_decimal_ll(m, "Pid:\t", nr);
>
> + /* TODO: report PIDFD_THREAD */
One more thing that came to my mind. We also support waitid(P_PIDFD,
pidfd). And I just looked through the code and I think it does the right
thing when we pass it a PIDFD_THREAD pidfd because wait_consider_task()
rejects task->exit_signal != SIGCHLD in eligible_child() and
CLONE_THREAD implies task->exit_signal == -1.
On 02/01, Christian Brauner wrote:
>
> One more thing that came to my mind. We also support waitid(P_PIDFD,
> pidfd). And I just looked through the code and I think it does the right
> thing when we pass it a PIDFD_THREAD pidfd
Yes, I too looked into kernel_waitid_prepare(P_PIDFD) and didn't notice
anything wrong,
> because wait_consider_task()
Even simpler, I think. waitid(P_PIDFD, pidfd_with_PIDFD_THREAD) doesn't
really differ from waitid(P_PID, pidfd_pid(pidfd_with_PIDFD_THREAD)), so
it should work "as expected".
Oleg.
> @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>
> seq_put_decimal_ll(m, "Pid:\t", nr);
>
> + /* TODO: report PIDFD_THREAD */
So I think we don't need to do anything here. Since PIDFD_THREAD sets
O_EXCL in file->f_flags and in contrast to do_dentry_open() it isn't
dropped. So userspace can already detect PIDFD_NONBLOCK as O_NONBLOCK
and PIDFD_THREAD as O_EXCL.
On 02/02, Christian Brauner wrote:
>
> > @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
> >
> > seq_put_decimal_ll(m, "Pid:\t", nr);
> >
> > + /* TODO: report PIDFD_THREAD */
>
> So I think we don't need to do anything here. Since PIDFD_THREAD sets
> O_EXCL in file->f_flags and in contrast to do_dentry_open() it isn't
> dropped. So userspace can already detect PIDFD_NONBLOCK as O_NONBLOCK
> and PIDFD_THREAD as O_EXCL.
Ah, indeed, I didn't know that fs/proc/fd.c:seq_show() reports ->f_flags.
Thanks.
OK, what about another TODO in sys_pidfd_send_signal() ?
I mean, should I send a simple patch which changes pidfd_send_signal()
to use do_send_specific() if PIDFD_THREAD ? Or do you think this should
be controlled by pidfd_send_signal's "flags" argument?
I honestly do not know what makes more sense.
Oleg.
On Mon, Feb 05, 2024 at 04:22:41PM +0100, Oleg Nesterov wrote:
> On 02/02, Christian Brauner wrote:
> >
> > > @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
> > >
> > > seq_put_decimal_ll(m, "Pid:\t", nr);
> > >
> > > + /* TODO: report PIDFD_THREAD */
> >
> > So I think we don't need to do anything here. Since PIDFD_THREAD sets
> > O_EXCL in file->f_flags and in contrast to do_dentry_open() it isn't
> > dropped. So userspace can already detect PIDFD_NONBLOCK as O_NONBLOCK
> > and PIDFD_THREAD as O_EXCL.
>
> Ah, indeed, I didn't know that fs/proc/fd.c:seq_show() reports ->f_flags.
> Thanks.
>
>
>
> OK, what about another TODO in sys_pidfd_send_signal() ?
>
> I mean, should I send a simple patch which changes pidfd_send_signal()
> to use do_send_specific() if PIDFD_THREAD ? Or do you think this should
Yes, by default the type of pidfd should determine the scope of the
signal as discussed elsewhere.
(And later we can add flags that allow the caller to change the scope of
the signal. So that Andy's use-cases are covered as well.)
Hello,
kernel test robot noticed "last_state.load_disk_fail" on:
commit: aa9c201b150f15cf12e8df8af531b2c74ae1a8fc ("[PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()")
url: https://github.com/intel-lab-lkp/linux/commits/Oleg-Nesterov/pidfd-implement-PIDFD_THREAD-flag-for-pidfd_open/20240131-213408
base: https://git.kernel.org/cgit/linux/kernel/git/vfs/vfs.git vfs.all
patch link: https://lore.kernel.org/all/20240131132602.GA23641@redhat.com/
patch subject: [PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()
in testcase: boot
compiler: gcc-12
test machine: 128 threads 2 sockets Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz (Ice Lake) with 128G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202402081434.9e1ded3-oliver.sang@intel.com
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240208/202402081434.9e1ded3-oliver.sang@intel.com
as in dmesg in above link:
LKP: ttyS0: 1408: can't load the disk /dev/disk/by-id/ata-SanDisk_SDSSDH3250G_182971800454-part1, skip testing...
we know this should be related with some early issues but we failed to figure
out. so here also attached parent dmesg as dmesg-e0ee7b583f.xz FYI.
On Thu, Feb 08, 2024 at 03:54:44PM +0800, kernel test robot wrote:
>
>
> Hello,
>
> kernel test robot noticed "last_state.load_disk_fail" on:
>
> commit: aa9c201b150f15cf12e8df8af531b2c74ae1a8fc ("[PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()")
> url: https://github.com/intel-lab-lkp/linux/commits/Oleg-Nesterov/pidfd-implement-PIDFD_THREAD-flag-for-pidfd_open/20240131-213408
> base: https://git.kernel.org/cgit/linux/kernel/git/vfs/vfs.git vfs.all
> patch link: https://lore.kernel.org/all/20240131132602.GA23641@redhat.com/
> patch subject: [PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()
>
> in testcase: boot
>
> compiler: gcc-12
> test machine: 128 threads 2 sockets Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz (Ice Lake) with 128G memory
>
> (please refer to attached dmesg/kmsg for entire log/backtrace)
>
>
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Closes: https://lore.kernel.org/oe-lkp/202402081434.9e1ded3-oliver.sang@intel.com
>
>
>
> The kernel config and materials to reproduce are available at:
> https://download.01.org/0day-ci/archive/20240208/202402081434.9e1ded3-oliver.sang@intel.com
>
>
> as in dmesg in above link:
>
> LKP: ttyS0: 1408: can't load the disk /dev/disk/by-id/ata-SanDisk_SDSSDH3250G_182971800454-part1, skip testing...
>
> we know this should be related with some early issues but we failed to figure
> out. so here also attached parent dmesg as dmesg-e0ee7b583f.xz FYI.
I have a hard time seeing how this would be caused by any of Oleg's
changes. Plus, the merges from the vfs tree linked under "url:" above
are all pretty out of date?
On 02/08, Christian Brauner wrote:
>
> On Thu, Feb 08, 2024 at 03:54:44PM +0800, kernel test robot wrote:
> >
> >
> > Hello,
> >
> > kernel test robot noticed "last_state.load_disk_fail" on:
> >
> > commit: aa9c201b150f15cf12e8df8af531b2c74ae1a8fc ("[PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()")
> > url: https://github.com/intel-lab-lkp/linux/commits/Oleg-Nesterov/pidfd-implement-PIDFD_THREAD-flag-for-pidfd_open/20240131-213408
> > base: https://git.kernel.org/cgit/linux/kernel/git/vfs/vfs.git vfs.all
> > patch link: https://lore.kernel.org/all/20240131132602.GA23641@redhat.com/
> > patch subject: [PATCH v3 1/1] pidfd: implement PIDFD_THREAD flag for pidfd_open()
> >
> > in testcase: boot
> >
> > compiler: gcc-12
> > test machine: 128 threads 2 sockets Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz (Ice Lake) with 128G memory
> >
> > (please refer to attached dmesg/kmsg for entire log/backtrace)
> >
> >
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <oliver.sang@intel.com>
> > | Closes: https://lore.kernel.org/oe-lkp/202402081434.9e1ded3-oliver.sang@intel.com
> >
> >
> >
> > The kernel config and materials to reproduce are available at:
> > https://download.01.org/0day-ci/archive/20240208/202402081434.9e1ded3-oliver.sang@intel.com
> >
> >
> > as in dmesg in above link:
> >
> > LKP: ttyS0: 1408: can't load the disk /dev/disk/by-id/ata-SanDisk_SDSSDH3250G_182971800454-part1, skip testing...
> >
> > we know this should be related with some early issues but we failed to figure
> > out. so here also attached parent dmesg as dmesg-e0ee7b583f.xz FYI.
>
> I have a hard time seeing how this would be caused by any of Oleg's
> changes. Plus, the merges from the vfs tree linked under "url:" above
> are all pretty out of date?
Yes, the patch under url still has the extra trailing ";" after "if"
so pidfd_prepare() will always fail. Fixed by Tycho.
Oleg.
@@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
-
+ /*
+ * leader and tsk exhanged their pids, the old pid dies,
+ * wake up the PIDFD_THREAD waiters.
+ */
+ do_notify_pidfd(leader);
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
struct file;
-extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+void do_notify_pidfd(struct task_struct *task);
static inline struct pid *get_pid(struct pid *pid)
{
@@ -7,6 +7,7 @@
#include <linux/fcntl.h>
/* Flags for pidfd_open(). */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_THREAD O_EXCL
#endif /* _UAPI_LINUX_PIDFD_H */
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
+ /*
+ * sub-thread or delay_group_leader(), wake up the
+ * PIDFD_THREAD waiters.
+ */
+ if (!thread_group_empty(tsk))
+ do_notify_pidfd(tsk);
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -101,6 +101,7 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
seq_put_decimal_ll(m, "Pid:\t", nr);
+ /* TODO: report PIDFD_THREAD */
+
#ifdef CONFIG_PID_NS
seq_put_decimal_ll(m, "\nNSpid:\t", nr);
if (nr > 0) {
@@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif
+static bool pidfd_task_exited(struct pid *pid, bool thread)
+{
+ struct task_struct *task;
+ bool exited;
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ exited = !task ||
+ (READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
+ rcu_read_unlock();
+
+ return exited;
+}
+
/*
* Poll support for process exit notification.
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = file->private_data;
+ bool thread = file->f_flags & PIDFD_THREAD;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
-
/*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
+ * Depending on PIDFD_THREAD, inform pollers when the thread
+ * or the whole thread-group exits.
*/
- if (thread_group_exited(pid))
+ if (pidfd_task_exited(pid, thread))
poll_flags = EPOLLIN | EPOLLRDNORM;
return poll_flags;
@@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
return PTR_ERR(pidfd_file);
}
get_pid(pid); /* held by pidfd_file now */
+ /*
+ * anon_inode_getfile() ignores everything outside of the
+ * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+ */
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
@@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ bool thread = flags & PIDFD_THREAD;
+
+ if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID));
return -EINVAL;
return __pidfd_prepare(pid, flags, ret);
@@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
@@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
@@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;
- if (flags & ~PIDFD_NONBLOCK)
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;
if (pid <= 0)
@@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret;
}
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
@@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
- * tsk is a group leader and has no threads, wake up the pidfd waiters.
+ * tsk is a group leader and has no threads, wake up the
+ * non-PIDFD_THREAD waiters.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
@@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
prepare_kill_siginfo(sig, &kinfo);
}
+ /* TODO: respect PIDFD_THREAD */
ret = kill_pid_info(sig, &kinfo, pid);
err: