[1/3] pidfd_poll: report POLLHUP when pid_task() == NULL

Message ID 20240202131226.GA26018@redhat.com
State New
Headers
Series pidfd_poll: report POLLHUP when pid_task() == NULL |

Commit Message

Oleg Nesterov Feb. 2, 2024, 1:12 p.m. UTC
  Add another wake_up_all(wait_pidfd) into __change_pid() and change
pidfd_poll() to include EPOLLHUP if task == NULL.

This allows to wait until the target process/thread is reaped.

TODO: change do_notify_pidfd() to use the keyed wakeups.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/fork.c | 22 +++++++---------------
 kernel/pid.c  |  5 +++++
 2 files changed, 12 insertions(+), 15 deletions(-)
  

Comments

Christian Brauner Feb. 2, 2024, 2:44 p.m. UTC | #1
> TODO: change do_notify_pidfd() to use the keyed wakeups.

How does the following appended patch look?
  
Christian Brauner Feb. 2, 2024, 3:16 p.m. UTC | #2
On Fri, Feb 02, 2024 at 03:44:45PM +0100, Christian Brauner wrote:
> > TODO: change do_notify_pidfd() to use the keyed wakeups.
> 
> How does the following appended patch look?

I missed the conversion in detach_pid() which should also just become:

diff --git a/kernel/pid.c b/kernel/pid.c
index de0bf2f8d18b..1bfcfa195226 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -351,7 +351,8 @@ static void __change_pid(struct task_struct *task, enum pid_type type,

        if (type == PIDTYPE_PID) {
                WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
-               wake_up_all(&pid->wait_pidfd);
+               pidfd_wake_up_poll(&pid->wait_pidfd,
+                                  EPOLLIN | EPOLLRDNORM | EPOLLHUP);
        }

        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
  
Oleg Nesterov Feb. 2, 2024, 4:07 p.m. UTC | #3
On 02/02, Christian Brauner wrote:
>
> > TODO: change do_notify_pidfd() to use the keyed wakeups.
>
> How does the following appended patch look?

No, no.

I think we need a simpler patch. I was going to send it as 4/4, but I'd
like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or
at least provided more info.

3 notes for now:

	1. we can't use wake_up_poll(), it passes nr_exclusive => 1

	2. exit_notify() should not pass EPOLLHUP to wake_up, we do
	   not want to wake up the { .events = POLLHUP } waiters.

	3. we do not need to change __change_pid().

	   Well, _perhaps_ it can/should use __wake_up_pollfree(), but
	   I need to check if fs/select.c use "autoremove" or not.


> -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> +static __poll_t pidfd_poll(struct file *file, poll_table *wait)
>  {
>  	struct pid *pid = file->private_data;
>  	bool thread = file->f_flags & PIDFD_THREAD;
>  	struct task_struct *task;
>  	__poll_t poll_flags = 0;
>  
> -	poll_wait(file, &pid->wait_pidfd, pts);
> +	poll_wait(file, &pid->wait_pidfd, wait);

This is correct but only cosemtic and has nothing to do with what
we discuss?

Oleg.
  
Christian Brauner Feb. 2, 2024, 5:24 p.m. UTC | #4
> I think we need a simpler patch. I was going to send it as 4/4, but I'd
> like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
> and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or

Right, I didn't go that far.

> at least provided more info.
> 
> 3 notes for now:
> 
> 	1. we can't use wake_up_poll(), it passes nr_exclusive => 1

Bah. So we need the same stuff we did for io_uring and use
__wake_up() directly. Or we add wake_up_all_poll() and convert the other
three callsites:

// uncompiled, untested

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 8aa3372f21a0..210ee0d69b6f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -234,6 +234,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
 #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
 #define wake_up_poll(x, m)                                                     \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
+#define wake_up_all_poll(x, m)                                                 \
+       __wake_up(x, TASK_NORMAL, 0, poll_to_key(m))
 #define wake_up_poll_on_current_cpu(x, m)                                      \
        __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
 #define wake_up_locked_poll(x, m)                                              \
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 04e33f25919c..65dcd5dc9645 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -228,8 +228,7 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
 {
        if (wq_has_sleeper(&ctx->poll_wq))
-               __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
-                               poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+               wake_up_all_poll(&ctx->poll_wq, EPOLL_URING_WAKE | EPOLLIN);
 }

 static inline void io_cqring_wake(struct io_ring_ctx *ctx)
@@ -245,8 +244,7 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
         * epoll and should terminate multishot poll at that point.
         */
        if (wq_has_sleeper(&ctx->cq_wait))
-               __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
-                               poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
+               wake_up_all_poll(&ctx->cq_wait, EPOLL_URING_WAKE | EPOLLIN);
 }

 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..ee849fb35603 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -208,7 +208,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);  /* For internal
use only */

 void __wake_up_pollfree(struct wait_queue_head *wq_head)
  {
  -       __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP |
  POLLFREE));
  +       wake_up_all_poll(wq_head, EPOLLHUP | POLLFREE);
          /* POLLFREE must have cleared the queue. */
	          WARN_ON_ONCE(waitqueue_active(wq_head));
		   }

> 
> 	2. exit_notify() should not pass EPOLLHUP to wake_up, we do
> 	   not want to wake up the { .events = POLLHUP } waiters.

Indeed.

> 
> 	3. we do not need to change __change_pid().
> 
> 	   Well, _perhaps_ it can/should use __wake_up_pollfree(), but
> 	   I need to check if fs/select.c use "autoremove" or not.
> 
> 
> > -static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> > +static __poll_t pidfd_poll(struct file *file, poll_table *wait)
> >  {
> >  	struct pid *pid = file->private_data;
> >  	bool thread = file->f_flags & PIDFD_THREAD;
> >  	struct task_struct *task;
> >  	__poll_t poll_flags = 0;
> >  
> > -	poll_wait(file, &pid->wait_pidfd, pts);
> > +	poll_wait(file, &pid->wait_pidfd, wait);
> 
> This is correct but only cosemtic and has nothing to do with what
> we discuss?

No, I just folded all of the changes because it was just a draft. See
the updated draft I appended.
  
Oleg Nesterov Feb. 2, 2024, 7:05 p.m. UTC | #5
On 02/02, Christian Brauner wrote:
>
> > I think we need a simpler patch. I was going to send it as 4/4, but I'd
> > like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
> > and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or
>
> Right, I didn't go that far.
>
> > at least provided more info.
> >
> > 3 notes for now:
> >
> > 	1. we can't use wake_up_poll(), it passes nr_exclusive => 1
>
> Bah. So we need the same stuff we did for io_uring and use
> __wake_up() directly. Or we add wake_up_all_poll() and convert the other
> three callsites:

..

> +#define wake_up_all_poll(x, m)                                                 \
> +       __wake_up(x, TASK_NORMAL, 0, poll_to_key(m))

Agreed, but I think this + s/wake_up/wake_up_all_poll/ conversions
need a separate patch.


> -void do_notify_pidfd(struct task_struct *task)
> +void pidfd_wake_up_poll(struct task_struct *task, bool dead)
>  {
> -	struct pid *pid;
> -
>  	WARN_ON(task->exit_state == 0);
> -	pid = task_pid(task);
> -	wake_up_all(&pid->wait_pidfd);
> +	WARN_ON(mask == 0);
> +	wake_up_all_poll(&task_pid(task)->wait_pidfd,
> +			 EPOLLIN | EPOLLRDNORM | dead ? EPOLLHUP : 0);

No...

This is still overcomplicated and is not right.

Christian, I'll write another email tomorrow.

Oleg.
  
Christian Brauner Feb. 2, 2024, 7:50 p.m. UTC | #6
On Fri, Feb 02, 2024 at 08:05:29PM +0100, Oleg Nesterov wrote:
> On 02/02, Christian Brauner wrote:
> >
> > > I think we need a simpler patch. I was going to send it as 4/4, but I'd
> > > like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
> > > and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or
> >
> > Right, I didn't go that far.
> >
> > > at least provided more info.
> > >
> > > 3 notes for now:
> > >
> > > 	1. we can't use wake_up_poll(), it passes nr_exclusive => 1
> >
> > Bah. So we need the same stuff we did for io_uring and use
> > __wake_up() directly. Or we add wake_up_all_poll() and convert the other
> > three callsites:
> 
> ...
> 
> > +#define wake_up_all_poll(x, m)                                                 \
> > +       __wake_up(x, TASK_NORMAL, 0, poll_to_key(m))
> 
> Agreed, but I think this + s/wake_up/wake_up_all_poll/ conversions
> need a separate patch.

Yeah, I know. This is just a scribbled draft.

> 
> 
> > -void do_notify_pidfd(struct task_struct *task)
> > +void pidfd_wake_up_poll(struct task_struct *task, bool dead)
> >  {
> > -	struct pid *pid;
> > -
> >  	WARN_ON(task->exit_state == 0);
> > -	pid = task_pid(task);
> > -	wake_up_all(&pid->wait_pidfd);
> > +	WARN_ON(mask == 0);
> > +	wake_up_all_poll(&task_pid(task)->wait_pidfd,
> > +			 EPOLLIN | EPOLLRDNORM | dead ? EPOLLHUP : 0);
> 
> No...
> 
> This is still overcomplicated and is not right.

I'm all ears.

> Christian, I'll write another email tomorrow.

Sure, there's no rush. I had not intended that this patch be used. I
have another large series I need to take care of so I can't spend a lot
of time on writing this anyway. I just hadn't used the keyed apis before
and got curious. So don't get the impression that I intend to write
this. I fully expected you to do it.
  
Oleg Nesterov Feb. 3, 2024, 12:04 p.m. UTC | #7
Christian, I apologize for my terse and unclear emails yesterday,
I was a bit busy.

On 02/02, Oleg Nesterov wrote:
>
> On 02/02, Christian Brauner wrote:
> >
> > > I think we need a simpler patch. I was going to send it as 4/4, but I'd
> > > like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
> > > and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or
> >
> > Right, I didn't go that far.

OK, so lets forget about the PIDFD_THREAD waiters for the moment.
Then everything is trivial, please see below.

> > > 	1. we can't use wake_up_poll(), it passes nr_exclusive => 1
> >
> > Bah. So we need the same stuff we did for io_uring and use
> > __wake_up() directly. Or we add wake_up_all_poll() and convert the other
> > three callsites:
>
> ...
>
> > +#define wake_up_all_poll(x, m)                                                 \
> > +       __wake_up(x, TASK_NORMAL, 0, poll_to_key(m))
>
> Agreed, but I think this + s/wake_up/wake_up_all_poll/ conversions
> need a separate patch.

And if it was not clear I like this change! In fact I thought about
the new helper too, but I didn't realize that it already have the
users.

> > -void do_notify_pidfd(struct task_struct *task)
> > +void pidfd_wake_up_poll(struct task_struct *task, bool dead)
> >  {
> > -	struct pid *pid;
> > -
> >  	WARN_ON(task->exit_state == 0);
> > -	pid = task_pid(task);
> > -	wake_up_all(&pid->wait_pidfd);
> > +	WARN_ON(mask == 0);
> > +	wake_up_all_poll(&task_pid(task)->wait_pidfd,
> > +			 EPOLLIN | EPOLLRDNORM | dead ? EPOLLHUP : 0);
>
> No...
>
> This is still overcomplicated and is not right.
                                ^^^^^^^^^^^^^^^^
Sorry, sorry, I misread your change, "dead" is always false so it has
no effect and thus the change is correct.

But why do we need this arg? All we need is the trivial one-liner:

--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2025,7 +2025,8 @@ void do_notify_pidfd(struct task_struct *task)
 
 	WARN_ON(task->exit_state == 0);
 	pid = task_pid(task);
-	wake_up_all(&pid->wait_pidfd);
+	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
+		  poll_to_key(EPOLLIN | EPOLLRDNORM));
 }
 
 /*

and I was going to send the patch above as 4/4, but then decided
to delay it, see below.

We can rename do_notify_pidfd() if you wish, and of course the
new wake_up_all_poll() helper you proposed makes sense, but this
is another story.

As for __change_pid(). In this case wake_up_all() is fine, we can
change it to use wake_up_all_poll() too for consistency, but this
is not strictly necessary and in fact "key = 0" makes a bit more
sense imo.

And just in case... previously I said that (perhaps) it can use
__wake_up_pollfree() but no, this would be obviously wrong.

------------------------------------------------------------------
Now let's recall about the PIDFD_THREAD waiters. exit_notify() does
		
	/*
	 * sub-thread or delay_group_leader(), wake up the
	 * PIDFD_THREAD waiters.
	 */
	if (!thread_group_empty(tsk))
		do_notify_pidfd(tsk);

and it would be nice to not wakeup the non-PIDFD_THREAD waiters.

I was thinking about something like the changes below but

	- I am NOT sure it will work! I need to read the code
	  in fs/select.c

	- in fact I am not sure this makes a lot of sense, and
	  the hack in pidfd_poll() doesn't look very nice even
	  _if_ it can work.

--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2081,6 +2081,13 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	struct task_struct *task;
 	__poll_t poll_flags = 0;
 
+	if (thread && pts && pts->_qproc) {
+		// We are not registered yet. Update the key to mark
+		// us a a PIDFD_THREAD waiter, __pollwait() will copy
+		// this ->_key to poll_table_entry->key.
+		if (pts->_key & EPOLLIN) // exclude the POLLHUP-only waiters
+			pts->_key |= EPOLLMSG; // random flag
+	}
 	poll_wait(file, &pid->wait_pidfd, pts);
 	/*
 	 * Depending on PIDFD_THREAD, inform pollers when the thread

Now, do_notify_pidfd() can do

	if (!thread_group_empty(tsk))
		mask = EPOLLMSG; // matches the hack in pidfd_poll
	else
		mask = EPOLLIN | EPOLLRDNORM;

	__wake_up(..., poll_to_key(mask));

Yes, in this case it makes more sense to pass !thread_group_empty()
as a "bool thread" argument.

---------------------------------------------------------------------

What do you think?

I am starting to think that I shouldn't have delayed the 1st trivial
change. Feel free to push it, with or without rename, with or without
the new wake_up_all_poll() helper, I am fine either way. But please
don't add the new "int dead" argument, afaics it makes no sense.

Thanks,

Oleg.
  
Christian Brauner Feb. 3, 2024, 4:46 p.m. UTC | #8
On Sat, Feb 03, 2024 at 01:04:26PM +0100, Oleg Nesterov wrote:
> Christian, I apologize for my terse and unclear emails yesterday,
> I was a bit busy.

Oh don't worry, I didn't take it badly in any way. That was all totally
ok me getting the keys api wrong is not your problem. ;)

> 
> On 02/02, Oleg Nesterov wrote:
> >
> > On 02/02, Christian Brauner wrote:
> > >
> > > > I think we need a simpler patch. I was going to send it as 4/4, but I'd
> > > > like to think more, _perhaps_ we can also discriminate the PIDFD_THREAD
> > > > and non-PIDFD_THREAD waiters. I'll try to make the patch(es) tomorrow or
> > >
> > > Right, I didn't go that far.
> 
> OK, so lets forget about the PIDFD_THREAD waiters for the moment.
> Then everything is trivial, please see below.
> 
> > > > 	1. we can't use wake_up_poll(), it passes nr_exclusive => 1
> > >
> > > Bah. So we need the same stuff we did for io_uring and use
> > > __wake_up() directly. Or we add wake_up_all_poll() and convert the other
> > > three callsites:
> >
> > ...
> >
> > > +#define wake_up_all_poll(x, m)                                                 \
> > > +       __wake_up(x, TASK_NORMAL, 0, poll_to_key(m))
> >
> > Agreed, but I think this + s/wake_up/wake_up_all_poll/ conversions
> > need a separate patch.
> 
> And if it was not clear I like this change! In fact I thought about
> the new helper too, but I didn't realize that it already have the
> users.
> 
> > > -void do_notify_pidfd(struct task_struct *task)
> > > +void pidfd_wake_up_poll(struct task_struct *task, bool dead)
> > >  {
> > > -	struct pid *pid;
> > > -
> > >  	WARN_ON(task->exit_state == 0);
> > > -	pid = task_pid(task);
> > > -	wake_up_all(&pid->wait_pidfd);
> > > +	WARN_ON(mask == 0);
> > > +	wake_up_all_poll(&task_pid(task)->wait_pidfd,
> > > +			 EPOLLIN | EPOLLRDNORM | dead ? EPOLLHUP : 0);
> >
> > No...
> >
> > This is still overcomplicated and is not right.
>                                 ^^^^^^^^^^^^^^^^
> Sorry, sorry, I misread your change, "dead" is always false so it has
> no effect and thus the change is correct.
> 
> But why do we need this arg? All we need is the trivial one-liner:
> 
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2025,7 +2025,8 @@ void do_notify_pidfd(struct task_struct *task)
>  
>  	WARN_ON(task->exit_state == 0);
>  	pid = task_pid(task);
> -	wake_up_all(&pid->wait_pidfd);
> +	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
> +		  poll_to_key(EPOLLIN | EPOLLRDNORM));

Ok, care to just send me a full patch for this?
Just paste it here and I'll pick it.

>  }
>  
>  /*
> 
> and I was going to send the patch above as 4/4, but then decided
> to delay it, see below.
> 
> We can rename do_notify_pidfd() if you wish, and of course the
> new wake_up_all_poll() helper you proposed makes sense, but this
> is another story.

If at all then absolutely a separate patch.

> 
> As for __change_pid(). In this case wake_up_all() is fine, we can
> change it to use wake_up_all_poll() too for consistency, but this
> is not strictly necessary and in fact "key = 0" makes a bit more
> sense imo.

Ok, I see.

> 
> And just in case... previously I said that (perhaps) it can use
> __wake_up_pollfree() but no, this would be obviously wrong.
> 
> ------------------------------------------------------------------
> Now let's recall about the PIDFD_THREAD waiters. exit_notify() does
> 		
> 	/*
> 	 * sub-thread or delay_group_leader(), wake up the
> 	 * PIDFD_THREAD waiters.
> 	 */
> 	if (!thread_group_empty(tsk))
> 		do_notify_pidfd(tsk);
> 
> and it would be nice to not wakeup the non-PIDFD_THREAD waiters.
> 
> I was thinking about something like the changes below but
> 
> 	- I am NOT sure it will work! I need to read the code
> 	  in fs/select.c
> 
> 	- in fact I am not sure this makes a lot of sense, and
> 	  the hack in pidfd_poll() doesn't look very nice even
> 	  _if_ it can work.

First, good idea to make this work. :)
Second, I agree that this looks ugly. ;/

So here's a very likely a stupid idea. To make that clean we essentially
need kernel private information that can't be set in userspace (Btw,
look at EPOLL_URING_WAKE which is similar in that it cannot be set from
userspace. It's not the same thing ofc but it is a private bit.). Which
is the gist of your proposal in a way.

So we would have to grab a new private bit in the epoll flag space. A
subsystem (e.g., pidfd, seccomp) could use that private bit to avoid
spurious wakeups such as this. I'm sure that this would be useful to
others?

It's not something we need to do right now but my point is that this
would be more palatable if this coulde be made generally useful.

The only thing that's ugly is that we'd raise that bit in ->poll() but
maybe that's acceptable through a helper? If we'd wanted to set it from
epoll itself we'd need a new fop most likely. :/

> 
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2081,6 +2081,13 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  	struct task_struct *task;
>  	__poll_t poll_flags = 0;
>  
> +	if (thread && pts && pts->_qproc) {
> +		// We are not registered yet. Update the key to mark
> +		// us a a PIDFD_THREAD waiter, __pollwait() will copy
> +		// this ->_key to poll_table_entry->key.
> +		if (pts->_key & EPOLLIN) // exclude the POLLHUP-only waiters
> +			pts->_key |= EPOLLMSG; // random flag
> +	}
>  	poll_wait(file, &pid->wait_pidfd, pts);
>  	/*
>  	 * Depending on PIDFD_THREAD, inform pollers when the thread
> 
> Now, do_notify_pidfd() can do
> 
> 	if (!thread_group_empty(tsk))
> 		mask = EPOLLMSG; // matches the hack in pidfd_poll
> 	else
> 		mask = EPOLLIN | EPOLLRDNORM;
> 
> 	__wake_up(..., poll_to_key(mask));
> 
> Yes, in this case it makes more sense to pass !thread_group_empty()
> as a "bool thread" argument.
> 
> ---------------------------------------------------------------------
> 
> What do you think?
> 
> I am starting to think that I shouldn't have delayed the 1st trivial
> change. Feel free to push it, with or without rename, with or without
> the new wake_up_all_poll() helper, I am fine either way. But please
> don't add the new "int dead" argument, afaics it makes no sense.

I would never go above your head, don't worry. :)
  
Oleg Nesterov Feb. 5, 2024, 2:08 p.m. UTC | #9
On 02/03, Christian Brauner wrote:
>
> On Sat, Feb 03, 2024 at 01:04:26PM +0100, Oleg Nesterov wrote:
> >
> > -	wake_up_all(&pid->wait_pidfd);
> > +	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
> > +		  poll_to_key(EPOLLIN | EPOLLRDNORM));
>
>
> Ok, care to just send me a full patch for this?

Will do in a minute,



> Second, I agree that this looks ugly. ;/

Agreed ;)

> So here's a very likely a stupid idea. To make that clean we essentially
> need kernel private information that can't be set in userspace (Btw,
> look at EPOLL_URING_WAKE which is similar in that it cannot be set from
> userspace. It's not the same thing ofc but it is a private bit.). Which
> is the gist of your proposal in a way.
>
> So we would have to grab a new private bit in the epoll flag space.

Agreed, but just in case:

	- EPOLLMSG (ab)used by this patch can't "leak" to userspace even
	  if it was (erroneously) set in pollfd.events

	- If EPOLLMSG was set by userspace nothing bad can happen, just
	  poll(non-PIDFD_THREAD-pidfd) will get the spurious wakeups.

So. I am attaching the patch for the record, in case we return to this
later.

It seems to work fine, but when I look into fs/eventpoll.c I suspect
it is not epoll friendly. I _think_ that the neccessary fix is trivial,
ep_item_poll() should just copy pt->_key to epi->event.events after
vfs_poll(), but I am not sure. So lets forget it for now.

Oleg.
---

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 8124d57752b9..7467cdb9735b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -74,7 +74,7 @@ struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
-void do_notify_pidfd(struct task_struct *task);
+void do_notify_pidfd(struct task_struct *task, bool thread);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 493647fd7c07..c31f36d3a1ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -744,7 +744,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 * PIDFD_THREAD waiters.
 	 */
 	if (!thread_group_empty(tsk))
-		do_notify_pidfd(tsk);
+		do_notify_pidfd(tsk, true);
 
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 8d08a2d1b095..3b4474ff6f4a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2081,6 +2081,15 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	struct task_struct *task;
 	__poll_t poll_flags = 0;
 
+	if (pts && pts->_qproc) {
+		/*
+		 * We are not registered yet. Update pts->_key to mark us as
+		 * a non POLLHUP-only PIDFD_THREAD waiter for do_notify_pidfd,
+		 * _pollwait() will copy this _key to poll_table_entry->key.
+		 */
+		if (thread && (pts->_key & (EPOLLIN | EPOLLRDNORM)))
+			pts->_key |= EPOLLMSG;
+	}
 	poll_wait(file, &pid->wait_pidfd, pts);
 	/*
 	 * Depending on PIDFD_THREAD, inform pollers when the thread
diff --git a/kernel/signal.c b/kernel/signal.c
index c3fac06937e2..f51070dec132 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2019,14 +2019,15 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
 	return ret;
 }
 
-void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task, bool thread)
 {
+	/* see the usage of EPOLLMSG in pidfd_poll() */
+	__poll_t m = thread ? EPOLLMSG : EPOLLIN | EPOLLRDNORM;
 	struct pid *pid = task_pid(task);
 
 	WARN_ON(task->exit_state == 0);
 
-	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
-			poll_to_key(EPOLLIN | EPOLLRDNORM));
+	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, poll_to_key(m));
 }
 
 /*
@@ -2056,7 +2057,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	 * non-PIDFD_THREAD waiters.
 	 */
 	if (thread_group_empty(tsk))
-		do_notify_pidfd(tsk);
+		do_notify_pidfd(tsk, false);
 
 	if (sig != SIGCHLD) {
 		/*
  

Patch

diff --git a/kernel/fork.c b/kernel/fork.c
index b8c6ec9a08dd..8d08a2d1b095 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2071,20 +2071,6 @@  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
-static bool pidfd_task_exited(struct pid *pid, bool thread)
-{
-	struct task_struct *task;
-	bool exited;
-
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
-	exited = !task ||
-		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
-	rcu_read_unlock();
-
-	return exited;
-}
-
 /*
  * Poll support for process exit notification.
  */
@@ -2092,6 +2078,7 @@  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct pid *pid = file->private_data;
 	bool thread = file->f_flags & PIDFD_THREAD;
+	struct task_struct *task;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
@@ -2099,8 +2086,13 @@  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	 * Depending on PIDFD_THREAD, inform pollers when the thread
 	 * or the whole thread-group exits.
 	 */
-	if (pidfd_task_exited(pid, thread))
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+	else if (task->exit_state && (thread || thread_group_empty(task)))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
+	rcu_read_unlock();
 
 	return poll_flags;
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index e11144466828..62461c7c82b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -349,6 +349,11 @@  static void __change_pid(struct task_struct *task, enum pid_type type,
 	hlist_del_rcu(&task->pid_links[type]);
 	*pid_ptr = new;
 
+	if (type == PIDTYPE_PID) {
+		WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
+		wake_up_all(&pid->wait_pidfd);
+	}
+
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (pid_has_task(pid, tmp))
 			return;