diff mbox series

[6/6] eventpoll: add support for min-wait

Message ID	20221030220203.31210-7-axboe@kernel.dk
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20; From: Jens Axboe <axboe@kernel.dk> To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org Cc: Jens Axboe <axboe@kernel.dk> Subject: [PATCH 6/6] eventpoll: add support for min-wait Date: Sun, 30 Oct 2022 16:02:03 -0600 Message-Id: <20221030220203.31210-7-axboe@kernel.dk> In-Reply-To: <20221030220203.31210-1-axboe@kernel.dk> References: <20221030220203.31210-1-axboe@kernel.dk> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	Add support for epoll min_wait \| [PATCHSET,v3,0/5] Add support for epoll min_wait [1/6] eventpoll: cleanup branches around sleeping for events [2/6] eventpoll: don't pass in 'timed_out' to ep_busy_loop() [3/6] eventpoll: split out wait handling [4/6] eventpoll: move expires to epoll_wq [5/6] eventpoll: move file checking earlier for epoll_ctl() [6/6] eventpoll: add support for min-wait

Commit Message

Jens Axboe Oct. 30, 2022, 10:02 p.m. UTC

  Rather than just have a timeout value for waiting on events, add
EPOLL_CTL_MIN_WAIT to allow setting a minimum time that epoll_wait()
should always wait for events to arrive.

For medium workload efficiencies, some production workloads inject
artificial timers or sleeps before calling epoll_wait() to get
better batching and higher efficiencies. While this does help, it's
not as efficient as it could be. By adding support for epoll_wait()
for this directly, we can avoids extra context switches and scheduler
and timer overhead.

As an example, running an AB test on an identical workload at about
~370K reqs/second, without this change and with the sleep hack
mentioned above (using 200 usec as the timeout), we're doing 310K-340K
non-voluntary context switches per second. Idle CPU on the host is 27-34%.
With the the sleep hack removed and epoll set to the same 200 usec
value, we're handling the exact same load but at 292K-315k non-voluntary
context switches and idle CPU of 33-41%, a substantial win.

Basic test case:

struct d {
        int p1, p2;
};

static void *fn(void *data)
{
        struct d *d = data;
        char b = 0x89;

	/* Generate 2 events 20 msec apart */
        usleep(10000);
        write(d->p1, &b, sizeof(b));
        usleep(10000);
        write(d->p2, &b, sizeof(b));

        return NULL;
}

int main(int argc, char *argv[])
{
        struct epoll_event ev, events[2];
        pthread_t thread;
        int p1[2], p2[2];
        struct d d;
        int efd, ret;

        efd = epoll_create1(0);
        if (efd < 0) {
                perror("epoll_create");
                return 1;
        }

        if (pipe(p1) < 0) {
                perror("pipe");
                return 1;
        }
        if (pipe(p2) < 0) {
                perror("pipe");
                return 1;
        }

        ev.events = EPOLLIN;
        ev.data.fd = p1[0];
        if (epoll_ctl(efd, EPOLL_CTL_ADD, p1[0], &ev) < 0) {
                perror("epoll add");
                return 1;
        }
        ev.events = EPOLLIN;
        ev.data.fd = p2[0];
        if (epoll_ctl(efd, EPOLL_CTL_ADD, p2[0], &ev) < 0) {
                perror("epoll add");
                return 1;
        }

	/* always wait 200 msec for events */
        ev.data.u64 = 200000;
        if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0) {
                perror("epoll add set timeout");
                return 1;
        }

        d.p1 = p1[1];
        d.p2 = p2[1];
        pthread_create(&thread, NULL, fn, &d);

	/* expect to get 2 events here rather than just 1 */
        ret = epoll_wait(efd, events, 2, -1);
        printf("epoll_wait=%d\n", ret);

        return 0;
}

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/eventpoll.c                 | 97 +++++++++++++++++++++++++++++-----
 include/linux/eventpoll.h      |  2 +-
 include/uapi/linux/eventpoll.h |  1 +
 3 files changed, 85 insertions(+), 15 deletions(-)

Comments

Soheil Hassas Yeganeh Nov. 8, 2022, 10:14 p.m. UTC | #1

On Sun, Oct 30, 2022 at 04:02:03PM -0600, Jens Axboe wrote:
> Rather than just have a timeout value for waiting on events, add
> EPOLL_CTL_MIN_WAIT to allow setting a minimum time that epoll_wait()
> should always wait for events to arrive.
> 
> For medium workload efficiencies, some production workloads inject
> artificial timers or sleeps before calling epoll_wait() to get
> better batching and higher efficiencies. While this does help, it's
> not as efficient as it could be. By adding support for epoll_wait()
> for this directly, we can avoids extra context switches and scheduler
> and timer overhead.
> 
> As an example, running an AB test on an identical workload at about
> ~370K reqs/second, without this change and with the sleep hack
> mentioned above (using 200 usec as the timeout), we're doing 310K-340K
> non-voluntary context switches per second. Idle CPU on the host is 27-34%.
> With the the sleep hack removed and epoll set to the same 200 usec
> value, we're handling the exact same load but at 292K-315k non-voluntary
> context switches and idle CPU of 33-41%, a substantial win.
> 
> Basic test case:
> 
> struct d {
>         int p1, p2;
> };
> 
> static void *fn(void *data)
> {
>         struct d *d = data;
>         char b = 0x89;
> 
> 	/* Generate 2 events 20 msec apart */
>         usleep(10000);
>         write(d->p1, &b, sizeof(b));
>         usleep(10000);
>         write(d->p2, &b, sizeof(b));
> 
>         return NULL;
> }
> 
> int main(int argc, char *argv[])
> {
>         struct epoll_event ev, events[2];
>         pthread_t thread;
>         int p1[2], p2[2];
>         struct d d;
>         int efd, ret;
> 
>         efd = epoll_create1(0);
>         if (efd < 0) {
>                 perror("epoll_create");
>                 return 1;
>         }
> 
>         if (pipe(p1) < 0) {
>                 perror("pipe");
>                 return 1;
>         }
>         if (pipe(p2) < 0) {
>                 perror("pipe");
>                 return 1;
>         }
> 
>         ev.events = EPOLLIN;
>         ev.data.fd = p1[0];
>         if (epoll_ctl(efd, EPOLL_CTL_ADD, p1[0], &ev) < 0) {
>                 perror("epoll add");
>                 return 1;
>         }
>         ev.events = EPOLLIN;
>         ev.data.fd = p2[0];
>         if (epoll_ctl(efd, EPOLL_CTL_ADD, p2[0], &ev) < 0) {
>                 perror("epoll add");
>                 return 1;
>         }
> 
> 	/* always wait 200 msec for events */
>         ev.data.u64 = 200000;
>         if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0) {
>                 perror("epoll add set timeout");
>                 return 1;
>         }
> 
>         d.p1 = p1[1];
>         d.p2 = p2[1];
>         pthread_create(&thread, NULL, fn, &d);
> 
> 	/* expect to get 2 events here rather than just 1 */
>         ret = epoll_wait(efd, events, 2, -1);
>         printf("epoll_wait=%d\n", ret);
> 
>         return 0;
> }

It might be worth adding a note in the commit message stating that
EPOLL_CTL_MIN_WAIT is a no-op when timeout is 0. This is a desired
behavior but it's not easy to see in the flow.

> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/eventpoll.c                 | 97 +++++++++++++++++++++++++++++-----
>  include/linux/eventpoll.h      |  2 +-
>  include/uapi/linux/eventpoll.h |  1 +
>  3 files changed, 85 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 962d897bbfc6..9e00f8780ec5 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -117,6 +117,9 @@ struct eppoll_entry {
>  	/* The "base" pointer is set to the container "struct epitem" */
>  	struct epitem *base;
>  
> +	/* min wait time if (min_wait_ts) & 1 != 0 */
> +	ktime_t min_wait_ts;
> +
>  	/*
>  	 * Wait queue item that will be linked to the target file wait
>  	 * queue head.
> @@ -217,6 +220,9 @@ struct eventpoll {
>  	u64 gen;
>  	struct hlist_head refs;
>  
> +	/* min wait for epoll_wait() */
> +	unsigned int min_wait_ts;
> +
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>  	/* used to track busy poll napi_id */
>  	unsigned int napi_id;
> @@ -1747,6 +1753,32 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
>  	return to;
>  }
>  
> +struct epoll_wq {
> +	wait_queue_entry_t wait;
> +	struct hrtimer timer;
> +	ktime_t timeout_ts;
> +	ktime_t min_wait_ts;
> +	struct eventpoll *ep;
> +	bool timed_out;
> +	int maxevents;
> +	int wakeups;
> +};
> +
> +static bool ep_should_min_wait(struct epoll_wq *ewq)
> +{
> +	if (ewq->min_wait_ts & 1) {
> +		/* just an approximation */
> +		if (++ewq->wakeups >= ewq->maxevents)
> +			goto stop_wait;

Is there a way to short cut the wait if the process is being terminated?

We issues in production systems in the past where too many threads were
in epoll_wait and the process got terminated.  It'd be nice if these
threads could exit the syscall as fast as possible.

> +		if (ktime_before(ktime_get_ns(), ewq->min_wait_ts))
> +			return true;
> +	}
> +
> +stop_wait:
> +	ewq->min_wait_ts &= ~(u64) 1;
> +	return false;
> +}
> +
>  /*
>   * autoremove_wake_function, but remove even on failure to wake up, because we
>   * know that default_wake_function/ttwu will only fail if the thread is already
> @@ -1756,27 +1788,37 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
>  static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
>  				       unsigned int mode, int sync, void *key)
>  {
> -	int ret = default_wake_function(wq_entry, mode, sync, key);
> +	struct epoll_wq *ewq = container_of(wq_entry, struct epoll_wq, wait);
> +	int ret;
> +
> +	/*
> +	 * If min wait time hasn't been satisfied yet, keep waiting
> +	 */
> +	if (ep_should_min_wait(ewq))
> +		return 0;
>  
> +	ret = default_wake_function(wq_entry, mode, sync, key);
>  	list_del_init(&wq_entry->entry);
>  	return ret;
>  }
>  
> -struct epoll_wq {
> -	wait_queue_entry_t wait;
> -	struct hrtimer timer;
> -	ktime_t timeout_ts;
> -	bool timed_out;
> -};
> -
>  static enum hrtimer_restart ep_timer(struct hrtimer *timer)
>  {
>  	struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
>  	struct task_struct *task = ewq->wait.private;
> +	const bool is_min_wait = ewq->min_wait_ts & 1;
> +
> +	if (!is_min_wait || ep_events_available(ewq->ep)) {
> +		if (!is_min_wait)
> +			ewq->timed_out = true;
> +		ewq->min_wait_ts &= ~(u64) 1;
> +		wake_up_process(task);
> +		return HRTIMER_NORESTART;
> +	}
>  
> -	ewq->timed_out = true;
> -	wake_up_process(task);
> -	return HRTIMER_NORESTART;
> +	ewq->min_wait_ts &= ~(u64) 1;
> +	hrtimer_set_expires_range_ns(&ewq->timer, ewq->timeout_ts, 0);
> +	return HRTIMER_RESTART;
>  }
>  
>  static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
> @@ -1831,12 +1873,16 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>  
>  	lockdep_assert_irqs_enabled();
>  
> +	ewq.min_wait_ts = 0;
> +	ewq.ep = ep;
> +	ewq.maxevents = maxevents;
>  	ewq.timed_out = false;
> +	ewq.wakeups = 0;
>  
>  	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
>  		slack = select_estimate_accuracy(timeout);
> +		ewq.timeout_ts = timespec64_to_ktime(*timeout);
>  		to = &ewq.timeout_ts;
> -		*to = timespec64_to_ktime(*timeout);
>  	} else if (timeout) {
>  		/*
>  		 * Avoid the unnecessary trip to the wait queue loop, if the
> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>  		ewq.timed_out = true;
>  	}
>  
> +	/*
> +	 * If min_wait is set for this epoll instance, note the min_wait
> +	 * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
> +	 * the state bit for whether or not min_wait is enabled.
> +	 */
> +	if (ep->min_wait_ts) {

Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
AFAICT, the code we run here is completely wasted if timeout is 0.

> +		ewq.min_wait_ts = ktime_add_us(ktime_get_ns(),
> +						ep->min_wait_ts);
> +		ewq.min_wait_ts |= (u64) 1;
> +		to = &ewq.min_wait_ts;
> +	}
> +
>  	/*
>  	 * This call is racy: We may or may not see events that are being added
>  	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
> @@ -1913,7 +1971,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>  		 * important.
>  		 */
>  		eavail = ep_events_available(ep);
> -		if (!eavail) {
> +		if (!eavail || ewq.min_wait_ts & 1) {
>  			__add_wait_queue_exclusive(&ep->wq, &ewq.wait);
>  			write_unlock_irq(&ep->lock);
>  			ep_schedule(ep, &ewq, to, slack);
> @@ -2125,6 +2183,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
>  	 */
>  	ep = f.file->private_data;
>  
> +	/*
> +	 * Handle EPOLL_CTL_MIN_WAIT upfront as we don't need to care about
> +	 * the fd being passed in.
> +	 */
> +	if (op == EPOLL_CTL_MIN_WAIT) {
> +		/* return old value */
> +		error = ep->min_wait_ts;
> +		ep->min_wait_ts = epds->data;
> +		goto error_fput;
> +	}
> +
>  	/* Get the "struct file *" for the target file */
>  	tf = fdget(fd);
>  	if (!tf.file)
> @@ -2257,7 +2326,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
>  {
>  	struct epoll_event epds;
>  
> -	if (ep_op_has_event(op) &&
> +	if ((ep_op_has_event(op) || op == EPOLL_CTL_MIN_WAIT) &&
>  	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
>  		return -EFAULT;
>  
> diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
> index 3337745d81bd..cbef635cb7e4 100644
> --- a/include/linux/eventpoll.h
> +++ b/include/linux/eventpoll.h
> @@ -59,7 +59,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
>  /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
>  static inline int ep_op_has_event(int op)
>  {
> -	return op != EPOLL_CTL_DEL;
> +	return op != EPOLL_CTL_DEL && op != EPOLL_CTL_MIN_WAIT;
>  }
>  
>  #else
> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
> index 8a3432d0f0dc..81ecb1ca36e0 100644
> --- a/include/uapi/linux/eventpoll.h
> +++ b/include/uapi/linux/eventpoll.h
> @@ -26,6 +26,7 @@
>  #define EPOLL_CTL_ADD 1
>  #define EPOLL_CTL_DEL 2
>  #define EPOLL_CTL_MOD 3
> +#define EPOLL_CTL_MIN_WAIT	4

Have you considered introducing another epoll_pwait sycall variant?

That has a major benefit that min wait can be different per poller,
on the different epollfd.  The usage would also be more readable:

"epoll for X amount of time but don't return sooner than Y."

This would be similar to the approach that willemb@google.com used
when introducing epoll_pwait2.

>  
>  /* Epoll event masks */
>  #define EPOLLIN		(__force __poll_t)0x00000001
> -- 
> 2.35.1
>

Jens Axboe Nov. 8, 2022, 10:20 p.m. UTC | #2

On 11/8/22 3:14 PM, Soheil Hassas Yeganeh wrote:
> On Sun, Oct 30, 2022 at 04:02:03PM -0600, Jens Axboe wrote:
>> Rather than just have a timeout value for waiting on events, add
>> EPOLL_CTL_MIN_WAIT to allow setting a minimum time that epoll_wait()
>> should always wait for events to arrive.
>>
>> For medium workload efficiencies, some production workloads inject
>> artificial timers or sleeps before calling epoll_wait() to get
>> better batching and higher efficiencies. While this does help, it's
>> not as efficient as it could be. By adding support for epoll_wait()
>> for this directly, we can avoids extra context switches and scheduler
>> and timer overhead.
>>
>> As an example, running an AB test on an identical workload at about
>> ~370K reqs/second, without this change and with the sleep hack
>> mentioned above (using 200 usec as the timeout), we're doing 310K-340K
>> non-voluntary context switches per second. Idle CPU on the host is 27-34%.
>> With the the sleep hack removed and epoll set to the same 200 usec
>> value, we're handling the exact same load but at 292K-315k non-voluntary
>> context switches and idle CPU of 33-41%, a substantial win.
>>
>> Basic test case:
>>
>> struct d {
>>         int p1, p2;
>> };
>>
>> static void *fn(void *data)
>> {
>>         struct d *d = data;
>>         char b = 0x89;
>>
>> 	/* Generate 2 events 20 msec apart */
>>         usleep(10000);
>>         write(d->p1, &b, sizeof(b));
>>         usleep(10000);
>>         write(d->p2, &b, sizeof(b));
>>
>>         return NULL;
>> }
>>
>> int main(int argc, char *argv[])
>> {
>>         struct epoll_event ev, events[2];
>>         pthread_t thread;
>>         int p1[2], p2[2];
>>         struct d d;
>>         int efd, ret;
>>
>>         efd = epoll_create1(0);
>>         if (efd < 0) {
>>                 perror("epoll_create");
>>                 return 1;
>>         }
>>
>>         if (pipe(p1) < 0) {
>>                 perror("pipe");
>>                 return 1;
>>         }
>>         if (pipe(p2) < 0) {
>>                 perror("pipe");
>>                 return 1;
>>         }
>>
>>         ev.events = EPOLLIN;
>>         ev.data.fd = p1[0];
>>         if (epoll_ctl(efd, EPOLL_CTL_ADD, p1[0], &ev) < 0) {
>>                 perror("epoll add");
>>                 return 1;
>>         }
>>         ev.events = EPOLLIN;
>>         ev.data.fd = p2[0];
>>         if (epoll_ctl(efd, EPOLL_CTL_ADD, p2[0], &ev) < 0) {
>>                 perror("epoll add");
>>                 return 1;
>>         }
>>
>> 	/* always wait 200 msec for events */
>>         ev.data.u64 = 200000;
>>         if (epoll_ctl(efd, EPOLL_CTL_MIN_WAIT, -1, &ev) < 0) {
>>                 perror("epoll add set timeout");
>>                 return 1;
>>         }
>>
>>         d.p1 = p1[1];
>>         d.p2 = p2[1];
>>         pthread_create(&thread, NULL, fn, &d);
>>
>> 	/* expect to get 2 events here rather than just 1 */
>>         ret = epoll_wait(efd, events, 2, -1);
>>         printf("epoll_wait=%d\n", ret);
>>
>>         return 0;
>> }
> 
> It might be worth adding a note in the commit message stating that
> EPOLL_CTL_MIN_WAIT is a no-op when timeout is 0. This is a desired
> behavior but it's not easy to see in the flow.

True, will do.

>> +struct epoll_wq {
>> +	wait_queue_entry_t wait;
>> +	struct hrtimer timer;
>> +	ktime_t timeout_ts;
>> +	ktime_t min_wait_ts;
>> +	struct eventpoll *ep;
>> +	bool timed_out;
>> +	int maxevents;
>> +	int wakeups;
>> +};
>> +
>> +static bool ep_should_min_wait(struct epoll_wq *ewq)
>> +{
>> +	if (ewq->min_wait_ts & 1) {
>> +		/* just an approximation */
>> +		if (++ewq->wakeups >= ewq->maxevents)
>> +			goto stop_wait;
> 
> Is there a way to short cut the wait if the process is being terminated?
> 
> We issues in production systems in the past where too many threads were
> in epoll_wait and the process got terminated.  It'd be nice if these
> threads could exit the syscall as fast as possible.

Good point, it'd be a bit racy though as this is called from the waitq
callback and hence not in the task itself. But probably Good Enough for
most use cases?

This should probably be a separate patch though, as it seems this
affects regular waits too without min_wait set?

>> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>>  		ewq.timed_out = true;
>>  	}
>>  
>> +	/*
>> +	 * If min_wait is set for this epoll instance, note the min_wait
>> +	 * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
>> +	 * the state bit for whether or not min_wait is enabled.
>> +	 */
>> +	if (ep->min_wait_ts) {
> 
> Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
> AFAICT, the code we run here is completely wasted if timeout is 0.

Yep certainly, I can gate it on both of those conditions.

>> diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
>> index 8a3432d0f0dc..81ecb1ca36e0 100644
>> --- a/include/uapi/linux/eventpoll.h
>> +++ b/include/uapi/linux/eventpoll.h
>> @@ -26,6 +26,7 @@
>>  #define EPOLL_CTL_ADD 1
>>  #define EPOLL_CTL_DEL 2
>>  #define EPOLL_CTL_MOD 3
>> +#define EPOLL_CTL_MIN_WAIT	4
> 
> Have you considered introducing another epoll_pwait sycall variant?
> 
> That has a major benefit that min wait can be different per poller,
> on the different epollfd.  The usage would also be more readable:
> 
> "epoll for X amount of time but don't return sooner than Y."
> 
> This would be similar to the approach that willemb@google.com used
> when introducing epoll_pwait2.

I have, see other replies in this thread, notably the ones with Stefan
today. Happy to do that, and my current branch does split out the ctl
addition from the meat of the min_wait support for this reason. Can't
seem to find a great way to do it, as we'd need to move to a struct
argument for this as epoll_pwait2() is already at max arguments for a
syscall. Suggestions more than welcome.

Willem de Bruijn Nov. 8, 2022, 10:25 p.m. UTC | #3

> > This would be similar to the approach that willemb@google.com used
> > when introducing epoll_pwait2.
>
> I have, see other replies in this thread, notably the ones with Stefan
> today. Happy to do that, and my current branch does split out the ctl
> addition from the meat of the min_wait support for this reason. Can't
> seem to find a great way to do it, as we'd need to move to a struct
> argument for this as epoll_pwait2() is already at max arguments for a
> syscall. Suggestions more than welcome.

Expect an array of two timespecs as fourth argument?

Jens Axboe Nov. 8, 2022, 10:29 p.m. UTC | #4

On 11/8/22 3:25 PM, Willem de Bruijn wrote:
>>> This would be similar to the approach that willemb@google.com used
>>> when introducing epoll_pwait2.
>>
>> I have, see other replies in this thread, notably the ones with Stefan
>> today. Happy to do that, and my current branch does split out the ctl
>> addition from the meat of the min_wait support for this reason. Can't
>> seem to find a great way to do it, as we'd need to move to a struct
>> argument for this as epoll_pwait2() is already at max arguments for a
>> syscall. Suggestions more than welcome.
> 
> Expect an array of two timespecs as fourth argument?

Unfortunately even epoll_pwait2() doesn't have any kind of flags
argument to be able to do tricks like that... But I guess we could do
that with epoll_pwait3(), but it'd be an extra indirection for the copy
at that point (copy array of pointers, copy pointer if not NULL), which
would be unfortunate. I'd hate to have to argue that API to anyone, let
alone Linus, when pushing the series.

Soheil Hassas Yeganeh Nov. 8, 2022, 10:41 p.m. UTC | #5

On Tue, Nov 8, 2022 at 5:20 PM Jens Axboe <axboe@kernel.dk> wrote:
> > Is there a way to short cut the wait if the process is being terminated?
> >
> > We issues in production systems in the past where too many threads were
> > in epoll_wait and the process got terminated.  It'd be nice if these
> > threads could exit the syscall as fast as possible.
>
> Good point, it'd be a bit racy though as this is called from the waitq
> callback and hence not in the task itself. But probably Good Enough for
> most use cases?

Sounds good. We can definitely do that as a follow up later.

> This should probably be a separate patch though, as it seems this
> affects regular waits too without min_wait set?
>
> >> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> >>              ewq.timed_out = true;
> >>      }
> >>
> >> +    /*
> >> +     * If min_wait is set for this epoll instance, note the min_wait
> >> +     * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
> >> +     * the state bit for whether or not min_wait is enabled.
> >> +     */
> >> +    if (ep->min_wait_ts) {
> >
> > Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
> > AFAICT, the code we run here is completely wasted if timeout is 0.
>
> Yep certainly, I can gate it on both of those conditions.

Thanks. I think that would help. You might also want to restructure the if/else
condition above but it's your call.

On Tue, Nov 8, 2022 at 5:29 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 11/8/22 3:25 PM, Willem de Bruijn wrote:
> >>> This would be similar to the approach that willemb@google.com used
> >>> when introducing epoll_pwait2.
> >>
> >> I have, see other replies in this thread, notably the ones with Stefan
> >> today. Happy to do that, and my current branch does split out the ctl
> >> addition from the meat of the min_wait support for this reason. Can't
> >> seem to find a great way to do it, as we'd need to move to a struct
> >> argument for this as epoll_pwait2() is already at max arguments for a
> >> syscall. Suggestions more than welcome.
> >
> > Expect an array of two timespecs as fourth argument?
>
> Unfortunately even epoll_pwait2() doesn't have any kind of flags
> argument to be able to do tricks like that... But I guess we could do
> that with epoll_pwait3(), but it'd be an extra indirection for the copy
> at that point (copy array of pointers, copy pointer if not NULL), which
> would be unfortunate. I'd hate to have to argue that API to anyone, let
> alone Linus, when pushing the series.

I personally like what Willem suggested. It feels more natural to me
and as you suggested previously it can be a struct argument.

The overheads would be similar to any syscall that accepts itimerspec.

I understand your concern on "epoll_pwait3".  I wish Linus would weigh
in here. :-)

Willem de Bruijn Nov. 8, 2022, 10:44 p.m. UTC | #6

On Tue, Nov 8, 2022 at 5:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 11/8/22 3:25 PM, Willem de Bruijn wrote:
> >>> This would be similar to the approach that willemb@google.com used
> >>> when introducing epoll_pwait2.
> >>
> >> I have, see other replies in this thread, notably the ones with Stefan
> >> today. Happy to do that, and my current branch does split out the ctl
> >> addition from the meat of the min_wait support for this reason. Can't
> >> seem to find a great way to do it, as we'd need to move to a struct
> >> argument for this as epoll_pwait2() is already at max arguments for a
> >> syscall. Suggestions more than welcome.
> >
> > Expect an array of two timespecs as fourth argument?
>
> Unfortunately even epoll_pwait2() doesn't have any kind of flags
> argument to be able to do tricks like that... But I guess we could do
> that with epoll_pwait3(), but it'd be an extra indirection for the copy
> at that point (copy array of pointers, copy pointer if not NULL), which
> would be unfortunate. I'd hate to have to argue that API to anyone, let
> alone Linus, when pushing the series.

I did mean for a new syscall epoll_pwait3. But not an array of
pointers, an array of structs. The second arg is then mandatory for
this epoll_pwait_minwait variant of the syscall.

It would indeed have been nicer to be able to do this in epoll_pwait2
based on a flag. It's just doubling the size in copy_from_user in
get_timespec64.

Btw, when I added epoll_pwait2, there was a reasonable request to
also update the manpages and add a basic test to
tools/testing/selftests/filesystems/epoll. That is some extra work with
a syscall based approach.

Jens Axboe Dec. 1, 2022, 6 p.m. UTC | #7

>>> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>>>  		ewq.timed_out = true;
>>>  	}
>>>  
>>> +	/*
>>> +	 * If min_wait is set for this epoll instance, note the min_wait
>>> +	 * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
>>> +	 * the state bit for whether or not min_wait is enabled.
>>> +	 */
>>> +	if (ep->min_wait_ts) {
>>
>> Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
>> AFAICT, the code we run here is completely wasted if timeout is 0.
> 
> Yep certainly, I can gate it on both of those conditions.
Looking at this for a respin, I think it should be gated on
!ewq.timed_out? timed_out == true is the path that it's wasted on
anyway.

Soheil Hassas Yeganeh Dec. 1, 2022, 6:39 p.m. UTC | #8

On Thu, Dec 1, 2022 at 1:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> >>> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> >>>             ewq.timed_out = true;
> >>>     }
> >>>
> >>> +   /*
> >>> +    * If min_wait is set for this epoll instance, note the min_wait
> >>> +    * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
> >>> +    * the state bit for whether or not min_wait is enabled.
> >>> +    */
> >>> +   if (ep->min_wait_ts) {
> >>
> >> Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
> >> AFAICT, the code we run here is completely wasted if timeout is 0.
> >
> > Yep certainly, I can gate it on both of those conditions.
> Looking at this for a respin, I think it should be gated on
> !ewq.timed_out? timed_out == true is the path that it's wasted on
> anyway.

Ah, yes, that's a good point. The check should be !ewq.timed_out.

Thanks,
Soheil

> --
> Jens Axboe
>

Jens Axboe Dec. 1, 2022, 6:41 p.m. UTC | #9

On 12/1/22 11:39 AM, Soheil Hassas Yeganeh wrote:
> On Thu, Dec 1, 2022 at 1:00 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>>>>> @@ -1845,6 +1891,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
>>>>>             ewq.timed_out = true;
>>>>>     }
>>>>>
>>>>> +   /*
>>>>> +    * If min_wait is set for this epoll instance, note the min_wait
>>>>> +    * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
>>>>> +    * the state bit for whether or not min_wait is enabled.
>>>>> +    */
>>>>> +   if (ep->min_wait_ts) {
>>>>
>>>> Can we limit this block to "ewq.timed_out && ep->min_wait_ts"?
>>>> AFAICT, the code we run here is completely wasted if timeout is 0.
>>>
>>> Yep certainly, I can gate it on both of those conditions.
>> Looking at this for a respin, I think it should be gated on
>> !ewq.timed_out? timed_out == true is the path that it's wasted on
>> anyway.
> 
> Ah, yes, that's a good point. The check should be !ewq.timed_out.

The just posted v4 has the check (and the right one :-))

diff mbox series

Patch

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 962d897bbfc6..9e00f8780ec5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -117,6 +117,9 @@  struct eppoll_entry {
 	/* The "base" pointer is set to the container "struct epitem" */
 	struct epitem *base;
 
+	/* min wait time if (min_wait_ts) & 1 != 0 */
+	ktime_t min_wait_ts;
+
 	/*
 	 * Wait queue item that will be linked to the target file wait
 	 * queue head.
@@ -217,6 +220,9 @@  struct eventpoll {
 	u64 gen;
 	struct hlist_head refs;
 
+	/* min wait for epoll_wait() */
+	unsigned int min_wait_ts;
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
@@ -1747,6 +1753,32 @@  static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 	return to;
 }
 
+struct epoll_wq {
+	wait_queue_entry_t wait;
+	struct hrtimer timer;
+	ktime_t timeout_ts;
+	ktime_t min_wait_ts;
+	struct eventpoll *ep;
+	bool timed_out;
+	int maxevents;
+	int wakeups;
+};
+
+static bool ep_should_min_wait(struct epoll_wq *ewq)
+{
+	if (ewq->min_wait_ts & 1) {
+		/* just an approximation */
+		if (++ewq->wakeups >= ewq->maxevents)
+			goto stop_wait;
+		if (ktime_before(ktime_get_ns(), ewq->min_wait_ts))
+			return true;
+	}
+
+stop_wait:
+	ewq->min_wait_ts &= ~(u64) 1;
+	return false;
+}
+
 /*
  * autoremove_wake_function, but remove even on failure to wake up, because we
  * know that default_wake_function/ttwu will only fail if the thread is already
@@ -1756,27 +1788,37 @@  static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
 static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
 				       unsigned int mode, int sync, void *key)
 {
-	int ret = default_wake_function(wq_entry, mode, sync, key);
+	struct epoll_wq *ewq = container_of(wq_entry, struct epoll_wq, wait);
+	int ret;
+
+	/*
+	 * If min wait time hasn't been satisfied yet, keep waiting
+	 */
+	if (ep_should_min_wait(ewq))
+		return 0;
 
+	ret = default_wake_function(wq_entry, mode, sync, key);
 	list_del_init(&wq_entry->entry);
 	return ret;
 }
 
-struct epoll_wq {
-	wait_queue_entry_t wait;
-	struct hrtimer timer;
-	ktime_t timeout_ts;
-	bool timed_out;
-};
-
 static enum hrtimer_restart ep_timer(struct hrtimer *timer)
 {
 	struct epoll_wq *ewq = container_of(timer, struct epoll_wq, timer);
 	struct task_struct *task = ewq->wait.private;
+	const bool is_min_wait = ewq->min_wait_ts & 1;
+
+	if (!is_min_wait || ep_events_available(ewq->ep)) {
+		if (!is_min_wait)
+			ewq->timed_out = true;
+		ewq->min_wait_ts &= ~(u64) 1;
+		wake_up_process(task);
+		return HRTIMER_NORESTART;
+	}
 
-	ewq->timed_out = true;
-	wake_up_process(task);
-	return HRTIMER_NORESTART;
+	ewq->min_wait_ts &= ~(u64) 1;
+	hrtimer_set_expires_range_ns(&ewq->timer, ewq->timeout_ts, 0);
+	return HRTIMER_RESTART;
 }
 
 static void ep_schedule(struct eventpoll *ep, struct epoll_wq *ewq, ktime_t *to,
@@ -1831,12 +1873,16 @@  static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 
 	lockdep_assert_irqs_enabled();
 
+	ewq.min_wait_ts = 0;
+	ewq.ep = ep;
+	ewq.maxevents = maxevents;
 	ewq.timed_out = false;
+	ewq.wakeups = 0;
 
 	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
 		slack = select_estimate_accuracy(timeout);
+		ewq.timeout_ts = timespec64_to_ktime(*timeout);
 		to = &ewq.timeout_ts;
-		*to = timespec64_to_ktime(*timeout);
 	} else if (timeout) {
 		/*
 		 * Avoid the unnecessary trip to the wait queue loop, if the
@@ -1845,6 +1891,18 @@  static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		ewq.timed_out = true;
 	}
 
+	/*
+	 * If min_wait is set for this epoll instance, note the min_wait
+	 * time. Ensure the lowest bit is set in ewq.min_wait_ts, that's
+	 * the state bit for whether or not min_wait is enabled.
+	 */
+	if (ep->min_wait_ts) {
+		ewq.min_wait_ts = ktime_add_us(ktime_get_ns(),
+						ep->min_wait_ts);
+		ewq.min_wait_ts |= (u64) 1;
+		to = &ewq.min_wait_ts;
+	}
+
 	/*
 	 * This call is racy: We may or may not see events that are being added
 	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
@@ -1913,7 +1971,7 @@  static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * important.
 		 */
 		eavail = ep_events_available(ep);
-		if (!eavail) {
+		if (!eavail || ewq.min_wait_ts & 1) {
 			__add_wait_queue_exclusive(&ep->wq, &ewq.wait);
 			write_unlock_irq(&ep->lock);
 			ep_schedule(ep, &ewq, to, slack);
@@ -2125,6 +2183,17 @@  int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 */
 	ep = f.file->private_data;
 
+	/*
+	 * Handle EPOLL_CTL_MIN_WAIT upfront as we don't need to care about
+	 * the fd being passed in.
+	 */
+	if (op == EPOLL_CTL_MIN_WAIT) {
+		/* return old value */
+		error = ep->min_wait_ts;
+		ep->min_wait_ts = epds->data;
+		goto error_fput;
+	}
+
 	/* Get the "struct file *" for the target file */
 	tf = fdget(fd);
 	if (!tf.file)
@@ -2257,7 +2326,7 @@  SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 {
 	struct epoll_event epds;
 
-	if (ep_op_has_event(op) &&
+	if ((ep_op_has_event(op) || op == EPOLL_CTL_MIN_WAIT) &&
 	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
 		return -EFAULT;
 
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..cbef635cb7e4 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -59,7 +59,7 @@  int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-	return op != EPOLL_CTL_DEL;
+	return op != EPOLL_CTL_DEL && op != EPOLL_CTL_MIN_WAIT;
 }
 
 #else
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 8a3432d0f0dc..81ecb1ca36e0 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -26,6 +26,7 @@ 
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_MIN_WAIT	4
 
 /* Epoll event masks */
 #define EPOLLIN		(__force __poll_t)0x00000001