[RFC,1/1] eventpoll: support busy poll per epoll instance

Message ID 20240120004247.42036-2-jdamato@fastly.com
State New
Headers
Series RFC: Allow busy poll to be set per epoll instance |

Commit Message

Joe Damato Jan. 20, 2024, 12:42 a.m. UTC
  Add F_EPOLL_{S,G}ET_BUSY_POLL_USECS to allow setting a busy poll timeout
per epoll instance so that individual applications can enable (or
disable) epoll based busy poll as needed.

Prior to this change, epoll-based busy poll could only be enabled
system-wide, which limits the usefulness of busy poll.

Signed-off-by: Joe Damato <jdamato@fastly.com>
---
 fs/eventpoll.c                   | 71 ++++++++++++++++++++++++++++++--
 fs/fcntl.c                       |  5 +++
 include/linux/eventpoll.h        |  2 +
 include/uapi/linux/fcntl.h       |  6 +++
 tools/include/uapi/linux/fcntl.h |  6 +++
 tools/perf/trace/beauty/fcntl.c  |  3 +-
 6 files changed, 88 insertions(+), 5 deletions(-)
  

Comments

Christian Brauner Jan. 22, 2024, 3:25 p.m. UTC | #1
On Sat, Jan 20, 2024 at 12:42:47AM +0000, Joe Damato wrote:
> Add F_EPOLL_{S,G}ET_BUSY_POLL_USECS to allow setting a busy poll timeout
> per epoll instance so that individual applications can enable (or
> disable) epoll based busy poll as needed.
> 
> Prior to this change, epoll-based busy poll could only be enabled
> system-wide, which limits the usefulness of busy poll.
> 
> Signed-off-by: Joe Damato <jdamato@fastly.com>
> ---

This should be an ioctl on the epoll fd, not a fcntl(). fcntl()s
should aim to be generic which this isn't. We've recently rejected a
memfd addition as a fcntl() as well for the same reason.
  
Joe Damato Jan. 22, 2024, 9:16 p.m. UTC | #2
On Mon, Jan 22, 2024 at 04:25:01PM +0100, Christian Brauner wrote:
> On Sat, Jan 20, 2024 at 12:42:47AM +0000, Joe Damato wrote:
> > Add F_EPOLL_{S,G}ET_BUSY_POLL_USECS to allow setting a busy poll timeout
> > per epoll instance so that individual applications can enable (or
> > disable) epoll based busy poll as needed.
> > 
> > Prior to this change, epoll-based busy poll could only be enabled
> > system-wide, which limits the usefulness of busy poll.
> > 
> > Signed-off-by: Joe Damato <jdamato@fastly.com>
> > ---
> 
> This should be an ioctl on the epoll fd, not a fcntl(). fcntl()s
> should aim to be generic which this isn't. We've recently rejected a
> memfd addition as a fcntl() as well for the same reason.

OK, thanks for the review. An ioctl makes more sense, I agree.

I'll rewrite it as you've suggested and send another RFC.
  

Patch

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..a8087c2b47ef 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -227,6 +227,8 @@  struct eventpoll {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
+	/* busy poll timeout */
+	u64 busy_poll_usecs;
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -386,12 +388,39 @@  static inline int ep_events_available(struct eventpoll *ep)
 		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
 }
 
+/**
+ * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
+ * from the epoll instance ep is preferred, but if it is not set fallback to
+ * the system-wide global via busy_loop_timeout.
+ */
+static inline bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep)
+{
 #ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
+
+	if (bp_usec) {
+		unsigned long end_time = start_time + bp_usec;
+		unsigned long now = busy_loop_current_time();
+
+		return time_after(now, end_time);
+	} else {
+		return busy_loop_timeout(start_time);
+	}
+#endif
+	return true;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+	return !!ep->busy_poll_usecs ^ net_busy_loop_on();
+}
+
 static bool ep_busy_loop_end(void *p, unsigned long start_time)
 {
 	struct eventpoll *ep = p;
 
-	return ep_events_available(ep) || busy_loop_timeout(start_time);
+	return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
 }
 
 /*
@@ -404,7 +433,7 @@  static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
 
-	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
+	if ((napi_id >= MIN_NAPI_ID) && ep_busy_loop_on(ep)) {
 		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
 			       BUSY_POLL_BUDGET);
 		if (ep_events_available(ep))
@@ -430,7 +459,8 @@  static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	struct socket *sock;
 	struct sock *sk;
 
-	if (!net_busy_loop_on())
+	ep = epi->ep;
+	if (!ep_busy_loop_on(ep))
 		return;
 
 	sock = sock_from_file(epi->ffd.file);
@@ -442,7 +472,6 @@  static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 		return;
 
 	napi_id = READ_ONCE(sk->sk_napi_id);
-	ep = epi->ep;
 
 	/* Non-NAPI IDs can be rejected
 	 *	or
@@ -466,6 +495,10 @@  static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
 }
 
+static inline bool ep_busy_loop_on(struct eventpoll *ep)
+{
+	return false;
+}
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
@@ -933,6 +966,33 @@  static const struct file_operations eventpoll_fops = {
 	.llseek		= noop_llseek,
 };
 
+unsigned long eventpoll_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+	struct eventpoll *ep;
+
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	ep = file->private_data;
+
+	switch (cmd) {
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	case F_EPOLL_SET_BUSY_POLL_USECS:
+		ret = ep->busy_poll_usecs = arg;
+		break;
+	case F_EPOLL_GET_BUSY_POLL_USECS:
+		ret = ep->busy_poll_usecs;
+		break;
+#endif
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 /*
  * This is called from eventpoll_release() to unlink files from the eventpoll
  * interface. We need to have this facility to cleanup correctly files that are
@@ -2058,6 +2118,9 @@  static int do_epoll_create(int flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
+#ifndef CONFIG_NET_RX_BUSY_POLL
+	ep->busy_poll_usecs = 0;
+#endif
 	ep->file = file;
 	fd_install(fd, file);
 	return fd;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..f232e7c2eb9d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -9,6 +9,7 @@ 
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/sched/task.h>
+#include <linux/eventpoll.h>
 #include <linux/fs.h>
 #include <linux/filelock.h>
 #include <linux/file.h>
@@ -419,6 +420,10 @@  static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SET_RW_HINT:
 		err = fcntl_rw_hint(filp, cmd, arg);
 		break;
+	case F_EPOLL_GET_BUSY_POLL_USECS:
+	case F_EPOLL_SET_BUSY_POLL_USECS:
+		err = eventpoll_fcntl(filp, cmd, arg);
+		break;
 	default:
 		break;
 	}
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745d81bd..3e6a49d14f52 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -22,6 +22,8 @@  struct file;
 struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
 #endif
 
+unsigned long eventpoll_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+
 /* Used to release the epoll bits inside the "struct file" */
 void eventpoll_release_file(struct file *file);
 
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 282e90aeb163..522134ab9580 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,12 @@ 
 #define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
 #define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)
 
+/*
+ * Set/Get busy poll usecs for an epoll instance.
+ */
+#define F_EPOLL_GET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 15)
+#define F_EPOLL_SET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 16)
+
 /*
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6c80f96049bd..1937f8b74783 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -56,6 +56,12 @@ 
 #define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
 #define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)
 
+/*
+ * Set/Get busy poll usecs for an epoll instance.
+ */
+#define F_EPOLL_GET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 15)
+#define F_EPOLL_SET_BUSY_POLL_USECS (F_LINUX_SPECIFIC_BASE + 16)
+
 /*
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c
index 56ef83b3d130..dae5647c5c1a 100644
--- a/tools/perf/trace/beauty/fcntl.c
+++ b/tools/perf/trace/beauty/fcntl.c
@@ -94,7 +94,8 @@  size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_ar
 	    cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK ||
 	    cmd == F_GETOWN_EX || cmd == F_SETOWN_EX ||
 	    cmd == F_GET_RW_HINT || cmd == F_SET_RW_HINT ||
-	    cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT)
+	    cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT ||
+	    cmd == F_EPOLL_GET_BUSY_POLL_USECS || cmd == F_EPOLL_SET_BUSY_POLL_USECS)
 		return syscall_arg__scnprintf_hex(bf, size, arg);
 
 	return syscall_arg__scnprintf_long(bf, size, arg);