[RFC] io_uring: Add io_uring_setup flag to pre-register ring fd and never install it
Commit Message
With IORING_REGISTER_USE_REGISTERED_RING, an application can register
the ring fd and use it via registered index rather than installed fd.
This allows using a registered ring for everything *except* the initial
mmap.
With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the
user, rather than requiring a subsequent mmap.
The combination of the two allows a user to operate *entirely* via a
registered ring fd, making it unnecessary to ever install the fd in the
first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make
io_uring_setup register the fd and return a registered index, without
installing the fd.
This allows an application to avoid touching the fd table at all, and
allows a library to never even momentarily install a file descriptor.
This splits out an io_ring_add_registered_file helper from
io_ring_add_registered_fd, for use by io_uring_setup.
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
---
Does this seem like a reasonable approach?
include/uapi/linux/io_uring.h | 7 +++++++
io_uring/io_uring.c | 37 +++++++++++++++++++++--------------
io_uring/io_uring.h | 3 +++
io_uring/tctx.c | 31 ++++++++++++++++++-----------
4 files changed, 52 insertions(+), 26 deletions(-)
Comments
On Sat, 29 Apr 2023 01:40:30 +0900, Josh Triplett wrote:
> With IORING_REGISTER_USE_REGISTERED_RING, an application can register
> the ring fd and use it via registered index rather than installed fd.
> This allows using a registered ring for everything *except* the initial
> mmap.
>
> With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the
> user, rather than requiring a subsequent mmap.
>
> [...]
Applied, thanks!
[1/1] io_uring: Add io_uring_setup flag to pre-register ring fd and never install it
commit: 6e76ac595855db27bbdaef337173294a6fd6eb2c
Best regards,
@@ -178,6 +178,13 @@ enum {
*/
#define IORING_SETUP_NO_MMAP (1U << 14)
+/*
+ * Register the ring fd in itself for use with
+ * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
+ * than an fd.
+ */
+#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -3809,19 +3809,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return 0;
}
-static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_install_fd(struct file *file)
{
- int ret, fd;
+ int fd;
fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (fd < 0)
return fd;
-
- ret = __io_uring_add_tctx_node(ctx);
- if (ret) {
- put_unused_fd(fd);
- return ret;
- }
fd_install(fd, file);
return fd;
}
@@ -3861,6 +3855,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
+ struct io_uring_task *tctx;
struct file *file;
int ret;
@@ -3872,6 +3867,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
entries = IORING_MAX_ENTRIES;
}
+ if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+ && !(p->flags & IORING_SETUP_NO_MMAP))
+ return -EINVAL;
+
/*
* Use twice as many entries for the CQ ring. It's possible for the
* application to drive a higher depth than the size of the SQ ring,
@@ -4028,22 +4027,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
goto err;
}
+ ret = __io_uring_add_tctx_node(ctx);
+ if (ret)
+ goto err_fput;
+ tctx = current->io_uring;
+
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
*/
- ret = io_uring_install_fd(ctx, file);
- if (ret < 0) {
- /* fput will clean it up */
- fput(file);
- return ret;
- }
+ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+ ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
+ else
+ ret = io_uring_install_fd(file);
+ if (ret < 0)
+ goto err_fput;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
io_ring_ctx_wait_and_kill(ctx);
return ret;
+err_fput:
+ fput(file);
+ return ret;
}
/*
@@ -4070,7 +4077,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
- IORING_SETUP_NO_MMAP))
+ IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
+int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
+ int start, int end);
+
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
@@ -208,31 +208,40 @@ void io_uring_unreg_ringfd(void)
}
}
-static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end)
{
- struct file *file;
int offset;
-
for (offset = start; offset < end; offset++) {
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[offset])
continue;
- file = fget(fd);
- if (!file) {
- return -EBADF;
- } else if (!io_is_uring_fops(file)) {
- fput(file);
- return -EOPNOTSUPP;
- }
tctx->registered_rings[offset] = file;
return offset;
}
-
return -EBUSY;
}
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (!io_is_uring_fops(file)) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ offset = io_ring_add_registered_file(tctx, file, start, end);
+ if (offset < 0)
+ fput(file);
+ return offset;
+}
+
/*
* Register a ring fd to avoid fdget/fdput for each io_uring_enter()
* invocation. User passes in an array of struct io_uring_rsrc_update