On Thu, Sep 21, 2023 at 12:29:04PM -0600, Jens Axboe wrote:
> +int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
> + u32 flags;
> +
> + if (unlikely(sqe->fd || sqe->len || sqe->buf_index || sqe->file_index))
> + return -EINVAL;
> +
> + iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
> + iof->futex_val = READ_ONCE(sqe->addr2);
> + iof->futex_mask = READ_ONCE(sqe->addr3);
> + flags = READ_ONCE(sqe->futex_flags);
> +
> + if (flags & ~FUTEX2_VALID_MASK)
> + return -EINVAL;
> +
> + iof->futex_flags = futex2_to_flags(flags);
So prep does the flags conversion..
> + if (!futex_flags_valid(iof->futex_flags))
> + return -EINVAL;
> +
> + if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
> + !futex_validate_input(iof->futex_flags, iof->futex_mask))
> + return -EINVAL;
> +
> + return 0;
> +}
> +int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
> +{
> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
> + struct io_ring_ctx *ctx = req->ctx;
> + struct io_futex_data *ifd = NULL;
> + struct futex_hash_bucket *hb;
> + int ret;
> +
> + if (!iof->futex_mask) {
> + ret = -EINVAL;
> + goto done;
> + }
> +
> + io_ring_submit_lock(ctx, issue_flags);
> + ifd = io_alloc_ifd(ctx);
> + if (!ifd) {
> + ret = -ENOMEM;
> + goto done_unlock;
> + }
> +
> + req->async_data = ifd;
> + ifd->q = futex_q_init;
> + ifd->q.bitset = iof->futex_mask;
> + ifd->q.wake = io_futex_wake_fn;
> + ifd->req = req;
> +
> + ret = futex_wait_setup(iof->uaddr, iof->futex_val,
> + futex2_to_flags(iof->futex_flags), &ifd->q, &hb);
But then wait and..
> + if (!ret) {
> + hlist_add_head(&req->hash_node, &ctx->futex_list);
> + io_ring_submit_unlock(ctx, issue_flags);
> +
> + futex_queue(&ifd->q, hb);
> + return IOU_ISSUE_SKIP_COMPLETE;
> + }
> +
> +done_unlock:
> + io_ring_submit_unlock(ctx, issue_flags);
> +done:
> + if (ret < 0)
> + req_set_fail(req);
> + io_req_set_res(req, ret, 0);
> + kfree(ifd);
> + return IOU_OK;
> +}
> +
> +int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
> +{
> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
> + int ret;
> +
> + ret = futex_wake(iof->uaddr, futex2_to_flags(iof->futex_flags),
... wake do it both again?
Also, I think we want wake to have wake do:
'FLAGS_STRICT | iof->futex_flags'
See 43adf8449510 ("futex: FLAGS_STRICT"), I'm thinking that waking 0
futexes should honour that request by waking 0, not 1 :-)
> + iof->futex_val, iof->futex_mask);
> + if (ret < 0)
> + req_set_fail(req);
> + io_req_set_res(req, ret, 0);
> + return IOU_OK;
> +}
On 9/27/23 3:05 AM, Peter Zijlstra wrote:
> On Thu, Sep 21, 2023 at 12:29:04PM -0600, Jens Axboe wrote:
>
>> +int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
>> + u32 flags;
>> +
>> + if (unlikely(sqe->fd || sqe->len || sqe->buf_index || sqe->file_index))
>> + return -EINVAL;
>> +
>> + iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> + iof->futex_val = READ_ONCE(sqe->addr2);
>> + iof->futex_mask = READ_ONCE(sqe->addr3);
>> + flags = READ_ONCE(sqe->futex_flags);
>> +
>> + if (flags & ~FUTEX2_VALID_MASK)
>> + return -EINVAL;
>> +
>> + iof->futex_flags = futex2_to_flags(flags);
>
> So prep does the flags conversion..
>
>> + if (!futex_flags_valid(iof->futex_flags))
>> + return -EINVAL;
>> +
>> + if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
>> + !futex_validate_input(iof->futex_flags, iof->futex_mask))
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>
>> +int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
>> + struct io_ring_ctx *ctx = req->ctx;
>> + struct io_futex_data *ifd = NULL;
>> + struct futex_hash_bucket *hb;
>> + int ret;
>> +
>> + if (!iof->futex_mask) {
>> + ret = -EINVAL;
>> + goto done;
>> + }
>> +
>> + io_ring_submit_lock(ctx, issue_flags);
>> + ifd = io_alloc_ifd(ctx);
>> + if (!ifd) {
>> + ret = -ENOMEM;
>> + goto done_unlock;
>> + }
>> +
>> + req->async_data = ifd;
>> + ifd->q = futex_q_init;
>> + ifd->q.bitset = iof->futex_mask;
>> + ifd->q.wake = io_futex_wake_fn;
>> + ifd->req = req;
>> +
>> + ret = futex_wait_setup(iof->uaddr, iof->futex_val,
>> + futex2_to_flags(iof->futex_flags), &ifd->q, &hb);
>
> But then wait and..
>
>> + if (!ret) {
>> + hlist_add_head(&req->hash_node, &ctx->futex_list);
>> + io_ring_submit_unlock(ctx, issue_flags);
>> +
>> + futex_queue(&ifd->q, hb);
>> + return IOU_ISSUE_SKIP_COMPLETE;
>> + }
>> +
>> +done_unlock:
>> + io_ring_submit_unlock(ctx, issue_flags);
>> +done:
>> + if (ret < 0)
>> + req_set_fail(req);
>> + io_req_set_res(req, ret, 0);
>> + kfree(ifd);
>> + return IOU_OK;
>> +}
>> +
>> +int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> + struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
>> + int ret;
>> +
>> + ret = futex_wake(iof->uaddr, futex2_to_flags(iof->futex_flags),
>
> ... wake do it both again?
Oops good catch, yes just the prep side should do it of course. I'll fix
that up.
> Also, I think we want wake to have wake do:
>
> 'FLAGS_STRICT | iof->futex_flags'
>
> See 43adf8449510 ("futex: FLAGS_STRICT"), I'm thinking that waking 0
> futexes should honour that request by waking 0, not 1 :-)
Thanks for the pointer, yeah agree that sounds sane. Most syscalls that
take an number/size that is zero will indeed return zero. I'll add a
test case for that too.
@@ -315,6 +315,11 @@ struct io_ring_ctx {
struct hlist_head waitid_list;
+#ifdef CONFIG_FUTEX
+ struct hlist_head futex_list;
+ struct io_alloc_cache futex_cache;
+#endif
+
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
@@ -66,6 +66,7 @@ struct io_uring_sqe {
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
__u32 waitid_flags;
+ __u32 futex_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -243,6 +244,8 @@ enum io_uring_op {
IORING_OP_SENDMSG_ZC,
IORING_OP_READ_MULTISHOT,
IORING_OP_WAITID,
+ IORING_OP_FUTEX_WAIT,
+ IORING_OP_FUTEX_WAKE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -10,3 +10,4 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o
+obj-$(CONFIG_FUTEX) += futex.o
@@ -16,6 +16,7 @@
#include "poll.h"
#include "timeout.h"
#include "waitid.h"
+#include "futex.h"
#include "cancel.h"
struct io_cancel {
@@ -124,6 +125,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
+ ret = io_futex_cancel(ctx, cd, issue_flags);
+ if (ret != -ENOENT)
+ return ret;
+
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#ifndef IORING_CANCEL_H
+#define IORING_CANCEL_H
#include <linux/io_uring_types.h>
@@ -22,3 +24,5 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
+
+#endif
new file mode 100644
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "../kernel/futex/futex.h"
+#include "io_uring.h"
+#include "rsrc.h"
+#include "futex.h"
+
+struct io_futex {
+ struct file *file;
+ u32 __user *uaddr;
+ unsigned long futex_val;
+ unsigned long futex_mask;
+ u32 futex_flags;
+};
+
+struct io_futex_data {
+ union {
+ struct futex_q q;
+ struct io_cache_entry cache;
+ };
+ struct io_kiocb *req;
+};
+
+void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+ io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
+ sizeof(struct io_futex_data));
+}
+
+static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+{
+ kfree(container_of(entry, struct io_futex_data, cache));
+}
+
+void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+ io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+}
+
+static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_futex_data *ifd = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_tw_lock(ctx, ts);
+ if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+ kfree(ifd);
+ req->async_data = NULL;
+ hlist_del_init(&req->hash_node);
+ io_req_task_complete(req, ts);
+}
+
+static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct io_futex_data *ifd = req->async_data;
+
+ /* futex wake already done or in progress */
+ if (!futex_unqueue(&ifd->q))
+ return false;
+
+ hlist_del_init(&req->hash_node);
+ io_req_set_res(req, -ECANCELED, 0);
+ req->io_task_work.func = io_futex_complete;
+ io_req_task_work_add(req);
+ return true;
+}
+
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ int nr = 0;
+
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+ return -ENOENT;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+ if (req->cqe.user_data != cd->data &&
+ !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+ continue;
+ if (__io_futex_cancel(ctx, req))
+ nr++;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+ break;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (nr)
+ return nr;
+
+ return -ENOENT;
+}
+
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool found = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
+ if (!io_match_task_safe(req, task, cancel_all))
+ continue;
+ __io_futex_cancel(ctx, req);
+ found = true;
+ }
+
+ return found;
+}
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ u32 flags;
+
+ if (unlikely(sqe->fd || sqe->len || sqe->buf_index || sqe->file_index))
+ return -EINVAL;
+
+ iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ iof->futex_val = READ_ONCE(sqe->addr2);
+ iof->futex_mask = READ_ONCE(sqe->addr3);
+ flags = READ_ONCE(sqe->futex_flags);
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ iof->futex_flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(iof->futex_flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
+ !futex_validate_input(iof->futex_flags, iof->futex_mask))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct io_futex_data *ifd = container_of(q, struct io_futex_data, q);
+ struct io_kiocb *req = ifd->req;
+
+ if (unlikely(!__futex_wake_mark(q)))
+ return;
+
+ io_req_set_res(req, 0, 0);
+ req->io_task_work.func = io_futex_complete;
+ io_req_task_work_add(req);
+}
+
+static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
+{
+ struct io_cache_entry *entry;
+
+ entry = io_alloc_cache_get(&ctx->futex_cache);
+ if (entry)
+ return container_of(entry, struct io_futex_data, cache);
+
+ return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
+}
+
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_futex_data *ifd = NULL;
+ struct futex_hash_bucket *hb;
+ int ret;
+
+ if (!iof->futex_mask) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ io_ring_submit_lock(ctx, issue_flags);
+ ifd = io_alloc_ifd(ctx);
+ if (!ifd) {
+ ret = -ENOMEM;
+ goto done_unlock;
+ }
+
+ req->async_data = ifd;
+ ifd->q = futex_q_init;
+ ifd->q.bitset = iof->futex_mask;
+ ifd->q.wake = io_futex_wake_fn;
+ ifd->req = req;
+
+ ret = futex_wait_setup(iof->uaddr, iof->futex_val,
+ futex2_to_flags(iof->futex_flags), &ifd->q, &hb);
+ if (!ret) {
+ hlist_add_head(&req->hash_node, &ctx->futex_list);
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ futex_queue(&ifd->q, hb);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+done_unlock:
+ io_ring_submit_unlock(ctx, issue_flags);
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ kfree(ifd);
+ return IOU_OK;
+}
+
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
+ int ret;
+
+ ret = futex_wake(iof->uaddr, futex2_to_flags(iof->futex_flags),
+ iof->futex_val, iof->futex_mask);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
new file mode 100644
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "cancel.h"
+
+int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags);
+int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
+
+#if defined(CONFIG_FUTEX)
+int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags);
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all);
+void io_futex_cache_init(struct io_ring_ctx *ctx);
+void io_futex_cache_free(struct io_ring_ctx *ctx);
+#else
+static inline int io_futex_cancel(struct io_ring_ctx *ctx,
+ struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ return 0;
+}
+static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ return false;
+}
+static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+{
+}
+static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
+{
+}
+#endif
@@ -93,6 +93,7 @@
#include "net.h"
#include "notif.h"
#include "waitid.h"
+#include "futex.h"
#include "timeout.h"
#include "poll.h"
@@ -330,6 +331,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct async_poll));
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
+ io_futex_cache_init(ctx);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
@@ -350,6 +352,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_HLIST_HEAD(&ctx->waitid_list);
+#ifdef CONFIG_FUTEX
+ INIT_HLIST_HEAD(&ctx->futex_list);
+#endif
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
@@ -2894,6 +2899,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+ io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
@@ -3306,6 +3312,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
ret |= io_waitid_remove_all(ctx, task, cancel_all);
+ ret |= io_futex_remove_all(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -34,6 +34,7 @@
#include "cancel.h"
#include "rw.h"
#include "waitid.h"
+#include "futex.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -444,6 +445,22 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_waitid_prep,
.issue = io_waitid,
},
+ [IORING_OP_FUTEX_WAIT] = {
+#if defined(CONFIG_FUTEX)
+ .prep = io_futex_prep,
+ .issue = io_futex_wait,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_FUTEX_WAKE] = {
+#if defined(CONFIG_FUTEX)
+ .prep = io_futex_prep,
+ .issue = io_futex_wake,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -670,6 +687,12 @@ const struct io_cold_def io_cold_defs[] = {
.name = "WAITID",
.async_size = sizeof(struct io_waitid_async),
},
+ [IORING_OP_FUTEX_WAIT] = {
+ .name = "FUTEX_WAIT",
+ },
+ [IORING_OP_FUTEX_WAKE] = {
+ .name = "FUTEX_WAKE",
+ },
};
const char *io_uring_get_opcode(u8 opcode)