[RFC,5/9] ntsync: Introduce NTSYNC_IOC_WAIT_ANY.
Commit Message
This corresponds to part of the functionality of the NT syscall
NtWaitForMultipleObjects(). Specifically, it implements the behaviour where
the third argument (wait_any) is TRUE, and it does not handle alertable waits.
Those features have been split out into separate patches to ease review.
Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
---
drivers/misc/ntsync.c | 229 ++++++++++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 13 ++
2 files changed, 242 insertions(+)
Comments
On Wed, Jan 24, 2024, at 01:40, Elizabeth Figura wrote:
> + if (args->timeout) {
> + struct timespec64 to;
> +
> + if (get_timespec64(&to, u64_to_user_ptr(args->timeout)))
> + return -EFAULT;
> + if (!timespec64_valid(&to))
> + return -EINVAL;
> +
> + timeout = timespec64_to_ns(&to);
> + }
Have you considered just passing the nanosecond value here?
Since you do not appear to write it back, that would avoid
the complexities of dealing with timespec layout differences
and indirection.
> + ids = kmalloc_array(count, sizeof(*ids), GFP_KERNEL);
> + if (!ids)
> + return -ENOMEM;
> + if (copy_from_user(ids, u64_to_user_ptr(args->objs),
> + array_size(count, sizeof(*ids)))) {
> + kfree(ids);
> + return -EFAULT;
> + }
This looks like memdup_user() would be slightly simpler.
Arnd
On Wednesday, 24 January 2024 01:56:52 CST Arnd Bergmann wrote:
> On Wed, Jan 24, 2024, at 01:40, Elizabeth Figura wrote:
>
> > + if (args->timeout) {
> > + struct timespec64 to;
> > +
> > + if (get_timespec64(&to, u64_to_user_ptr(args->timeout)))
> > + return -EFAULT;
> > + if (!timespec64_valid(&to))
> > + return -EINVAL;
> > +
> > + timeout = timespec64_to_ns(&to);
> > + }
>
> Have you considered just passing the nanosecond value here?
> Since you do not appear to write it back, that would avoid
> the complexities of dealing with timespec layout differences
> and indirection.
That'd be nicer in general. I think there was some documentation that advised
using timespec64 for new ioctl interfaces but it may have been outdated or
misread.
>
> > + ids = kmalloc_array(count, sizeof(*ids), GFP_KERNEL);
> > + if (!ids)
> > + return -ENOMEM;
> > + if (copy_from_user(ids, u64_to_user_ptr(args->objs),
> > + array_size(count, sizeof(*ids)))) {
> > + kfree(ids);
> > + return -EFAULT;
> > + }
>
> This looks like memdup_user() would be slightly simpler.
That's useful, thanks.
On Wed, Jan 24, 2024, at 19:02, Elizabeth Figura wrote:
> On Wednesday, 24 January 2024 01:56:52 CST Arnd Bergmann wrote:
>> On Wed, Jan 24, 2024, at 01:40, Elizabeth Figura wrote:
>>
>> > + if (args->timeout) {
>> > + struct timespec64 to;
>> > +
>> > + if (get_timespec64(&to, u64_to_user_ptr(args->timeout)))
>> > + return -EFAULT;
>> > + if (!timespec64_valid(&to))
>> > + return -EINVAL;
>> > +
>> > + timeout = timespec64_to_ns(&to);
>> > + }
>>
>> Have you considered just passing the nanosecond value here?
>> Since you do not appear to write it back, that would avoid
>> the complexities of dealing with timespec layout differences
>> and indirection.
>
> That'd be nicer in general. I think there was some documentation that advised
> using timespec64 for new ioctl interfaces but it may have been outdated or
> misread.
It's probably something I wrote. It depends a bit on
whether you have an absolute or relative timeout. If
the timeout is relative to the current time as I understand
it is here, a 64-bit number seems more logical to me.
For absolute times, I would usually use a __kernel_timespec,
especially if it's CLOCK_REALTIME. In this case you would
also need to specify the time domain.
Arnd
On Wednesday, 24 January 2024 13:52:52 CST Arnd Bergmann wrote:
> On Wed, Jan 24, 2024, at 19:02, Elizabeth Figura wrote:
> > On Wednesday, 24 January 2024 01:56:52 CST Arnd Bergmann wrote:
> >> On Wed, Jan 24, 2024, at 01:40, Elizabeth Figura wrote:
> >>
> >> > + if (args->timeout) {
> >> > + struct timespec64 to;
> >> > +
> >> > + if (get_timespec64(&to, u64_to_user_ptr(args->timeout)))
> >> > + return -EFAULT;
> >> > + if (!timespec64_valid(&to))
> >> > + return -EINVAL;
> >> > +
> >> > + timeout = timespec64_to_ns(&to);
> >> > + }
> >>
> >> Have you considered just passing the nanosecond value here?
> >> Since you do not appear to write it back, that would avoid
> >> the complexities of dealing with timespec layout differences
> >> and indirection.
> >
> > That'd be nicer in general. I think there was some documentation that advised
> > using timespec64 for new ioctl interfaces but it may have been outdated or
> > misread.
>
> It's probably something I wrote. It depends a bit on
> whether you have an absolute or relative timeout. If
> the timeout is relative to the current time as I understand
> it is here, a 64-bit number seems more logical to me.
>
> For absolute times, I would usually use a __kernel_timespec,
> especially if it's CLOCK_REALTIME. In this case you would
> also need to specify the time domain.
Currently the interface does pass it as an absolute time, with the
domain implicitly being MONOTONIC. This particular choice comes from
process/botching-up-ioctls.rst, which is admittedly focused around GPU
ioctls, but the rationale of having easily restartable ioctls applies
here too.
(E.g. Wine does play games with signals, so we do want to be able to
interrupt arbitrary waits with EINTR. The "usual" fast path for ntsync
waits won't hit that, but we want to have it work.)
On the other hand, if we can pass the timeout as relative, and write it
back on exit like ppoll() does [assuming that's not proscribed], that
would presumably be slightly better for performance.
When writing the patch I just picked the recommended option, and didn't
bother doing any micro-optimizations afterward.
What's the rationale for using timespec for absolute or written-back
timeouts, instead of dealing in ns directly? I'm afraid it's not
obvious to me.
--Zeb
On Wed, Jan 24, 2024, at 23:28, Elizabeth Figura wrote:
> On Wednesday, 24 January 2024 13:52:52 CST Arnd Bergmann wrote:
>> On Wed, Jan 24, 2024, at 19:02, Elizabeth Figura wrote:
>> > That'd be nicer in general. I think there was some documentation that advised
>> > using timespec64 for new ioctl interfaces but it may have been outdated or
>> > misread.
>>
>> It's probably something I wrote. It depends a bit on
>> whether you have an absolute or relative timeout. If
>> the timeout is relative to the current time as I understand
>> it is here, a 64-bit number seems more logical to me.
>>
>> For absolute times, I would usually use a __kernel_timespec,
>> especially if it's CLOCK_REALTIME. In this case you would
>> also need to specify the time domain.
>
> Currently the interface does pass it as an absolute time, with the
> domain implicitly being MONOTONIC. This particular choice comes from
> process/botching-up-ioctls.rst, which is admittedly focused around GPU
> ioctls, but the rationale of having easily restartable ioctls applies
> here too.
Ok, I was thinking of Documentation/driver-api/ioctl.rst, which
has similar recommendations.
> (E.g. Wine does play games with signals, so we do want to be able to
> interrupt arbitrary waits with EINTR. The "usual" fast path for ntsync
> waits won't hit that, but we want to have it work.)
>
> On the other hand, if we can pass the timeout as relative, and write it
> back on exit like ppoll() does [assuming that's not proscribed], that
> would presumably be slightly better for performance.
I've seen arguments go either way between absolute and relative
times, just pick whatever works best for you here.
> When writing the patch I just picked the recommended option, and didn't
> bother doing any micro-optimizations afterward.
>
> What's the rationale for using timespec for absolute or written-back
> timeouts, instead of dealing in ns directly? I'm afraid it's not
> obvious to me.
There is no hard rule either way, I mainly didn't like the
indirect pointer to the timespec that you have here. For
traditional unix-style interfaces, a timespec with CLOCK_REALTIME
times usually makes sense since that is what user space is
already using elsewhere, but you probably don't need to
worry about that. In theory, the single u64 CLOCK_REALTIME
nanoseconds have the problem of no longer working after year
2262, but with CLOCK_MONOTONIC that is not a concern anyway.
Between embedding a __u64 nanosecond value and embedding
a __kernel_timespec, I would pick whichever avoids converting
a __u64 back into a timespec, as that is an expensive
operation at least on 32-bit code.
Arnd
On Thursday, 25 January 2024 11:02:26 CST Arnd Bergmann wrote:
> On Wed, Jan 24, 2024, at 23:28, Elizabeth Figura wrote:
> > On Wednesday, 24 January 2024 13:52:52 CST Arnd Bergmann wrote:
> >> On Wed, Jan 24, 2024, at 19:02, Elizabeth Figura wrote:
> >> > That'd be nicer in general. I think there was some documentation that
> >> > advised using timespec64 for new ioctl interfaces but it may have been
> >> > outdated or misread.
> >>
> >> It's probably something I wrote. It depends a bit on
> >> whether you have an absolute or relative timeout. If
> >> the timeout is relative to the current time as I understand
> >> it is here, a 64-bit number seems more logical to me.
> >>
> >> For absolute times, I would usually use a __kernel_timespec,
> >> especially if it's CLOCK_REALTIME. In this case you would
> >> also need to specify the time domain.
> >
> > Currently the interface does pass it as an absolute time, with the
> > domain implicitly being MONOTONIC. This particular choice comes from
> > process/botching-up-ioctls.rst, which is admittedly focused around GPU
> > ioctls, but the rationale of having easily restartable ioctls applies
> > here too.
>
> Ok, I was thinking of Documentation/driver-api/ioctl.rst, which
> has similar recommendations.
>
> > (E.g. Wine does play games with signals, so we do want to be able to
> > interrupt arbitrary waits with EINTR. The "usual" fast path for ntsync
> > waits won't hit that, but we want to have it work.)
> >
> > On the other hand, if we can pass the timeout as relative, and write it
> > back on exit like ppoll() does [assuming that's not proscribed], that
> > would presumably be slightly better for performance.
>
> I've seen arguments go either way between absolute and relative
> times, just pick whatever works best for you here.
>
> > When writing the patch I just picked the recommended option, and didn't
> > bother doing any micro-optimizations afterward.
> >
> > What's the rationale for using timespec for absolute or written-back
> > timeouts, instead of dealing in ns directly? I'm afraid it's not
> > obvious to me.
>
> There is no hard rule either way, I mainly didn't like the
> indirect pointer to the timespec that you have here. For
> traditional unix-style interfaces, a timespec with CLOCK_REALTIME
> times usually makes sense since that is what user space is
> already using elsewhere, but you probably don't need to
> worry about that. In theory, the single u64 CLOCK_REALTIME
> nanoseconds have the problem of no longer working after year
> 2262, but with CLOCK_MONOTONIC that is not a concern anyway.
>
> Between embedding a __u64 nanosecond value and embedding
> a __kernel_timespec, I would pick whichever avoids converting
> a __u64 back into a timespec, as that is an expensive
> operation at least on 32-bit code.
Makes sense. I'll probably switch to using a relative and written-back u64
then, thanks!
@@ -23,6 +23,8 @@ struct ntsync_obj {
struct kref refcount;
spinlock_t lock;
+ struct list_head any_waiters;
+
enum ntsync_type type;
/* The following fields are protected by the object lock. */
@@ -34,6 +36,28 @@ struct ntsync_obj {
} u;
};
+struct ntsync_q_entry {
+ struct list_head node;
+ struct ntsync_q *q;
+ struct ntsync_obj *obj;
+ __u32 index;
+};
+
+struct ntsync_q {
+ struct task_struct *task;
+ __u32 owner;
+
+ /*
+ * Protected via atomic_cmpxchg(). Only the thread that wins the
+ * compare-and-swap may actually change object states and wake this
+ * task.
+ */
+ atomic_t signaled;
+
+ __u32 count;
+ struct ntsync_q_entry entries[];
+};
+
struct ntsync_device {
struct xarray objects;
};
@@ -109,6 +133,26 @@ static void init_obj(struct ntsync_obj *obj)
{
kref_init(&obj->refcount);
spin_lock_init(&obj->lock);
+ INIT_LIST_HEAD(&obj->any_waiters);
+}
+
+static void try_wake_any_sem(struct ntsync_obj *sem)
+{
+ struct ntsync_q_entry *entry;
+
+ lockdep_assert_held(&sem->lock);
+
+ list_for_each_entry(entry, &sem->any_waiters, node) {
+ struct ntsync_q *q = entry->q;
+
+ if (!sem->u.sem.count)
+ break;
+
+ if (atomic_cmpxchg(&q->signaled, -1, entry->index) == -1) {
+ sem->u.sem.count--;
+ wake_up_process(q->task);
+ }
+ }
}
static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
@@ -194,6 +238,8 @@ static int ntsync_put_sem(struct ntsync_device *dev, void __user *argp)
prev_count = sem->u.sem.count;
ret = put_sem_state(sem, args.count);
+ if (!ret)
+ try_wake_any_sem(sem);
spin_unlock(&sem->lock);
@@ -205,6 +251,187 @@ static int ntsync_put_sem(struct ntsync_device *dev, void __user *argp)
return ret;
}
+static int ntsync_schedule(const struct ntsync_q *q, ktime_t *timeout)
+{
+ int ret = 0;
+
+ do {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&q->signaled) != -1) {
+ ret = 0;
+ break;
+ }
+ ret = schedule_hrtimeout(timeout, HRTIMER_MODE_ABS);
+ } while (ret < 0);
+ __set_current_state(TASK_RUNNING);
+
+ return ret;
+}
+
+/*
+ * Allocate and initialize the ntsync_q structure, but do not queue us yet.
+ * Also, calculate the relative timeout.
+ */
+static int setup_wait(struct ntsync_device *dev,
+ const struct ntsync_wait_args *args,
+ ktime_t *ret_timeout, struct ntsync_q **ret_q)
+{
+ const __u32 count = args->count;
+ struct ntsync_q *q;
+ ktime_t timeout = 0;
+ __u32 *ids;
+ __u32 i, j;
+
+ if (!args->owner || args->pad)
+ return -EINVAL;
+
+ if (args->count > NTSYNC_MAX_WAIT_COUNT)
+ return -EINVAL;
+
+ if (args->timeout) {
+ struct timespec64 to;
+
+ if (get_timespec64(&to, u64_to_user_ptr(args->timeout)))
+ return -EFAULT;
+ if (!timespec64_valid(&to))
+ return -EINVAL;
+
+ timeout = timespec64_to_ns(&to);
+ }
+
+ ids = kmalloc_array(count, sizeof(*ids), GFP_KERNEL);
+ if (!ids)
+ return -ENOMEM;
+ if (copy_from_user(ids, u64_to_user_ptr(args->objs),
+ array_size(count, sizeof(*ids)))) {
+ kfree(ids);
+ return -EFAULT;
+ }
+
+ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL);
+ if (!q) {
+ kfree(ids);
+ return -ENOMEM;
+ }
+ q->task = current;
+ q->owner = args->owner;
+ atomic_set(&q->signaled, -1);
+ q->count = count;
+
+ for (i = 0; i < count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = get_obj(dev, ids[i]);
+
+ if (!obj)
+ goto err;
+
+ entry->obj = obj;
+ entry->q = q;
+ entry->index = i;
+ }
+
+ kfree(ids);
+
+ *ret_q = q;
+ *ret_timeout = timeout;
+ return 0;
+
+err:
+ for (j = 0; j < i; j++)
+ put_obj(q->entries[j].obj);
+ kfree(ids);
+ kfree(q);
+ return -EINVAL;
+}
+
+static void try_wake_any_obj(struct ntsync_obj *obj)
+{
+ switch (obj->type) {
+ case NTSYNC_TYPE_SEM:
+ try_wake_any_sem(obj);
+ break;
+ }
+}
+
+static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_wait_args args;
+ struct ntsync_q *q;
+ ktime_t timeout;
+ int signaled;
+ __u32 i;
+ int ret;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ ret = setup_wait(dev, &args, &timeout, &q);
+ if (ret < 0)
+ return ret;
+
+ /* queue ourselves */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_add_tail(&entry->node, &obj->any_waiters);
+ spin_unlock(&obj->lock);
+ }
+
+ /* check if we are already signaled */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_obj *obj = q->entries[i].obj;
+
+ if (atomic_read(&q->signaled) != -1)
+ break;
+
+ spin_lock(&obj->lock);
+ try_wake_any_obj(obj);
+ spin_unlock(&obj->lock);
+ }
+
+ /* sleep */
+
+ ret = ntsync_schedule(q, args.timeout ? &timeout : NULL);
+
+ /* and finally, unqueue */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_del(&entry->node);
+ spin_unlock(&obj->lock);
+
+ put_obj(obj);
+ }
+
+ signaled = atomic_read(&q->signaled);
+ if (signaled != -1) {
+ struct ntsync_wait_args __user *user_args = argp;
+
+ /* even if we caught a signal, we need to communicate success */
+ ret = 0;
+
+ if (put_user(signaled, &user_args->index))
+ ret = -EFAULT;
+ } else if (!ret) {
+ ret = -ETIMEDOUT;
+ }
+
+ kfree(q);
+ return ret;
+}
+
static long ntsync_char_ioctl(struct file *file, unsigned int cmd,
unsigned long parm)
{
@@ -218,6 +445,8 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd,
return ntsync_delete(dev, argp);
case NTSYNC_IOC_PUT_SEM:
return ntsync_put_sem(dev, argp);
+ case NTSYNC_IOC_WAIT_ANY:
+ return ntsync_wait_any(dev, argp);
default:
return -ENOIOCTLCMD;
}
@@ -16,6 +16,17 @@ struct ntsync_sem_args {
__u32 max;
};
+struct ntsync_wait_args {
+ __u64 timeout;
+ __u64 objs;
+ __u32 count;
+ __u32 owner;
+ __u32 index;
+ __u32 pad;
+};
+
+#define NTSYNC_MAX_WAIT_COUNT 64
+
#define NTSYNC_IOC_BASE 0xf7
#define NTSYNC_IOC_CREATE_SEM _IOWR(NTSYNC_IOC_BASE, 0, \
@@ -23,5 +34,7 @@ struct ntsync_sem_args {
#define NTSYNC_IOC_DELETE _IOW (NTSYNC_IOC_BASE, 1, __u32)
#define NTSYNC_IOC_PUT_SEM _IOWR(NTSYNC_IOC_BASE, 2, \
struct ntsync_sem_args)
+#define NTSYNC_IOC_WAIT_ANY _IOWR(NTSYNC_IOC_BASE, 3, \
+ struct ntsync_wait_args)
#endif