[RFC,2/2] tracing/user_events: Fixup enable faults asyncly

Message ID 20221027224011.2075-3-beaub@linux.microsoft.com
State New
Series tracing/user_events: Remote write ABI

Commit Message

Beau Belgrave Oct. 27, 2022, 10:40 p.m. UTC
When events are enabled within the various tracing facilities, such as
ftrace/perf, the event_mutex is held. As events are enabled, pages are
accessed. We do not want page faults to occur under this lock. Instead,
queue the fault to a workqueue so it can be handled in a process-context
safe way without the lock.

The enable address is disabled while the async fault-in occurs. This
ensures that we don't attempt fault-in more than is necessary. Once the
page has been faulted in, the address write is attempted again. If the
page couldn't be faulted in, then we wait until the next time the event
is enabled to prevent any potential infinite loops.

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
 kernel/trace/trace_events_user.c | 125 ++++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 4 deletions(-)
  

Comments

Mathieu Desnoyers Oct. 28, 2022, 10:07 p.m. UTC | #1
On 2022-10-27 18:40, Beau Belgrave wrote:
> When events are enabled within the various tracing facilities, such as
> ftrace/perf, the event_mutex is held. As events are enabled pages are
> accessed. We do not want page faults to occur under this lock. Instead
> queue the fault to a workqueue to be handled in a process context safe
> way without the lock.

Good stuff! On my end, I've progressed on the "side" userspace 
instrumentation library prototype. It implements the static 
instrumentation layer to which a simple userspace printf-based tracer 
connects (for testing purposes). All moving parts are wired up now, 
including the type system and RCU to protect callback iteration against 
concurrent userspace tracer registration/unregistration.

The top bit of the "enable" words is reserved for user events. So you 
can see the "TODO" where the user events ioctl/writev would be expected:

https://github.com/compudj/side
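
For illustration, a minimal sketch of the userspace fast path under
discussion might look as follows; the struct and function names here are
illustrative, not taken from the side library:

#include <stdint.h>

/* Top bit of the enable word reserved for user_events. */
#define USER_EVENT_BIT	(UINT32_C(1) << 31)

struct event_state {
	uint32_t enabled;	/* written remotely by the kernel enabler */
};

static inline int event_enabled(const struct event_state *state)
{
	/* Any set bit (kernel or userspace tracer) means "emit". */
	return __atomic_load_n(&state->enabled, __ATOMIC_RELAXED) != 0;
}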

I'm still slightly unsure about using "uint32_t" for the enable check, 
or going for "unsigned long". The core question there is whether a 
32-bit word test would cause partial register stalls on 64-bit 
architectures. Going for unsigned long would require that user events 
receive information about the bitness of the word as input from 
userspace (bit=63 rather than 31). Perhaps this is something the user 
events ABI should accommodate by reserving more than 5 bits to express 
the target bit?
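
One possible shape, purely as a sketch against the patch below (the bit
widths are an assumption, not a proposal from this series):

struct user_event_enabler {
	/* ... */
	unsigned long enable_addr;
	unsigned int enable_bit: 6,	/* 0-63 instead of 0-31 */
		     __reserved: 25,
		     disabled: 1;
};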

> 
> The enable address is disabled while the async fault-in occurs. This
> ensures that we don't attempt fault-in more than is necessary. Once the
> page has been faulted in, the address write is attempted again. If the
> page couldn't fault-in, then we wait until the next time the event is
> enabled to prevent any potential infinite loops.

So if the page is removed from the page cache between the point where it 
is faulted in and the moment the write is re-attempted, that will not 
trigger another attempt at paging in the page, am I correct?

I would think this is unexpected. I would expect that failing to fault 
in the page would stop any further attempts, but merely failing to pin 
the page after faulting it in should simply try again.

Thoughts?

Thanks,

Mathieu


> 
> Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
> ---
>   kernel/trace/trace_events_user.c | 125 ++++++++++++++++++++++++++++++-
>   1 file changed, 121 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 633f24c2a1ac..f1eb8101e053 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -81,11 +81,22 @@ struct user_event_enabler {
>   	struct list_head link;
>   	struct mm_struct *mm;
>   	struct file *file;
> +	refcount_t refcnt;
>   	unsigned long enable_addr;
>   	unsigned int enable_bit: 5,
> -		     __reserved: 27;
> +		     __reserved: 26,
> +		     disabled: 1;
>   };
>   
> +/* Used for asynchronous faulting in of pages */
> +struct user_event_enabler_fault {
> +	struct work_struct work;
> +	struct user_event_enabler *enabler;
> +	struct user_event *event;
> +};
> +
> +static struct kmem_cache *fault_cache;
> +
>   /*
>    * Stores per-event properties, as users register events
>    * within a file a user_event might be created if it does not
> @@ -236,6 +247,19 @@ static void user_event_enabler_destroy(struct user_event_enabler *enabler)
>   	kfree(enabler);
>   }
>   
> +static __always_inline struct user_event_enabler
> +*user_event_enabler_get(struct user_event_enabler *enabler)
> +{
> +	refcount_inc(&enabler->refcnt);
> +	return enabler;
> +}
> +
> +static void user_event_enabler_put(struct user_event_enabler *enabler)
> +{
> +	if (refcount_dec_and_test(&enabler->refcnt))
> +		user_event_enabler_destroy(enabler);
> +}
> +
>   static void user_event_enabler_remove(struct file *file,
>   				      struct user_event *user)
>   {
> @@ -249,13 +273,93 @@ static void user_event_enabler_remove(struct file *file,
>   		if (enabler->file != file)
>   			continue;
>   
> +		enabler->disabled = 0;
>   		list_del(&enabler->link);
> -		user_event_enabler_destroy(enabler);
> +		user_event_enabler_put(enabler);
>   	}
>   
>   	mutex_unlock(&event_mutex);
>   }
>   
> +static void user_event_enabler_write(struct user_event_enabler *enabler,
> +				     struct user_event *user);
> +
> +static void user_event_enabler_fault_fixup(struct work_struct *work)
> +{
> +	struct user_event_enabler_fault *fault = container_of(
> +		work, struct user_event_enabler_fault, work);
> +	struct user_event_enabler *enabler = fault->enabler;
> +	struct user_event *user = fault->event;
> +	struct mm_struct *mm = enabler->mm;
> +	unsigned long uaddr = enabler->enable_addr;
> +	bool unlocked = false;
> +	int ret;
> +
> +	might_sleep();
> +
> +	mmap_read_lock(mm);
> +
> +	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
> +			       &unlocked);
> +
> +	mmap_read_unlock(mm);
> +
> +	if (ret)
> +		pr_warn("user_events: Fixup fault failed with %d "
> +			"for mm: 0x%pK offset: 0x%llx event: %s\n", ret, mm,
> +			(unsigned long long)uaddr, EVENT_NAME(user));
> +
> +	/* Prevent state changes from racing */
> +	mutex_lock(&event_mutex);
> +
> +	/*
> +	 * If we managed to get the page, re-issue the write. We do not
> +	 * want to get into a possible infinite loop, which is why we only
> +	 * attempt again directly if the page came in. If we couldn't get
> +	 * the page here, then we will try again the next time the event is
> +	 * enabled/disabled.
> +	 */
> +	enabler->disabled = 0;
> +
> +	if (!ret)
> +		user_event_enabler_write(enabler, user);
> +
> +	mutex_unlock(&event_mutex);
> +
> +	refcount_dec(&user->refcnt);
> +	user_event_enabler_put(enabler);
> +	kmem_cache_free(fault_cache, fault);
> +}
> +
> +static bool user_event_enabler_queue_fault(struct user_event_enabler *enabler,
> +					   struct user_event *user)
> +{
> +	struct user_event_enabler_fault *fault;
> +
> +	fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
> +
> +	if (!fault)
> +		return false;
> +
> +	INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
> +	fault->enabler = user_event_enabler_get(enabler);
> +	fault->event = user;
> +
> +	refcount_inc(&user->refcnt);
> +	enabler->disabled = 1;
> +
> +	if (!schedule_work(&fault->work)) {
> +		enabler->disabled = 0;
> +		refcount_dec(&user->refcnt);
> +		user_event_enabler_put(enabler);
> +		kmem_cache_free(fault_cache, fault);
> +
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
>   static void user_event_enabler_write(struct user_event_enabler *enabler,
>   				     struct user_event *user)
>   {
> @@ -266,6 +370,11 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
>   	void *kaddr;
>   	int ret;
>   
> +	lockdep_assert_held(&event_mutex);
> +
> +	if (unlikely(enabler->disabled))
> +		return;
> +
>   	mmap_read_lock(mm);
>   
>   	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
> @@ -273,8 +382,10 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
>   
>   	mmap_read_unlock(mm);
>   
> -	if (ret <= 0) {
> -		pr_warn("user_events: Enable write failed\n");
> +	if (unlikely(ret <= 0)) {
> +		if (!user_event_enabler_queue_fault(enabler, user))
> +			pr_warn("user_events: Unable to queue fault handler\n");
> +
>   		return;
>   	}
>   
> @@ -321,6 +432,7 @@ static struct user_event_enabler
>   	enabler->file = file;
>   	enabler->enable_addr = (unsigned long)reg->enable_addr;
>   	enabler->enable_bit = reg->enable_bit;
> +	refcount_set(&enabler->refcnt, 1);
>   
>   	/* Prevents state changes from racing with new enablers */
>   	mutex_lock(&event_mutex);
> @@ -1902,6 +2014,11 @@ static int __init trace_events_user_init(void)
>   {
>   	int ret;
>   
> +	fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
> +
> +	if (!fault_cache)
> +		return -ENOMEM;
> +
>   	init_group = user_event_group_create(&init_user_ns);
>   
>   	if (!init_group)
  
Mathieu Desnoyers Oct. 28, 2022, 10:19 p.m. UTC | #2
On 2022-10-27 18:40, Beau Belgrave wrote:
> When events are enabled within the various tracing facilities, such as
> ftrace/perf, the event_mutex is held. As events are enabled pages are
> accessed. We do not want page faults to occur under this lock. Instead
> queue the fault to a workqueue to be handled in a process context safe
> way without the lock.
> 
> The enable address is disabled while the async fault-in occurs. This
> ensures that we don't attempt fault-in more than is necessary. Once the
> page has been faulted in, the address write is attempted again. If the
> page couldn't fault-in, then we wait until the next time the event is
> enabled to prevent any potential infinite loops.

I'm also unclear about how the system call initiating the enabled state 
change is delayed (or not) when a page fault is queued.

I would expect that when a page fault is needed, after enqueuing work to 
the worker thread, the system call initiating the state change would 
somehow wait for a completion (after releasing the user events mutex). 
That completion would be signaled by the worker thread either if the 
page fault fails, or if the state change is done.

Thoughts?

Thanks,

Mathieu

> 
> Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
> ---
>   kernel/trace/trace_events_user.c | 125 ++++++++++++++++++++++++++++++-
>   1 file changed, 121 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> index 633f24c2a1ac..f1eb8101e053 100644
> --- a/kernel/trace/trace_events_user.c
> +++ b/kernel/trace/trace_events_user.c
> @@ -81,11 +81,22 @@ struct user_event_enabler {
>   	struct list_head link;
>   	struct mm_struct *mm;
>   	struct file *file;
> +	refcount_t refcnt;
>   	unsigned long enable_addr;
>   	unsigned int enable_bit: 5,
> -		     __reserved: 27;
> +		     __reserved: 26,
> +		     disabled: 1;
>   };
>   
> +/* Used for asynchronous faulting in of pages */
> +struct user_event_enabler_fault {
> +	struct work_struct work;
> +	struct user_event_enabler *enabler;
> +	struct user_event *event;
> +};
> +
> +static struct kmem_cache *fault_cache;
> +
>   /*
>    * Stores per-event properties, as users register events
>    * within a file a user_event might be created if it does not
> @@ -236,6 +247,19 @@ static void user_event_enabler_destroy(struct user_event_enabler *enabler)
>   	kfree(enabler);
>   }
>   
> +static __always_inline struct user_event_enabler
> +*user_event_enabler_get(struct user_event_enabler *enabler)
> +{
> +	refcount_inc(&enabler->refcnt);
> +	return enabler;
> +}
> +
> +static void user_event_enabler_put(struct user_event_enabler *enabler)
> +{
> +	if (refcount_dec_and_test(&enabler->refcnt))
> +		user_event_enabler_destroy(enabler);
> +}
> +
>   static void user_event_enabler_remove(struct file *file,
>   				      struct user_event *user)
>   {
> @@ -249,13 +273,93 @@ static void user_event_enabler_remove(struct file *file,
>   		if (enabler->file != file)
>   			continue;
>   
> +		enabler->disabled = 0;
>   		list_del(&enabler->link);
> -		user_event_enabler_destroy(enabler);
> +		user_event_enabler_put(enabler);
>   	}
>   
>   	mutex_unlock(&event_mutex);
>   }
>   
> +static void user_event_enabler_write(struct user_event_enabler *enabler,
> +				     struct user_event *user);
> +
> +static void user_event_enabler_fault_fixup(struct work_struct *work)
> +{
> +	struct user_event_enabler_fault *fault = container_of(
> +		work, struct user_event_enabler_fault, work);
> +	struct user_event_enabler *enabler = fault->enabler;
> +	struct user_event *user = fault->event;
> +	struct mm_struct *mm = enabler->mm;
> +	unsigned long uaddr = enabler->enable_addr;
> +	bool unlocked = false;
> +	int ret;
> +
> +	might_sleep();
> +
> +	mmap_read_lock(mm);
> +
> +	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
> +			       &unlocked);
> +
> +	mmap_read_unlock(mm);
> +
> +	if (ret)
> +		pr_warn("user_events: Fixup fault failed with %d "
> +			"for mm: 0x%pK offset: 0x%llx event: %s\n", ret, mm,
> +			(unsigned long long)uaddr, EVENT_NAME(user));
> +
> +	/* Prevent state changes from racing */
> +	mutex_lock(&event_mutex);
> +
> +	/*
> +	 * If we managed to get the page, re-issue the write. We do not
> +	 * want to get into a possible infinite loop, which is why we only
> +	 * attempt again directly if the page came in. If we couldn't get
> +	 * the page here, then we will try again the next time the event is
> +	 * enabled/disabled.
> +	 */
> +	enabler->disabled = 0;
> +
> +	if (!ret)
> +		user_event_enabler_write(enabler, user);
> +
> +	mutex_unlock(&event_mutex);
> +
> +	refcount_dec(&user->refcnt);
> +	user_event_enabler_put(enabler);
> +	kmem_cache_free(fault_cache, fault);
> +}
> +
> +static bool user_event_enabler_queue_fault(struct user_event_enabler *enabler,
> +					   struct user_event *user)
> +{
> +	struct user_event_enabler_fault *fault;
> +
> +	fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
> +
> +	if (!fault)
> +		return false;
> +
> +	INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
> +	fault->enabler = user_event_enabler_get(enabler);
> +	fault->event = user;
> +
> +	refcount_inc(&user->refcnt);
> +	enabler->disabled = 1;
> +
> +	if (!schedule_work(&fault->work)) {
> +		enabler->disabled = 0;
> +		refcount_dec(&user->refcnt);
> +		user_event_enabler_put(enabler);
> +		kmem_cache_free(fault_cache, fault);
> +
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
>   static void user_event_enabler_write(struct user_event_enabler *enabler,
>   				     struct user_event *user)
>   {
> @@ -266,6 +370,11 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
>   	void *kaddr;
>   	int ret;
>   
> +	lockdep_assert_held(&event_mutex);
> +
> +	if (unlikely(enabler->disabled))
> +		return;
> +
>   	mmap_read_lock(mm);
>   
>   	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
> @@ -273,8 +382,10 @@ static void user_event_enabler_write(struct user_event_enabler *enabler,
>   
>   	mmap_read_unlock(mm);
>   
> -	if (ret <= 0) {
> -		pr_warn("user_events: Enable write failed\n");
> +	if (unlikely(ret <= 0)) {
> +		if (!user_event_enabler_queue_fault(enabler, user))
> +			pr_warn("user_events: Unable to queue fault handler\n");
> +
>   		return;
>   	}
>   
> @@ -321,6 +432,7 @@ static struct user_event_enabler
>   	enabler->file = file;
>   	enabler->enable_addr = (unsigned long)reg->enable_addr;
>   	enabler->enable_bit = reg->enable_bit;
> +	refcount_set(&enabler->refcnt, 1);
>   
>   	/* Prevents state changes from racing with new enablers */
>   	mutex_lock(&event_mutex);
> @@ -1902,6 +2014,11 @@ static int __init trace_events_user_init(void)
>   {
>   	int ret;
>   
> +	fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
> +
> +	if (!fault_cache)
> +		return -ENOMEM;
> +
>   	init_group = user_event_group_create(&init_user_ns);
>   
>   	if (!init_group)
  
Beau Belgrave Oct. 28, 2022, 10:35 p.m. UTC | #3
On Fri, Oct 28, 2022 at 06:07:34PM -0400, Mathieu Desnoyers wrote:
> On 2022-10-27 18:40, Beau Belgrave wrote:
> > When events are enabled within the various tracing facilities, such as
> > ftrace/perf, the event_mutex is held. As events are enabled pages are
> > accessed. We do not want page faults to occur under this lock. Instead
> > queue the fault to a workqueue to be handled in a process context safe
> > way without the lock.
> 
> Good stuff! On my end, I've progressed on the "side" userspace
> instrumentation library prototype. It implements the static instrumentation
> layer to which a simple userspace printf-based tracer connects (for testing
> purposes). All moving parts are wired up now, including the type system and
> RCU to protect callback iteration against concurrent userspace tracer
> registration/unregistration.
> 

Cool!

> The top bit of the "enable" words are reserved for user events. So you can
> see the "TODO" where the user events ioctl/writev would be expected:
> 
> https://github.com/compudj/side
> 

I'll check this out!

> I'm still slightly unsure about using "uint32_t" for enable check, or going
> for "unsigned long". The core question there is whether a 32-bit word test
> would cause partial register stalls on 64-bit architectures. Going for
> unsigned long would require that user events receives information about the
> bitness of the word as input from userspace. (bit=63 rather than 31).
> Perhaps this is something the user events ABI should accommodate by
> reserving more than 5 bits to express the target bit ?
> 

Yeah, I thought about this. From the user side you can actually use any
arbitrary bits you want by passing a 32-bit aligned value. So you could
take the lower or upper half of a 64-bit value. The reason I limit to
bits 0-31 is to ensure no page straddling occurs. Futex also does this;
check out get_futex_key() in kernel/futex/core.c.

I would recommend trying out a uint64, giving the upper half to the
user_events ABI, and checking if that works for you if you want, say, 63
bits in user space. You'd tell the ABI bit 31, but in user space it
would be bit 63.
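
A hedged illustration of that suggestion, assuming a little-endian
layout (names are made up for the example):

#include <stdint.h>

union enable_word {
	uint64_t v64;
	uint32_t v32[2];	/* on LE: [0] = bits 0-31, [1] = bits 32-63 */
};

static union enable_word word;

/*
 * Register &word.v32[1] as enable_addr (32-bit aligned, so it cannot
 * straddle a page) with enable_bit = 31; from the 64-bit view in user
 * space, that is bit 63 of word.v64.
 */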

> > 
> > The enable address is disabled while the async fault-in occurs. This
> > ensures that we don't attempt fault-in more than is necessary. Once the
> > page has been faulted in, the address write is attempted again. If the
> > page couldn't fault-in, then we wait until the next time the event is
> > enabled to prevent any potential infinite loops.
> 
> So if the page is removed from the page cache between the point where it is
> faulted in and the moment the write is re-attempted, that will not trigger
> another attempt at paging in the page, am I correct ?
> 

If we fault and fixup_user_fault() still fails even with retries, then
we give up until the next enablement of that site.

However, if we fault and fixup_user_fault() with retries works, and then
we fault again trying to write, that will retry.

> I would think this is unexpected. I would expect that failing to fault in
> the page would stop any further attempts, but simply failing to pin the page
> after faulting it in should simply try again.
> 

The issue I repro is: mmap(), register the enable word at that location,
then munmap(). All the faulting errors just tell you -EFAULT in this
case, even though the mapping was maliciously removed. In this case you
could get the kernel to loop infinitely trying to page it in.
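
A sketch of that repro, with the registration step elided since the
exact reg ioctl layout belongs to this series:

#include <sys/mman.h>

void repro(void)
{
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	/* ... reg ioctl with enable_addr pointing into this page ... */

	munmap(page, 4096);

	/*
	 * Every enable attempt now faults with -EFAULT; without the
	 * give-up logic the kernel could loop forever faulting it in.
	 */
}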

> Thoughts ?
> 

We need to have some sanity toward giving up on faulting in data that
will never make it. The code currently draws that line at: if
fixup_user_fault() with retries fails, we give up. pin_user_pages_remote()
failing will not stop the write from being attempted again.

> Thanks,
> 
> Mathieu
> 

Thanks,
-Beau
  
Beau Belgrave Oct. 28, 2022, 10:42 p.m. UTC | #4
On Fri, Oct 28, 2022 at 06:19:10PM -0400, Mathieu Desnoyers wrote:
> On 2022-10-27 18:40, Beau Belgrave wrote:
> > When events are enabled within the various tracing facilities, such as
> > ftrace/perf, the event_mutex is held. As events are enabled pages are
> > accessed. We do not want page faults to occur under this lock. Instead
> > queue the fault to a workqueue to be handled in a process context safe
> > way without the lock.
> > 
> > The enable address is disabled while the async fault-in occurs. This
> > ensures that we don't attempt fault-in more than is necessary. Once the
> > page has been faulted in, the address write is attempted again. If the
> > page couldn't fault-in, then we wait until the next time the event is
> > enabled to prevent any potential infinite loops.
> 
> I'm also unclear about how the system call initiating the enabled state
> change is delayed (or not) when a page fault is queued.
> 

It's not; if needed we could call schedule_delayed_work(). However, I
don't think we need it. When pin_user_pages_remote() is invoked, it's
with FOLL_NOFAULT. This will tell us if we need to fault pages in; we
then call fixup_user_fault() with the unlocked value passed. This will
cause the fixup to retry and get the page in.

It's called out in the comments for this exact purpose (lucked out
here):
mm/gup.c
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.

The fault-in happens in a workqueue; this is the same way KVM does its
async page fault logic, so it's not a new pattern. As soon as the
fault-in is done, we have a page we should be able to use, so we
re-attempt the write immediately. If the write fails, another queue
happens and we could loop, but not like the unmap() case I replied with
earlier.

> I would expect that when a page fault is needed, after enqueuing work to the
> worker thread, the system call initiating the state change would somehow
> wait for a completion (after releasing the user events mutex). That
> completion would be signaled by the worker thread either if the page fault
> fails, or if the state change is done.
> 

I didn't see a generic fault-in + notify anywhere; do you know of one I
could use? Otherwise, it seems the pattern used is: check fault, fault
in via workqueue, re-attempt.

> Thoughts ?
> 
> Thanks,
> 
> Mathieu
> 

Thanks,
-Beau
  
Mathieu Desnoyers Oct. 29, 2022, 2:23 p.m. UTC | #5
On 2022-10-28 18:35, Beau Belgrave wrote:
> On Fri, Oct 28, 2022 at 06:07:34PM -0400, Mathieu Desnoyers wrote:
>> On 2022-10-27 18:40, Beau Belgrave wrote:

[...]

> 
>> I'm still slightly unsure about using "uint32_t" for enable check, or going
>> for "unsigned long". The core question there is whether a 32-bit word test
>> would cause partial register stalls on 64-bit architectures. Going for
>> unsigned long would require that user events receives information about the
>> bitness of the word as input from userspace. (bit=63 rather than 31).
>> Perhaps this is something the user events ABI should accommodate by
>> reserving more than 5 bits to express the target bit ?
>>
> 
> Yeah, I thought about this. From the user side you can actually use any
> arbitrary bits you want by passing a 32-bit aligned value. So you could
> take the lower or top half of a 64-bit value. The reason I limit to 0-31
> bits is to ensure no page straddling occurs. Futex also does this, check
> out get_futex_key() in kernel/futex/core.c.

We can ensure no page straddling by checking the address alignment 
against the size of the integer (4 or 8 bytes), also received as input.

> 
> I would recommend trying out uint64 but give the upper half to
> user_events ABI and checking if that works for you if you want say 63
> bits to user space. You'd tell the ABI bit 31, but in user space it
> would be bit 63.

That's tricky because Linux supports both big and little endian, which 
will make the ABI hard to use if we try to be too clever about this.

Also, just stating a "pointer" and "bit offset" is not enough if we 
want to support 32-bit and 64-bit integer types, because the bit offset 
does not consider endianness.

I would recommend to get the following information through the user 
events registration:

- pointer address,
- size of integer type (4 or 8 bytes),
- bit offset (currently 31 or 63).

This way we have all the information we need to set the right bit in 
both little and big endian. We also have all the information we need to 
validate natural alignment, e.g.:

/* Check alignment */
if (addr & (type_size - 1))
	return -EINVAL;
/* Check bit offset range */
if (bit_offset > (type_size * CHAR_BIT) - 1)
	return -EINVAL;
switch (type_size) {
case 4:
	/* Update 32-bit integer */
	break;
#if BITS_PER_LONG >= 64
case 8:
	/* Update 64-bit integer */
	break;
#endif
default:
	return -EINVAL;
}


> 
>>>
>>> The enable address is disabled while the async fault-in occurs. This
>>> ensures that we don't attempt fault-in more than is necessary. Once the
>>> page has been faulted in, the address write is attempted again. If the
>>> page couldn't fault-in, then we wait until the next time the event is
>>> enabled to prevent any potential infinite loops.
>>
>> So if the page is removed from the page cache between the point where it is
>> faulted in and the moment the write is re-attempted, that will not trigger
>> another attempt at paging in the page, am I correct ?
>>
> 
> If we fault and the fixup user fault fails still with retries, then we
> give up until the next enablement of that site.
> 
> However, if we fault and the fixup user fault with retries works, and
> then we fault again trying to write, that will retry.
> 
>> I would think this is unexpected. I would expect that failing to fault in
>> the page would stop any further attempts, but simply failing to pin the page
>> after faulting it in should simply try again.
>>
> 
> The issue I repro is mmap() register the enable at that location, then
> unmap(). All the faulting errors just tell you -EFAULT in this case,
> even though it was maliciously removed. In this case you could get the
> kernel to infinite loop trying to page it in.
> 
>> Thoughts ?
>>
> 
> We need to have some sanity toward giving up faulting in data that never
> will make it. The code currently assigns that line to if
> fixup_user_fault with retries fails, we give up. pin_user_pages_remote
> will not stop it from being attempted again.

What are the legitimate cases that can make fixup_user_fault() fail? If 
there are none, and the only case that can make this fail is userspace 
unmapping memory, then we should very well kill the offending process.

An example here is futex fault_in_user_writeable(). When it fails, it 
appears that the futex_wake_op simply returns -EFAULT to the caller.

Thanks,

Mathieu

> 
>> Thanks,
>>
>> Mathieu
>>
> 
> Thanks,
> -Beau
  
Mathieu Desnoyers Oct. 29, 2022, 2:40 p.m. UTC | #6
On 2022-10-28 18:42, Beau Belgrave wrote:
> On Fri, Oct 28, 2022 at 06:19:10PM -0400, Mathieu Desnoyers wrote:
>> On 2022-10-27 18:40, Beau Belgrave wrote:
>>> When events are enabled within the various tracing facilities, such as
>>> ftrace/perf, the event_mutex is held. As events are enabled pages are
>>> accessed. We do not want page faults to occur under this lock. Instead
>>> queue the fault to a workqueue to be handled in a process context safe
>>> way without the lock.
>>>
>>> The enable address is disabled while the async fault-in occurs. This
>>> ensures that we don't attempt fault-in more than is necessary. Once the
>>> page has been faulted in, the address write is attempted again. If the
>>> page couldn't fault-in, then we wait until the next time the event is
>>> enabled to prevent any potential infinite loops.
>>
>> I'm also unclear about how the system call initiating the enabled state
>> change is delayed (or not) when a page fault is queued.
>>
> 
> It's not, if needed we could call schedule_delayed_work(). However, I
> don't think we need it. When pin_user_pages_remote is invoked, it's with
> FOLL_NOFAULT. This will tell us if we need to fault pages in, we then
> call fixup_user_fault with unlocked value passed. This will cause the
> fixup to retry and get the page in.
> 
> It's called out in the comments for this exact purpose (lucked out
> here):
> mm/gup.c
>   * This is meant to be called in the specific scenario where for locking reasons
>   * we try to access user memory in atomic context (within a pagefault_disable()
>   * section), this returns -EFAULT, and we want to resolve the user fault before
>   * trying again.
> 
> The fault in happens in a workqueue, this is the same way KVM does it's
> async page fault logic, so it's not a new pattern. As soon as the
> fault-in is done, we have a page we should be able to use, so we
> re-attempt the write immediately. If the write fails, another queue
> happens and we could loop, but not like the unmap() case I replied with
> earlier.
> 
>> I would expect that when a page fault is needed, after enqueuing work to the
>> worker thread, the system call initiating the state change would somehow
>> wait for a completion (after releasing the user events mutex). That
>> completion would be signaled by the worker thread either if the page fault
>> fails, or if the state change is done.
>>
> 
> I didn't see a generic fault-in + notify anywhere, do you know of one I
> could use? Otherwise, it seems the pattern used is check fault, fault-in
> via workqueue, re-attempt.

I don't think we're talking about the same thing. I'll try stating my 
concern in a different way.

user_event_enabler_write() calls user_event_enabler_queue_fault() when 
it cannot perform the enabled state update immediately (because a page 
fault is needed).

But then AFAIU it returns immediately to the caller. The caller could 
very well expect that the event has been enabled, as requested, 
immediately when the enabler write returns. The fact that enabling the 
event can be delayed for an arbitrary amount of time due to page faults 
means that we have no hard guarantee that the event is enabled as 
requested upon return to the caller.

I'd like to add a completion there, to be waited for after 
user_event_enabler_queue_fault(), but before returning from 
user_event_enabler_create(). Waiting for the completion should be done 
without any mutexes held, so after releasing event_mutex.

The completion would be placed on the stack of 
user_event_enabler_create(), and a reference to the completion would be 
placed in the queued fault request. The completion notification would be 
emitted by the worker thread either when enabling is done, or if a page 
fault fails.

See include/linux/completion.h
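
A sketch of that flow, extending the fault struct from the patch; the
done/ret fields and the waiting code are assumptions, not part of the
posted patch:

#include <linux/completion.h>
#include <linux/workqueue.h>

struct user_event_enabler_fault {
	struct work_struct work;
	struct user_event_enabler *enabler;
	struct user_event *event;
	struct completion *done;	/* on the registering caller's stack */
	int ret;			/* fault-in result reported back */
};

static void user_event_enabler_fault_fixup(struct work_struct *work)
{
	struct user_event_enabler_fault *fault =
		container_of(work, struct user_event_enabler_fault, work);

	/* ... fixup_user_fault() + re-attempted write, as in the patch ... */

	if (fault->done)
		complete(fault->done);	/* wake the waiting syscall */
}

/*
 * In user_event_enabler_create(), after dropping event_mutex:
 *
 *	DECLARE_COMPLETION_ONSTACK(done);
 *	... queue the fault with &done ...
 *	wait_for_completion(&done);
 */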

Thoughts?

Thanks,

Mathieu


> 
>> Thoughts ?
>>
>> Thanks,
>>
>> Mathieu
>>
> 
> Thanks,
> -Beau
  
Mathieu Desnoyers Oct. 30, 2022, 11:45 a.m. UTC | #7
On 2022-10-29 10:40, Mathieu Desnoyers wrote:
> On 2022-10-28 18:42, Beau Belgrave wrote:
>> On Fri, Oct 28, 2022 at 06:19:10PM -0400, Mathieu Desnoyers wrote:
>>> On 2022-10-27 18:40, Beau Belgrave wrote:
>>>> When events are enabled within the various tracing facilities, such as
>>>> ftrace/perf, the event_mutex is held. As events are enabled pages are
>>>> accessed. We do not want page faults to occur under this lock. Instead
>>>> queue the fault to a workqueue to be handled in a process context safe
>>>> way without the lock.
>>>>
>>>> The enable address is disabled while the async fault-in occurs. This
>>>> ensures that we don't attempt fault-in more than is necessary. Once the
>>>> page has been faulted in, the address write is attempted again. If the
>>>> page couldn't fault-in, then we wait until the next time the event is
>>>> enabled to prevent any potential infinite loops.
>>>
>>> I'm also unclear about how the system call initiating the enabled state
>>> change is delayed (or not) when a page fault is queued.
>>>
>>
>> It's not, if needed we could call schedule_delayed_work(). However, I
>> don't think we need it. When pin_user_pages_remote is invoked, it's with
>> FOLL_NOFAULT. This will tell us if we need to fault pages in, we then
>> call fixup_user_fault with unlocked value passed. This will cause the
>> fixup to retry and get the page in.
>>
>> It's called out in the comments for this exact purpose (lucked out
>> here):
>> mm/gup.c
>>   * This is meant to be called in the specific scenario where for 
>> locking reasons
>>   * we try to access user memory in atomic context (within a 
>> pagefault_disable()
>>   * section), this returns -EFAULT, and we want to resolve the user 
>> fault before
>>   * trying again.
>>
>> The fault in happens in a workqueue, this is the same way KVM does it's
>> async page fault logic, so it's not a new pattern. As soon as the
>> fault-in is done, we have a page we should be able to use, so we
>> re-attempt the write immediately. If the write fails, another queue
>> happens and we could loop, but not like the unmap() case I replied with
>> earlier.
>>
>>> I would expect that when a page fault is needed, after enqueuing work 
>>> to the
>>> worker thread, the system call initiating the state change would somehow
>>> wait for a completion (after releasing the user events mutex). That
>>> completion would be signaled by the worker thread either if the page 
>>> fault
>>> fails, or if the state change is done.
>>>
>>
>> I didn't see a generic fault-in + notify anywhere, do you know of one I
>> could use? Otherwise, it seems the pattern used is check fault, fault-in
>> via workqueue, re-attempt.
> 
> I don't think we're talking about the same thing. I'll try stating my 
> concern in a different way.
> 
> user_event_enabler_write() calls user_event_enabler_queue_fault() when 
> it cannot perform the enabled state update immediately (because a page 
> fault is needed).
> 
> But then AFAIU it returns immediately to the caller. The caller could 
> very well expect that the event has been enabled, as requested, 
> immediately when the enabler write returns. The fact that enabling the 
> event can be delayed for an arbitrary amount of time due to page faults 
> means that we have no hard guarantee that the event is enabled as 
> requested upon return to the caller.
> 
> I'd like to add a completion there, to be waited for after 
> user_event_enabler_queue_fault(), but before returning from 
> user_event_enabler_create(). Waiting for the completion should be done 
> without any mutexes held, so after releasing event_mutex.
> 
> The completion would be placed on the stack of 
> user_event_enabler_create(), and a reference to the completion would be 
> placed in the queued fault request. The completion notification would be 
> emitted by the worker thread either when enabling is done, or if a page 
> fault fails.
> 
> See include/linux/completion.h
> 
> Thoughts ?

Actually, after further thinking, I wonder if we need a worker thread at 
all to perform the page faults.

Could we simply handle the page faults from user_event_enabler_create() 
by releasing the mutex and re-trying? From what contexts is 
user_event_enabler_create() called? (Any locks taken? System call 
context? IRQs and preemption enabled or disabled?)
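
A rough sketch of that idea, loosely modeled on how futex callers use
fault_in_user_writeable(); try_enable_write() is a hypothetical helper
that returns -EFAULT when the page is not present:

static int user_event_enabler_write_sync(struct user_event_enabler *enabler,
					 struct user_event *user)
{
	bool unlocked = false;
	int ret, retries = 2;

	do {
		mutex_lock(&event_mutex);
		ret = try_enable_write(enabler, user);	/* hypothetical */
		mutex_unlock(&event_mutex);

		if (ret != -EFAULT)
			break;

		/* Fault the page in without event_mutex held. */
		mmap_read_lock(enabler->mm);
		ret = fixup_user_fault(enabler->mm, enabler->enable_addr,
				       FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
				       &unlocked);
		mmap_read_unlock(enabler->mm);

		if (ret)
			break;		/* fault-in failed: give up */

		ret = -EFAULT;		/* write still pending, bounded retry */
	} while (retries--);

	return ret;
}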

Thanks,

Mathieu

> 
> Thanks,
> 
> Mathieu
> 
> 
>>
>>> Thoughts ?
>>>
>>> Thanks,
>>>
>>> Mathieu
>>>
>>
>> Thanks,
>> -Beau
>
  
Beau Belgrave Oct. 31, 2022, 4:58 p.m. UTC | #8
On Sat, Oct 29, 2022 at 10:23:02AM -0400, Mathieu Desnoyers wrote:
> On 2022-10-28 18:35, Beau Belgrave wrote:
> > On Fri, Oct 28, 2022 at 06:07:34PM -0400, Mathieu Desnoyers wrote:
> > > On 2022-10-27 18:40, Beau Belgrave wrote:
> 
> [...]
> 
> > 
> > > I'm still slightly unsure about using "uint32_t" for enable check, or going
> > > for "unsigned long". The core question there is whether a 32-bit word test
> > > would cause partial register stalls on 64-bit architectures. Going for
> > > unsigned long would require that user events receives information about the
> > > bitness of the word as input from userspace. (bit=63 rather than 31).
> > > Perhaps this is something the user events ABI should accommodate by
> > > reserving more than 5 bits to express the target bit ?
> > > 
> > 
> > Yeah, I thought about this. From the user side you can actually use any
> > arbitrary bits you want by passing a 32-bit aligned value. So you could
> > take the lower or top half of a 64-bit value. The reason I limit to 0-31
> > bits is to ensure no page straddling occurs. Futex also does this, check
> > out get_futex_key() in kernel/futex/core.c.
> 
> We can ensure no page straddling by checking the address alignment compared
> to the size of the integer (4 or 8 bytes) also received as input.
> 

Ok.

> > 
> > I would recommend trying out uint64 but give the upper half to
> > user_events ABI and checking if that works for you if you want say 63
> > bits to user space. You'd tell the ABI bit 31, but in user space it
> > would be bit 63.
> 
> That's tricky because Linux supports both big and little endian, which will
> make the ABI tricky to use if we try to be too clever about this.
> 
> Also, just stating a "pointer" and "bits offset" is not enough if we want to
> support 32-bit and 64-bit integer types, because the bit offset does not
> consider endianness.
> 
> I would recommend to get the following information through the user events
> registration:
> 
> - pointer address,
> - size of integer type (4 or 8 bytes),
> - bit offset (currently 31 or 63).
> 
> This way we have all the information we need to set the right bit in both
> little and big endian. We also have all the information we need to validate
> natural alignment, e.g.:
> 
> /* Check alignment */
> if (addr & (type_size - 1))
> 	return -EINVAL;
> /* Check bit offset range */
> if (bit_offset > (type_size * CHAR_BIT) - 1)
> 	return -EINVAL;
> switch (type_size) {
> case 4:
> 	/* Update 32-bit integer */
> 	break;
> #if BITS_PER_LONG >= 64
> case 8:
> 	/* Update 64-bit integer */
> 	break;
> #endif
> default:
> 	return -EINVAL;
> }
> 

Sounds good, I'll update this.

> 
> > 
> > > > 
> > > > The enable address is disabled while the async fault-in occurs. This
> > > > ensures that we don't attempt fault-in more than is necessary. Once the
> > > > page has been faulted in, the address write is attempted again. If the
> > > > page couldn't fault-in, then we wait until the next time the event is
> > > > enabled to prevent any potential infinite loops.
> > > 
> > > So if the page is removed from the page cache between the point where it is
> > > faulted in and the moment the write is re-attempted, that will not trigger
> > > another attempt at paging in the page, am I correct ?
> > > 
> > 
> > If we fault and the fixup user fault fails still with retries, then we
> > give up until the next enablement of that site.
> > 
> > However, if we fault and the fixup user fault with retries works, and
> > then we fault again trying to write, that will retry.
> > 
> > > I would think this is unexpected. I would expect that failing to fault in
> > > the page would stop any further attempts, but simply failing to pin the page
> > > after faulting it in should simply try again.
> > > 
> > 
> > The issue I repro is mmap() register the enable at that location, then
> > unmap(). All the faulting errors just tell you -EFAULT in this case,
> > even though it was maliciously removed. In this case you could get the
> > kernel to infinite loop trying to page it in.
> > 
> > > Thoughts ?
> > > 
> > 
> > We need to have some sanity toward giving up faulting in data that never
> > will make it. The code currently assigns that line to if
> > fixup_user_fault with retries fails, we give up. pin_user_pages_remote
> > will not stop it from being attempted again.
> 
> What are the legitimate cases that can make fixup_user_fault fail ? If there
> are none, and the only case that can make this fail is userspace unmapping
> memory, then we should very well kill the offending process.
> 

I believe only VM_FAULT_ERROR:
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |	\
			VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |	\
			VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

I don't believe we should retry indefinitely for any of the above
cases.

> An example here is futex fault_in_user_writeable(). When it fails, it
> appears that the futex_wake_op simply returns -EFAULT to the caller.
> 

fault_in_user_writeable() calls fixup_user_fault(), so it should be the
same scenario / failures as above.

> Thanks,
> 
> Mathieu
> 
> > 
> > > Thanks,
> > > 
> > > Mathieu
> > > 
> > 
> > Thanks,
> > -Beau
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com

Thanks,
-Beau
  
Beau Belgrave Oct. 31, 2022, 5:12 p.m. UTC | #9
On Sat, Oct 29, 2022 at 10:40:02AM -0400, Mathieu Desnoyers wrote:
> On 2022-10-28 18:42, Beau Belgrave wrote:
> > On Fri, Oct 28, 2022 at 06:19:10PM -0400, Mathieu Desnoyers wrote:
> > > On 2022-10-27 18:40, Beau Belgrave wrote:
> > > > When events are enabled within the various tracing facilities, such as
> > > > ftrace/perf, the event_mutex is held. As events are enabled pages are
> > > > accessed. We do not want page faults to occur under this lock. Instead
> > > > queue the fault to a workqueue to be handled in a process context safe
> > > > way without the lock.
> > > > 
> > > > The enable address is disabled while the async fault-in occurs. This
> > > > ensures that we don't attempt fault-in more than is necessary. Once the
> > > > page has been faulted in, the address write is attempted again. If the
> > > > page couldn't fault-in, then we wait until the next time the event is
> > > > enabled to prevent any potential infinite loops.
> > > 
> > > I'm also unclear about how the system call initiating the enabled state
> > > change is delayed (or not) when a page fault is queued.
> > > 
> > 
> > It's not, if needed we could call schedule_delayed_work(). However, I
> > don't think we need it. When pin_user_pages_remote is invoked, it's with
> > FOLL_NOFAULT. This will tell us if we need to fault pages in, we then
> > call fixup_user_fault with unlocked value passed. This will cause the
> > fixup to retry and get the page in.
> > 
> > It's called out in the comments for this exact purpose (lucked out
> > here):
> > mm/gup.c
> >   * This is meant to be called in the specific scenario where for locking reasons
> >   * we try to access user memory in atomic context (within a pagefault_disable()
> >   * section), this returns -EFAULT, and we want to resolve the user fault before
> >   * trying again.
> > 
> > The fault in happens in a workqueue, this is the same way KVM does it's
> > async page fault logic, so it's not a new pattern. As soon as the
> > fault-in is done, we have a page we should be able to use, so we
> > re-attempt the write immediately. If the write fails, another queue
> > happens and we could loop, but not like the unmap() case I replied with
> > earlier.
> > 
> > > I would expect that when a page fault is needed, after enqueuing work to the
> > > worker thread, the system call initiating the state change would somehow
> > > wait for a completion (after releasing the user events mutex). That
> > > completion would be signaled by the worker thread either if the page fault
> > > fails, or if the state change is done.
> > > 
> > 
> > I didn't see a generic fault-in + notify anywhere, do you know of one I
> > could use? Otherwise, it seems the pattern used is check fault, fault-in
> > via workqueue, re-attempt.
> 
> I don't think we're talking about the same thing. I'll try stating my
> concern in a different way.
> 
> user_event_enabler_write() calls user_event_enabler_queue_fault() when it
> cannot perform the enabled state update immediately (because a page fault is
> needed).
> 
> But then AFAIU it returns immediately to the caller. The caller could very
> well expect that the event has been enabled, as requested, immediately when
> the enabler write returns. The fact that enabling the event can be delayed
> for an arbitrary amount of time due to page faults means that we have no
> hard guarantee that the event is enabled as requested upon return to the
> caller.
> 

Yeah, sorry, I misread your statement.

If the user registers an event and user_event_enabler_write() is invoked
at that point, the enabler is registered and will update the event as it
changes, even though the initial write fails (it will try again
repeatedly until a terminal faulting state is reached).

I agree though: if the initial data is faulted out at that point, and it
has an error faulting in, the user doesn't know that (although it
appears the only time this would fail is if someone did something silly
or malicious, or the device is out of memory). They likely want to know
if it's the silly/forgetful case.

> I'd like to add a completion there, to be waited for after
> user_event_enabler_queue_fault(), but before returning from
> user_event_enabler_create(). Waiting for the completion should be done
> without any mutexes held, so after releasing event_mutex.
> 
> The completion would be placed on the stack of user_event_enabler_create(),
> and a reference to the completion would be placed in the queued fault
> request. The completion notification would be emitted by the worker thread
> either when enabling is done, or if a page fault fails.
> 
> See include/linux/completion.h
> 
> Thoughts ?
> 

If the case you are worried about is just the initial register, then I
can do this synchronously outside of the lock. I believe you had the
same thought later in the day.

The main scenario I am worried about is that we have, say, 20 processes
that share the same event. They have already been registered and they
are running. Then a few of them page fault when perf/ftrace enables the
event and the register callback is invoked (which triggers a user write
to the enablement address/bit). If most of these processes don't page
fault, I don't want them to wait on account of 1 or 2 bad apples. If
they all page fault, I don't want them to hold up the other parts of the
system that require event_mutex (since it is held by the register
callback caller). So these should be as fast as possible while the
event_mutex is being held.

> Thanks,
> 
> Mathieu
> 
> 
> > 
> > > Thoughts ?
> > > 
> > > Thanks,
> > > 
> > > Mathieu
> > > 
> > 
> > Thanks,
> > -Beau
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com

Thanks,
-Beau
  
Beau Belgrave Oct. 31, 2022, 5:18 p.m. UTC | #10
On Sun, Oct 30, 2022 at 07:45:33AM -0400, Mathieu Desnoyers wrote:
> On 2022-10-29 10:40, Mathieu Desnoyers wrote:
> > On 2022-10-28 18:42, Beau Belgrave wrote:
> > > On Fri, Oct 28, 2022 at 06:19:10PM -0400, Mathieu Desnoyers wrote:
> > > > On 2022-10-27 18:40, Beau Belgrave wrote:
> > > > > When events are enabled within the various tracing facilities, such as
> > > > > ftrace/perf, the event_mutex is held. As events are enabled pages are
> > > > > accessed. We do not want page faults to occur under this lock. Instead
> > > > > queue the fault to a workqueue to be handled in a process context safe
> > > > > way without the lock.
> > > > > 
> > > > > The enable address is disabled while the async fault-in occurs. This
> > > > > ensures that we don't attempt fault-in more than is necessary. Once the
> > > > > page has been faulted in, the address write is attempted again. If the
> > > > > page couldn't fault-in, then we wait until the next time the event is
> > > > > enabled to prevent any potential infinite loops.
> > > > 
> > > > I'm also unclear about how the system call initiating the enabled state
> > > > change is delayed (or not) when a page fault is queued.
> > > > 
> > > 
> > > It's not, if needed we could call schedule_delayed_work(). However, I
> > > don't think we need it. When pin_user_pages_remote is invoked, it's with
> > > FOLL_NOFAULT. This will tell us if we need to fault pages in, we then
> > > call fixup_user_fault with unlocked value passed. This will cause the
> > > fixup to retry and get the page in.
> > > 
> > > It's called out in the comments for this exact purpose (lucked out
> > > here):
> > > mm/gup.c
> > >   * This is meant to be called in the specific scenario where for
> > > locking reasons
> > >   * we try to access user memory in atomic context (within a
> > > pagefault_disable()
> > >   * section), this returns -EFAULT, and we want to resolve the user
> > > fault before
> > >   * trying again.
> > > 
> > > The fault in happens in a workqueue, this is the same way KVM does it's
> > > async page fault logic, so it's not a new pattern. As soon as the
> > > fault-in is done, we have a page we should be able to use, so we
> > > re-attempt the write immediately. If the write fails, another queue
> > > happens and we could loop, but not like the unmap() case I replied with
> > > earlier.
> > > 
> > > > I would expect that when a page fault is needed, after enqueuing
> > > > work to the
> > > > worker thread, the system call initiating the state change would somehow
> > > > wait for a completion (after releasing the user events mutex). That
> > > > completion would be signaled by the worker thread either if the
> > > > page fault
> > > > fails, or if the state change is done.
> > > > 
> > > 
> > > I didn't see a generic fault-in + notify anywhere, do you know of one I
> > > could use? Otherwise, it seems the pattern used is check fault, fault-in
> > > via workqueue, re-attempt.
> > 
> > I don't think we're talking about the same thing. I'll try stating my
> > concern in a different way.
> > 
> > user_event_enabler_write() calls user_event_enabler_queue_fault() when
> > it cannot perform the enabled state update immediately (because a page
> > fault is needed).
> > 
> > But then AFAIU it returns immediately to the caller. The caller could
> > very well expect that the event has been enabled, as requested,
> > immediately when the enabler write returns. The fact that enabling the
> > event can be delayed for an arbitrary amount of time due to page faults
> > means that we have no hard guarantee that the event is enabled as
> > requested upon return to the caller.
> > 
> > I'd like to add a completion there, to be waited for after
> > user_event_enabler_queue_fault(), but before returning from
> > user_event_enabler_create(). Waiting for the completion should be done
> > without any mutexes held, so after releasing event_mutex.
> > 
> > The completion would be placed on the stack of
> > user_event_enabler_create(), and a reference to the completion would be
> > placed in the queued fault request. The completion notification would be
> > emitted by the worker thread either when enabling is done, or if a page
> > fault fails.
> > 
> > See include/linux/completion.h
> > 
> > Thoughts ?
> 
> Actually, after further thinking, I wonder if we need a worker thread at all
> to perform the page faults.
> 
> Could we simply handle the page faults from user_event_enabler_create() by
> releasing the mutex and re-trying ? From what contexts is
> user_event_enabler_create() called ? (any locks taken ? system call context
> ? irqs and preemption enabled or disabled ?)
> 

The create case is in process context, via the reg IOCTL on the
user_events_data file. The only lock is the register lock; the
event_mutex is taken within the user_event_enabler_create() call to
ensure the enabler can safely be added to the shared user_event within
the group.

It shouldn't be a problem running it synchronously again, or reporting
back that a fault happened and letting the caller decide what to do.

> Thanks,
> 
> Mathieu
> 

[..]

> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com

Thanks,
-Beau
  

Patch

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 633f24c2a1ac..f1eb8101e053 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -81,11 +81,22 @@  struct user_event_enabler {
 	struct list_head link;
 	struct mm_struct *mm;
 	struct file *file;
+	refcount_t refcnt;
 	unsigned long enable_addr;
 	unsigned int enable_bit: 5,
-		     __reserved: 27;
+		     __reserved: 26,
+		     disabled: 1;
 };
 
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+	struct work_struct work;
+	struct user_event_enabler *enabler;
+	struct user_event *event;
+};
+
+static struct kmem_cache *fault_cache;
+
 /*
  * Stores per-event properties, as users register events
  * within a file a user_event might be created if it does not
@@ -236,6 +247,19 @@  static void user_event_enabler_destroy(struct user_event_enabler *enabler)
 	kfree(enabler);
 }
 
+static __always_inline struct user_event_enabler
+*user_event_enabler_get(struct user_event_enabler *enabler)
+{
+	refcount_inc(&enabler->refcnt);
+	return enabler;
+}
+
+static void user_event_enabler_put(struct user_event_enabler *enabler)
+{
+	if (refcount_dec_and_test(&enabler->refcnt))
+		user_event_enabler_destroy(enabler);
+}
+
 static void user_event_enabler_remove(struct file *file,
 				      struct user_event *user)
 {
@@ -249,13 +273,93 @@  static void user_event_enabler_remove(struct file *file,
 		if (enabler->file != file)
 			continue;
 
+		enabler->disabled = 0;
 		list_del(&enabler->link);
-		user_event_enabler_destroy(enabler);
+		user_event_enabler_put(enabler);
 	}
 
 	mutex_unlock(&event_mutex);
 }
 
+static void user_event_enabler_write(struct user_event_enabler *enabler,
+				     struct user_event *user);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+	struct user_event_enabler_fault *fault = container_of(
+		work, struct user_event_enabler_fault, work);
+	struct user_event_enabler *enabler = fault->enabler;
+	struct user_event *user = fault->event;
+	struct mm_struct *mm = enabler->mm;
+	unsigned long uaddr = enabler->enable_addr;
+	bool unlocked = false;
+	int ret;
+
+	might_sleep();
+
+	mmap_read_lock(mm);
+
+	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+			       &unlocked);
+
+	mmap_read_unlock(mm);
+
+	if (ret)
+		pr_warn("user_events: Fixup fault failed with %d "
+			"for mm: 0x%pK offset: 0x%llx event: %s\n", ret, mm,
+			(unsigned long long)uaddr, EVENT_NAME(user));
+
+	/* Prevent state changes from racing */
+	mutex_lock(&event_mutex);
+
+	/*
+	 * If we managed to get the page, re-issue the write. We do not
+	 * want to get into a possible infinite loop, which is why we only
+	 * attempt again directly if the page came in. If we couldn't get
+	 * the page here, then we will try again the next time the event is
+	 * enabled/disabled.
+	 */
+	enabler->disabled = 0;
+
+	if (!ret)
+		user_event_enabler_write(enabler, user);
+
+	mutex_unlock(&event_mutex);
+
+	refcount_dec(&user->refcnt);
+	user_event_enabler_put(enabler);
+	kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_enabler *enabler,
+					   struct user_event *user)
+{
+	struct user_event_enabler_fault *fault;
+
+	fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+	if (!fault)
+		return false;
+
+	INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+	fault->enabler = user_event_enabler_get(enabler);
+	fault->event = user;
+
+	refcount_inc(&user->refcnt);
+	enabler->disabled = 1;
+
+	if (!schedule_work(&fault->work)) {
+		enabler->disabled = 0;
+		refcount_dec(&user->refcnt);
+		user_event_enabler_put(enabler);
+		kmem_cache_free(fault_cache, fault);
+
+		return false;
+	}
+
+	return true;
+}
+
 static void user_event_enabler_write(struct user_event_enabler *enabler,
 				     struct user_event *user)
 {
@@ -266,6 +370,11 @@  static void user_event_enabler_write(struct user_event_enabler *enabler,
 	void *kaddr;
 	int ret;
 
+	lockdep_assert_held(&event_mutex);
+
+	if (unlikely(enabler->disabled))
+		return;
+
 	mmap_read_lock(mm);
 
 	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
@@ -273,8 +382,10 @@  static void user_event_enabler_write(struct user_event_enabler *enabler,
 
 	mmap_read_unlock(mm);
 
-	if (ret <= 0) {
-		pr_warn("user_events: Enable write failed\n");
+	if (unlikely(ret <= 0)) {
+		if (!user_event_enabler_queue_fault(enabler, user))
+			pr_warn("user_events: Unable to queue fault handler\n");
+
 		return;
 	}
 
@@ -321,6 +432,7 @@  static struct user_event_enabler
 	enabler->file = file;
 	enabler->enable_addr = (unsigned long)reg->enable_addr;
 	enabler->enable_bit = reg->enable_bit;
+	refcount_set(&enabler->refcnt, 1);
 
 	/* Prevents state changes from racing with new enablers */
 	mutex_lock(&event_mutex);
@@ -1902,6 +2014,11 @@  static int __init trace_events_user_init(void)
 {
 	int ret;
 
+	fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+	if (!fault_cache)
+		return -ENOMEM;
+
 	init_group = user_event_group_create(&init_user_ns);
 
 	if (!init_group)