[v8,04/11] tracing/user_events: Fixup enable faults asyncly
Commit Message
When events are enabled within the various tracing facilities, such as
ftrace/perf, the event_mutex is held. As events are enabled, pages are
accessed. We do not want page faults to occur under this lock. Instead,
queue the fault to a workqueue so it can be handled in process context
without the lock held.
The enable address is marked faulting while the async fault-in occurs.
This ensures that we don't attempt to fault-in more than is necessary.
Once the page has been faulted in, an address write is re-attempted.
If the page could not be faulted in, we wait until the next time the
event is enabled; this prevents any potential infinite loops.
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
kernel/trace/trace_events_user.c | 120 +++++++++++++++++++++++++++++--
1 file changed, 114 insertions(+), 6 deletions(-)
Comments
On Tue, 21 Feb 2023 13:11:36 -0800
Beau Belgrave <beaub@linux.microsoft.com> wrote:
> @@ -263,7 +277,85 @@ static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr)
> }
>
> static int user_event_enabler_write(struct user_event_mm *mm,
> - struct user_event_enabler *enabler)
> + struct user_event_enabler *enabler,
> + bool fixup_fault);
> +
> +static void user_event_enabler_fault_fixup(struct work_struct *work)
> +{
> + struct user_event_enabler_fault *fault = container_of(
> + work, struct user_event_enabler_fault, work);
> + struct user_event_enabler *enabler = fault->enabler;
> + struct user_event_mm *mm = fault->mm;
> + unsigned long uaddr = enabler->addr;
> + int ret;
> +
> + ret = user_event_mm_fault_in(mm, uaddr);
> +
> + if (ret && ret != -ENOENT) {
> + struct user_event *user = enabler->event;
> +
> + pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
> + mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
> + }
> +
> + /* Prevent state changes from racing */
> + mutex_lock(&event_mutex);
> +
> + /*
> + * If we managed to get the page, re-issue the write. We do not
> + * want to get into a possible infinite loop, which is why we only
> + * attempt again directly if the page came in. If we couldn't get
> + * the page here, then we will try again the next time the event is
> + * enabled/disabled.
> + */
What case would we not get the page? A bad page mapping? User space doing
something silly?
Or something else? And how could it go into an infinite loop? Can that
only happen if userspace is doing something mischievous?
-- Steve
> + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> +
> + if (!ret) {
> + mmap_read_lock(mm->mm);
> + user_event_enabler_write(mm, enabler, true);
> + mmap_read_unlock(mm->mm);
> + }
> +
> + mutex_unlock(&event_mutex);
> +
> + /* In all cases we no longer need the mm or fault */
> + user_event_mm_put(mm);
> + kmem_cache_free(fault_cache, fault);
> +}
> +
> +static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
> + struct user_event_enabler *enabler)
> +{
> + struct user_event_enabler_fault *fault;
> +
> + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
> +
> + if (!fault)
> + return false;
> +
> + INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
> + fault->mm = user_event_mm_get(mm);
> + fault->enabler = enabler;
> +
> + /* Don't try to queue in again while we have a pending fault */
> + set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> +
> + if (!schedule_work(&fault->work)) {
> + /* Allow another attempt later */
> + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> +
> + user_event_mm_put(mm);
> + kmem_cache_free(fault_cache, fault);
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
On Tue, Mar 28, 2023 at 05:20:49PM -0400, Steven Rostedt wrote:
> On Tue, 21 Feb 2023 13:11:36 -0800
> Beau Belgrave <beaub@linux.microsoft.com> wrote:
>
> > @@ -263,7 +277,85 @@ static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr)
> > }
> >
> > static int user_event_enabler_write(struct user_event_mm *mm,
> > - struct user_event_enabler *enabler)
> > + struct user_event_enabler *enabler,
> > + bool fixup_fault);
> > +
> > +static void user_event_enabler_fault_fixup(struct work_struct *work)
> > +{
> > + struct user_event_enabler_fault *fault = container_of(
> > + work, struct user_event_enabler_fault, work);
> > + struct user_event_enabler *enabler = fault->enabler;
> > + struct user_event_mm *mm = fault->mm;
> > + unsigned long uaddr = enabler->addr;
> > + int ret;
> > +
> > + ret = user_event_mm_fault_in(mm, uaddr);
> > +
> > + if (ret && ret != -ENOENT) {
> > + struct user_event *user = enabler->event;
> > +
> > + pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
> > + mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
> > + }
> > +
> > + /* Prevent state changes from racing */
> > + mutex_lock(&event_mutex);
> > +
> > + /*
> > + * If we managed to get the page, re-issue the write. We do not
> > + * want to get into a possible infinite loop, which is why we only
> > + * attempt again directly if the page came in. If we couldn't get
> > + * the page here, then we will try again the next time the event is
> > + * enabled/disabled.
> > + */
>
> What case would we not get the page? A bad page mapping? User space doing
> something silly?
>
A user space program unmapping the page is the most common case I can
think of. A silly action would be unmapping the page while forgetting to
call the unregister IOCTL (roughly like the sketch below). We could then
see this if the event was enabled in perf/ftrace before the process
exited (and the mm got cleaned up).
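As a rough illustration of that silly case (a sketch only, assuming the
v8 user_reg layout from <linux/user_events.h>, a tracefs mount at
/sys/kernel/tracing, and a made-up event name), user space could do
something like:

/*
 * Hypothetical repro: register an enable address inside an mmap()'d
 * page, then unmap it without calling DIAG_IOCSUNREG. A later
 * perf/ftrace enable then has to fault the page back in, and if it
 * cannot, the async fixup gives up instead of looping.
 */
#include <stdint.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/user_events.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/user_events_data", O_RDWR);
	uint32_t *enable = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct user_reg reg = { 0 };

	if (fd < 0 || enable == MAP_FAILED)
		return 1;

	reg.size = sizeof(reg);
	reg.enable_bit = 31;
	reg.enable_size = sizeof(*enable);
	reg.enable_addr = (uint64_t)(uintptr_t)enable;
	reg.name_args = (uint64_t)(uintptr_t)"example_event u32 value";

	if (ioctl(fd, DIAG_IOCSREG, &reg) < 0)
		return 1;

	/*
	 * Forgot DIAG_IOCSUNREG: the kernel still tracks this enabler,
	 * but the page behind the enable address is now gone.
	 */
	munmap(enable, 4096);
	pause();
	return 0;
}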
> Or something else? And how could it go into an infinite loop? Can that
> only happen if userspace is doing something mischievous?
>
I'm not sure whether changing page permissions on the user side would
prevent a write-permitted mapping in the kernel, but I wanted to ensure
that, if that type of thing did occur, we wouldn't loop forever. The code
lets the mm decide whether a page is ever coming in via
fixup_user_fault() with FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE set.
My understanding is that fixup_user_fault() will keep retrying to get
the page until it has decided whether the page is capable of coming in
or not. It does this because we pass the unlocked bool as a parameter. I
used fixup_user_fault() since it was created for the futex code to
handle this scenario better.
From what I gather, the fault-in should only fail for these reasons:

#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |	\
			VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |	\
			VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

If these are hit, I don't believe we want to retry as they aren't likely
to ever get corrected.
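For reference, a minimal sketch of the fault-in helper as described
above (not a verbatim copy of the earlier patch in this series, and
omitting the tasks refcount check) looks roughly like:

static int user_event_mm_fault_in(struct user_event_mm *mm,
				  unsigned long uaddr)
{
	bool unlocked;
	int ret;

	mmap_read_lock(mm->mm);

	/*
	 * Let the mm decide whether the page can ever come in; passing
	 * unlocked lets fixup_user_fault() retry internally.
	 */
	ret = fixup_user_fault(mm->mm, uaddr,
			       FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
			       &unlocked);

	mmap_read_unlock(mm->mm);

	return ret;
}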
Thanks,
-Beau
> -- Steve
>
>
> > + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> > +
> > + if (!ret) {
> > + mmap_read_lock(mm->mm);
> > + user_event_enabler_write(mm, enabler, true);
> > + mmap_read_unlock(mm->mm);
> > + }
> > +
> > + mutex_unlock(&event_mutex);
> > +
> > + /* In all cases we no longer need the mm or fault */
> > + user_event_mm_put(mm);
> > + kmem_cache_free(fault_cache, fault);
> > +}
> > +
> > +static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
> > + struct user_event_enabler *enabler)
> > +{
> > + struct user_event_enabler_fault *fault;
> > +
> > + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
> > +
> > + if (!fault)
> > + return false;
> > +
> > + INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
> > + fault->mm = user_event_mm_get(mm);
> > + fault->enabler = enabler;
> > +
> > + /* Don't try to queue in again while we have a pending fault */
> > + set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> > +
> > + if (!schedule_work(&fault->work)) {
> > + /* Allow another attempt later */
> > + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
> > +
> > + user_event_mm_put(mm);
> > + kmem_cache_free(fault_cache, fault);
> > +
> > + return false;
> > + }
> > +
> > + return true;
> > +}
> > +
@@ -99,9 +99,23 @@ struct user_event_enabler {
/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
#define ENABLE_VAL_BIT_MASK 0x3F
+/* Bit 6 is for faulting status of enablement */
+#define ENABLE_VAL_FAULTING_BIT 6
+
/* Only duplicate the bit value */
#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values)
+
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+ struct work_struct work;
+ struct user_event_mm *mm;
+ struct user_event_enabler *enabler;
+};
+
+static struct kmem_cache *fault_cache;
+
/* Global list of memory descriptors using user_events */
static LIST_HEAD(user_event_mms);
static DEFINE_SPINLOCK(user_event_mms_lock);
@@ -263,7 +277,85 @@ static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr)
}
static int user_event_enabler_write(struct user_event_mm *mm,
- struct user_event_enabler *enabler)
+ struct user_event_enabler *enabler,
+ bool fixup_fault);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+ struct user_event_enabler_fault *fault = container_of(
+ work, struct user_event_enabler_fault, work);
+ struct user_event_enabler *enabler = fault->enabler;
+ struct user_event_mm *mm = fault->mm;
+ unsigned long uaddr = enabler->addr;
+ int ret;
+
+ ret = user_event_mm_fault_in(mm, uaddr);
+
+ if (ret && ret != -ENOENT) {
+ struct user_event *user = enabler->event;
+
+ pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
+ }
+
+ /* Prevent state changes from racing */
+ mutex_lock(&event_mutex);
+
+ /*
+ * If we managed to get the page, re-issue the write. We do not
+ * want to get into a possible infinite loop, which is why we only
+ * attempt again directly if the page came in. If we couldn't get
+ * the page here, then we will try again the next time the event is
+ * enabled/disabled.
+ */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!ret) {
+ mmap_read_lock(mm->mm);
+ user_event_enabler_write(mm, enabler, true);
+ mmap_read_unlock(mm->mm);
+ }
+
+ mutex_unlock(&event_mutex);
+
+ /* In all cases we no longer need the mm or fault */
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
+ struct user_event_enabler *enabler)
+{
+ struct user_event_enabler_fault *fault;
+
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+ if (!fault)
+ return false;
+
+ INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+ fault->mm = user_event_mm_get(mm);
+ fault->enabler = enabler;
+
+ /* Don't try to queue in again while we have a pending fault */
+ set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!schedule_work(&fault->work)) {
+ /* Allow another attempt later */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+
+ return false;
+ }
+
+ return true;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault)
{
unsigned long uaddr = enabler->addr;
unsigned long *ptr;
@@ -278,11 +370,19 @@ static int user_event_enabler_write(struct user_event_mm *mm,
if (refcount_read(&mm->tasks) == 0)
return -ENOENT;
+ if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))))
+ return -EBUSY;
+
ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
&page, NULL, NULL);
- if (ret <= 0) {
- pr_warn("user_events: Enable write failed\n");
+ if (unlikely(ret <= 0)) {
+ if (!fixup_fault)
+ return -EFAULT;
+
+ if (!user_event_enabler_queue_fault(mm, enabler))
+ pr_warn("user_events: Unable to queue fault handler\n");
+
return -EFAULT;
}
@@ -314,7 +414,7 @@ static void user_event_enabler_update(struct user_event *user)
list_for_each_entry_rcu(enabler, &mm->enablers, link)
if (enabler->event == user)
- user_event_enabler_write(mm, enabler);
+ user_event_enabler_write(mm, enabler, true);
rcu_read_unlock();
mmap_read_unlock(mm->mm);
@@ -562,7 +662,7 @@ static struct user_event_enabler
/* Attempt to reflect the current state within the process */
mmap_read_lock(user_mm->mm);
- *write_result = user_event_enabler_write(user_mm, enabler);
+ *write_result = user_event_enabler_write(user_mm, enabler, false);
mmap_read_unlock(user_mm->mm);
/*
@@ -2201,16 +2301,24 @@ static int __init trace_events_user_init(void)
{
int ret;
+ fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+ if (!fault_cache)
+ return -ENOMEM;
+
init_group = user_event_group_create(&init_user_ns);
- if (!init_group)
+ if (!init_group) {
+ kmem_cache_destroy(fault_cache);
return -ENOMEM;
+ }
ret = create_user_tracefs();
if (ret) {
pr_warn("user_events could not register with tracefs\n");
user_event_group_destroy(init_group);
+ kmem_cache_destroy(fault_cache);
init_group = NULL;
return ret;
}