[RFC,1/2] tracing/user_events: Use remote writes for event enablement
Commit Message
As part of the discussions for user_events aligned with user space
tracers, it was determined that user programs should register a 32-bit
value to set or clear a bit when an event becomes enabled. Currently a
shared page is being used that requires mmap(). Remove the shared page
implementation and move to a user registered address implementation.
In this new model during the event registration from user programs 2 new
values are specified. The first is the address to update when the event
is either enabled or disabled. The second is the bit to set/clear to
reflect the event being enabled. This allows for a local 32-bit value in
user programs to support both kernel and user tracers. As an example,
setting bit 31 for kernel tracers when the event becomes enabled allows
for user tracers to use the other bits for ref counts or other flags.
The kernel side updates the bit atomically, user programs need to also
update these values atomically.
User-provided addresses must be aligned on a 32-bit boundary; this
allows for single page checking and prevents odd behaviors such as a
32-bit value straddling 2 pages instead of a single page. Currently
page faults are only logged; future patches will handle these.
NOTE:
User programs that wish to have the enable bit shared across forks
either need to use a MAP_SHARED allocated address or register a new
address and file descriptor. If MAP_SHARED cannot be used or new
registrations cannot be done, then it's allowable to use MAP_PRIVATE
as long as the forked children never update the page themselves. Once
the page has been updated, the page from the parent will be copied over
to the child. This new copy-on-write page will not receive updates from
the kernel until another registration has been performed with this new
address.
Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
include/linux/user_events.h | 10 +-
kernel/trace/trace_events_user.c | 279 ++++++++++++++++---------------
2 files changed, 153 insertions(+), 136 deletions(-)
Comments
On 2022-10-27 18:40, Beau Belgrave wrote:
[...]
> diff --git a/include/linux/user_events.h b/include/linux/user_events.h
> index 592a3fbed98e..4c3bd16395a9 100644
> --- a/include/linux/user_events.h
> +++ b/include/linux/user_events.h
> @@ -33,12 +33,16 @@ struct user_reg {
> /* Input: Size of the user_reg structure being used */
> __u32 size;
>
> + /* Input: Flags/common settings */
> + __u32 enable_bit : 5, /* Bit in enable address to use (0-31) */
> + __reserved : 27;
I'm always worried about using C/C++ bitfields in uapi, because some
compilers (e.g. MS Windows Compiler) have different implementation of
the bitfields. See
gcc(1)
-Wno-packed-bitfield-compat
-mms-bitfields / -mno-ms-bitfields
Thanks,
Mathieu
> +
> + /* Input: Address to update when enabled */
> + __u64 enable_addr;
> +
> /* Input: Pointer to string with event name, description and flags */
> __u64 name_args;
>
> - /* Output: Bitwise index of the event within the status page */
> - __u32 status_bit;
> -
> /* Output: Index of the event to use when writing data */
> __u32 write_index;
> } __attribute__((__packed__));
> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
Hi,
I have some comments.
On Thu, 27 Oct 2022 15:40:10 -0700
Beau Belgrave <beaub@linux.microsoft.com> wrote:
[...]
> @@ -1570,11 +1610,12 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> * Registers a user_event on behalf of a user process.
> */
> static long user_events_ioctl_reg(struct user_event_file_info *info,
> - unsigned long uarg)
> + struct file *file, unsigned long uarg)
> {
> struct user_reg __user *ureg = (struct user_reg __user *)uarg;
> struct user_reg reg;
> struct user_event *user;
> + struct user_event_enabler *enabler;
> char *name;
> long ret;
>
> @@ -1607,8 +1648,12 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
> if (ret < 0)
> return ret;
>
> + enabler = user_event_enabler_create(file, &reg, user);
> +
> + if (!enabler)
Shouldn't we free the user_event if needed here?
(I found the similar memory leak pattern in the above failure case
of the user_events_ref_add().)
> + return -ENOMEM;
> +
> put_user((u32)ret, &ureg->write_index);
> - put_user(user->index, &ureg->status_bit);
>
> return 0;
> }
[...]
> @@ -1849,7 +1863,6 @@ static int user_status_open(struct inode *node, struct file *file)
>
> static const struct file_operations user_status_fops = {
> .open = user_status_open,
> - .mmap = user_status_mmap,
So, if this drops the mmap operation, can we drop the writable flag from
the status tracefs file?
static int create_user_tracefs(void)
{
[...]
/* mmap with MAP_SHARED requires writable fd */
emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
NULL, NULL, &user_status_fops);
Thank you,
On Sat, Oct 29, 2022 at 10:44:32AM -0400, Mathieu Desnoyers wrote:
> On 2022-10-27 18:40, Beau Belgrave wrote:
>
> [...]
> > diff --git a/include/linux/user_events.h b/include/linux/user_events.h
> > index 592a3fbed98e..4c3bd16395a9 100644
> > --- a/include/linux/user_events.h
> > +++ b/include/linux/user_events.h
> > @@ -33,12 +33,16 @@ struct user_reg {
> > /* Input: Size of the user_reg structure being used */
> > __u32 size;
> > + /* Input: Flags/common settings */
> > + __u32 enable_bit : 5, /* Bit in enable address to use (0-31) */
> > + __reserved : 27;
>
> I'm always worried about using C/C++ bitfields in uapi, because some
> compilers (e.g. MS Windows Compiler) have different implementation of the
> bitfields. See
>
> gcc(1)
>
> -Wno-packed-bitfield-compat
> -mms-bitfields / -mno-ms-bitfields
>
Sure, I'll change this to a __u8.
> Thanks,
>
> Mathieu
>
> > +
> > + /* Input: Address to update when enabled */
> > + __u64 enable_addr;
> > +
> > /* Input: Pointer to string with event name, description and flags */
> > __u64 name_args;
> > - /* Output: Bitwise index of the event within the status page */
> > - __u32 status_bit;
> > -
> > /* Output: Index of the event to use when writing data */
> > __u32 write_index;
> > } __attribute__((__packed__));
> > diff --git a/kernel/trace/trace_events_user.c
> > b/kernel/trace/trace_events_user.c
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com
On Mon, Oct 31, 2022 at 11:47:03PM +0900, Masami Hiramatsu wrote:
> Hi,
>
> I have some comments.
>
> On Thu, 27 Oct 2022 15:40:10 -0700
> Beau Belgrave <beaub@linux.microsoft.com> wrote:
>
> [...]
> > @@ -1570,11 +1610,12 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> > * Registers a user_event on behalf of a user process.
> > */
> > static long user_events_ioctl_reg(struct user_event_file_info *info,
> > - unsigned long uarg)
> > + struct file *file, unsigned long uarg)
> > {
> > struct user_reg __user *ureg = (struct user_reg __user *)uarg;
> > struct user_reg reg;
> > struct user_event *user;
> > + struct user_event_enabler *enabler;
> > char *name;
> > long ret;
> >
> > @@ -1607,8 +1648,12 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
> > if (ret < 0)
> > return ret;
> >
> > + enabler = user_event_enabler_create(file, &reg, user);
> > +
> > + if (!enabler)
>
> Shouldn't we free the user_event if needed here?
> (I found the similar memory leak pattern in the above failure case
> of the user_events_ref_add().)
>
user_events are shared across the entire group. They cannot be cleaned
up until all references are gone. This is true both in this case and
in the user_events_ref_add() case.
The pattern is to register events in the group's hashtable, then add
them to the local file ref array that is RCU protected. If the file ref
cannot be allocated, etc. the refcount on user is decremented. If we
cannot create an enabler, the refcount is still held until file release.
If the event has already been added to the local file ref array, it is
returned to prevent another reference.
> > + return -ENOMEM;
> > +
> > put_user((u32)ret, &ureg->write_index);
> > - put_user(user->index, &ureg->status_bit);
> >
> > return 0;
> > }
> [...]
> > @@ -1849,7 +1863,6 @@ static int user_status_open(struct inode *node, struct file *file)
> >
> > static const struct file_operations user_status_fops = {
> > .open = user_status_open,
> > - .mmap = user_status_mmap,
>
> So, if this drops the mmap operation, can we drop the writable flag from
> the status tracefs file?
>
Good catch, yes I'll remove this.
> static int create_user_tracefs(void)
> {
> [...]
> /* mmap with MAP_SHARED requires writable fd */
> emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
> NULL, NULL, &user_status_fops);
>
> Thank you,
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
Thanks,
-Beau
On Mon, 31 Oct 2022 09:46:03 -0700
Beau Belgrave <beaub@linux.microsoft.com> wrote:
> On Mon, Oct 31, 2022 at 11:47:03PM +0900, Masami Hiramatsu wrote:
> > Hi,
> >
> > I have some comments.
> >
> > On Thu, 27 Oct 2022 15:40:10 -0700
> > Beau Belgrave <beaub@linux.microsoft.com> wrote:
> >
> > [...]
> > > @@ -1570,11 +1610,12 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> > > * Registers a user_event on behalf of a user process.
> > > */
> > > static long user_events_ioctl_reg(struct user_event_file_info *info,
> > > - unsigned long uarg)
> > > + struct file *file, unsigned long uarg)
> > > {
> > > struct user_reg __user *ureg = (struct user_reg __user *)uarg;
> > > struct user_reg reg;
> > > struct user_event *user;
> > > + struct user_event_enabler *enabler;
> > > char *name;
> > > long ret;
> > >
> > > @@ -1607,8 +1648,12 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
> > > if (ret < 0)
> > > return ret;
> > >
> > > + enabler = user_event_enabler_create(file, &reg, user);
> > > +
> > > + if (!enabler)
> >
> > Shouldn't we free the user_event if needed here?
> > (I found the similar memory leak pattern in the above failure case
> > of the user_events_ref_add().)
> >
>
> user_events are shared across the entire group. They cannot be cleaned
> up until all references are gone. This is true both in this case and
> in the user_events_ref_add() case.
>
> The pattern is to register events in the group's hashtable, then add
> them to the local file ref array that is RCU protected. If the file ref
> cannot be allocated, etc. the refcount on user is decremented. If we
> cannot create an enabler, the refcount is still held until file release.
OK, when the ioctl returns, there should be 3 cases;
- Return success, a new(existing) user_event added.
- Return error, no new user_event added.
- Return error, a new user_event added but enabler is not initialized.
And in the last case, the new user_event will be released when user
closes the file. Could you comment it here?
>
> If the event has already been added to the local file ref array, it is
> returned to prevent another reference.
I'm not sure about this point. Did you mean returning an error to prevent
registering the same event again?
>
> > > + return -ENOMEM;
> > > +
> > > put_user((u32)ret, &ureg->write_index);
> > > - put_user(user->index, &ureg->status_bit);
> > >
> > > return 0;
> > > }
> > [...]
> > > @@ -1849,7 +1863,6 @@ static int user_status_open(struct inode *node, struct file *file)
> > >
> > > static const struct file_operations user_status_fops = {
> > > .open = user_status_open,
> > > - .mmap = user_status_mmap,
> >
> > So, if this drops the mmap operation, can we drop the writable flag from
> > the status tracefs file?
> >
>
> Good catch, yes I'll remove this.
Thanks!
>
> > static int create_user_tracefs(void)
> > {
> > [...]
> > /* mmap with MAP_SHARED requires writable fd */
> > emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
> > NULL, NULL, &user_status_fops);
> >
> > Thank you,
> >
> > --
> > Masami Hiramatsu (Google) <mhiramat@kernel.org>
>
> Thanks,
> -Beau
On Tue, Nov 01, 2022 at 08:55:01AM +0900, Masami Hiramatsu wrote:
> On Mon, 31 Oct 2022 09:46:03 -0700
> Beau Belgrave <beaub@linux.microsoft.com> wrote:
>
> > On Mon, Oct 31, 2022 at 11:47:03PM +0900, Masami Hiramatsu wrote:
> > > Hi,
> > >
> > > I have some comments.
> > >
> > > On Thu, 27 Oct 2022 15:40:10 -0700
> > > Beau Belgrave <beaub@linux.microsoft.com> wrote:
> > >
> > > [...]
> > > > @@ -1570,11 +1610,12 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> > > > * Registers a user_event on behalf of a user process.
> > > > */
> > > > static long user_events_ioctl_reg(struct user_event_file_info *info,
> > > > - unsigned long uarg)
> > > > + struct file *file, unsigned long uarg)
> > > > {
> > > > struct user_reg __user *ureg = (struct user_reg __user *)uarg;
> > > > struct user_reg reg;
> > > > struct user_event *user;
> > > > + struct user_event_enabler *enabler;
> > > > char *name;
> > > > long ret;
> > > >
> > > > @@ -1607,8 +1648,12 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
> > > > if (ret < 0)
> > > > return ret;
> > > >
> > > > + enabler = user_event_enabler_create(file, &reg, user);
> > > > +
> > > > + if (!enabler)
> > >
> > > Shouldn't we free the user_event if needed here?
> > > (I found the similar memory leak pattern in the above failure case
> > > of the user_events_ref_add().)
> > >
> >
> > user_events are shared across the entire group. They cannot be cleaned
> > up until all references are gone. This is true both in this case and
> > in the user_events_ref_add() case.
> >
> > The pattern is to register events in the group's hashtable, then add
> > them to the local file ref array that is RCU protected. If the file ref
> > cannot be allocated, etc. the refcount on user is decremented. If we
> > cannot create an enabler, the refcount is still held until file release.
>
> OK, when the ioctl returns, there should be 3 cases;
>
> - Return success, a new(existing) user_event added.
> - Return error, no new user_event added.
> - Return error, a new user_event added but enabler is not initialized.
>
> And in the last case, the new user_event will be released when user
> closes the file. Could you comment it here?
>
Sure thing, I'll add it.
> >
> > If the event has already been added to the local file ref array, it is
> > returned to prevent another reference.
>
> I'm not sure about this point. Did you mean returning an error to prevent
> registering the same event again?
>
If a process uses the same fd and registers the same event multiple
times, then only 1 index is returned to the caller. If someone either
purposely or accidentally does this, the appropriate index will be
returned and we won't just keep allocating user_event objects.
However, in a threaded application, there may be situations where thread
A registers event E, and thread B registers event E as well. However,
they may pass different enable address locations. In this case you will
get 2 enablers for the single event. Right now the code does this all
the time, it does not do it only if the enable address differs.
I'm not sure how common this scenario is, but I think I should likely
check if the address is different or not before creating another enabler
in this case.
Thoughts?
>
> >
> > > > + return -ENOMEM;
> > > > +
> > > > put_user((u32)ret, &ureg->write_index);
> > > > - put_user(user->index, &ureg->status_bit);
> > > >
> > > > return 0;
> > > > }
> > > [...]
> > > > @@ -1849,7 +1863,6 @@ static int user_status_open(struct inode *node, struct file *file)
> > > >
> > > > static const struct file_operations user_status_fops = {
> > > > .open = user_status_open,
> > > > - .mmap = user_status_mmap,
> > >
> > > So, if this drops the mmap operation, can we drop the writable flag from
> > > the status tracefs file?
> > >
> >
> > Good catch, yes I'll remove this.
>
> Thanks!
>
> >
> > > static int create_user_tracefs(void)
> > > {
> > > [...]
> > > /* mmap with MAP_SHARED requires writable fd */
> > > emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
> > > NULL, NULL, &user_status_fops);
> > >
> > > Thank you,
> > >
> > > --
> > > Masami Hiramatsu (Google) <mhiramat@kernel.org>
> >
> > Thanks,
> > -Beau
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
Thanks,
-Beau
@@ -33,12 +33,16 @@ struct user_reg {
/* Input: Size of the user_reg structure being used */
__u32 size;
+ /* Input: Flags/common settings */
+ __u32 enable_bit : 5, /* Bit in enable address to use (0-31) */
+ __reserved : 27;
+
+ /* Input: Address to update when enabled */
+ __u64 enable_addr;
+
/* Input: Pointer to string with event name, description and flags */
__u64 name_args;
- /* Output: Bitwise index of the event within the status page */
- __u32 status_bit;
-
/* Output: Index of the event to use when writing data */
__u32 write_index;
} __attribute__((__packed__));
@@ -19,6 +19,9 @@
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
+#include <linux/mmap_lock.h>
+#include <linux/highmem.h>
/* Reminder to move to uapi when everything works */
#ifdef CONFIG_COMPILE_TEST
#include <linux/user_events.h>
@@ -34,34 +37,11 @@
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2
-/*
- * Limits how many trace_event calls user processes can create:
- * Must be a power of two of PAGE_SIZE.
- */
-#define MAX_PAGE_ORDER 0
-#define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
-#define MAX_EVENTS (MAX_BYTES * 8)
-
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
-/*
- * The MAP_STATUS_* macros are used for taking a index and determining the
- * appropriate byte and the bit in the byte to set/reset for an event.
- *
- * The lower 3 bits of the index decide which bit to set.
- * The remaining upper bits of the index decide which byte to use for the bit.
- *
- * This is used when an event has a probe attached/removed to reflect live
- * status of the event wanting tracing or not to user-programs via shared
- * memory maps.
- */
-#define MAP_STATUS_BYTE(index) ((index) >> 3)
-#define MAP_STATUS_MASK(index) BIT((index) & 7)
-
/*
* Internal bits (kernel side only) to keep track of connected probes:
* These are used when status is requested in text form about an event. These
@@ -75,25 +55,37 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
- * Stores the pages, tables, and locks for a group of events.
- * Each logical grouping of events has its own group, with a
- * matching page for status checks within user programs. This
- * allows for isolation of events to user programs by various
- * means.
+ * Stores the system name, tables, and locks for a group of events. This
+ * allows isolation for events by various means.
*/
struct user_event_group {
- struct page *pages;
- char *register_page_data;
char *system_name;
struct hlist_node node;
struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
- DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
};
/* Group for init_user_ns mapping, top-most group */
static struct user_event_group *init_group;
+/*
+ * Describes where to change a bit when an event becomes
+ * enabled/disabled. These are chained together to enable
+ * many processes being notified when an event changes. These
+ * have a lifetime tied to the data files that are used to
+ * register them. When these go away the ref count to the mm_struct
+ * is decremented to ensure the mm_struct lifetime lasts as long as
+ * required for the enable bit set/clear.
+ */
+struct user_event_enabler {
+ struct list_head link;
+ struct mm_struct *mm;
+ struct file *file;
+ unsigned long enable_addr;
+ unsigned int enable_bit: 5,
+ __reserved: 27;
+};
+
/*
* Stores per-event properties, as users register events
* within a file a user_event might be created if it does not
@@ -110,8 +102,8 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
+ struct list_head enablers;
refcount_t refcnt;
- int index;
int flags;
int min_size;
char status;
@@ -155,28 +147,8 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
-static void set_page_reservations(char *pages, bool set)
-{
- int page;
-
- for (page = 0; page < MAX_PAGES; ++page) {
- void *addr = pages + (PAGE_SIZE * page);
-
- if (set)
- SetPageReserved(virt_to_page(addr));
- else
- ClearPageReserved(virt_to_page(addr));
- }
-}
-
static void user_event_group_destroy(struct user_event_group *group)
{
- if (group->register_page_data)
- set_page_reservations(group->register_page_data, false);
-
- if (group->pages)
- __free_pages(group->pages, MAX_PAGE_ORDER);
-
kfree(group->system_name);
kfree(group);
}
@@ -247,19 +219,6 @@ static struct user_event_group
if (!group->system_name)
goto error;
- group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-
- if (!group->pages)
- goto error;
-
- group->register_page_data = page_address(group->pages);
-
- set_page_reservations(group->register_page_data, true);
-
- /* Zero all bits beside 0 (which is reserved for failures) */
- bitmap_zero(group->page_bitmap, MAX_EVENTS);
- set_bit(0, group->page_bitmap);
-
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -271,20 +230,107 @@ static struct user_event_group
return NULL;
};
-static __always_inline
-void user_event_register_set(struct user_event *user)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler)
{
- int i = user->index;
+ mmdrop(enabler->mm);
+ kfree(enabler);
+}
+
+static void user_event_enabler_remove(struct file *file,
+ struct user_event *user)
+{
+ struct user_event_enabler *enabler, *next;
+ struct list_head *head = &user->enablers;
+
+ /* Prevent racing with status changes and new events */
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry_safe(enabler, next, head, link) {
+ if (enabler->file != file)
+ continue;
+
+ list_del(&enabler->link);
+ user_event_enabler_destroy(enabler);
+ }
+
+ mutex_unlock(&event_mutex);
+}
+
+static void user_event_enabler_write(struct user_event_enabler *enabler,
+ struct user_event *user)
+{
+ struct mm_struct *mm = enabler->mm;
+ unsigned long uaddr = enabler->enable_addr;
+ unsigned long *ptr;
+ struct page *page;
+ void *kaddr;
+ int ret;
+
+ mmap_read_lock(mm);
+
+ ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
+ &page, NULL, NULL);
+
+ mmap_read_unlock(mm);
+
+ if (ret <= 0) {
+ pr_warn("user_events: Enable write failed\n");
+ return;
+ }
+
+ kaddr = kmap_local_page(page);
+ ptr = kaddr + (uaddr & ~PAGE_MASK);
+
+ if (user->status)
+ set_bit(enabler->enable_bit, ptr);
+ else
+ clear_bit(enabler->enable_bit, ptr);
- user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+ kunmap_local(kaddr);
+ unpin_user_pages_dirty_lock(&page, 1, true);
}
-static __always_inline
-void user_event_register_clear(struct user_event *user)
+static void user_event_enabler_update(struct user_event *user)
{
- int i = user->index;
+ struct list_head *head = &user->enablers;
+ struct user_event_enabler *enabler;
- user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+ list_for_each_entry(enabler, head, link)
+ user_event_enabler_write(enabler, user);
+}
+
+static struct user_event_enabler
+*user_event_enabler_create(struct file *file, struct user_reg *reg,
+ struct user_event *user)
+{
+ struct user_event_enabler *enabler;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_KERNEL);
+
+ if (!enabler)
+ return NULL;
+
+ /*
+ * This is grabbed for accounting purposes. This is to ensure if a
+ * process exits before the file is released a valid memory descriptor
+ * will exist for the enabler.
+ */
+ mmgrab(current->mm);
+
+ enabler->mm = current->mm;
+ enabler->file = file;
+ enabler->enable_addr = (unsigned long)reg->enable_addr;
+ enabler->enable_bit = reg->enable_bit;
+
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ list_add(&enabler->link, &user->enablers);
+ user_event_enabler_write(enabler, user);
+
+ mutex_unlock(&event_mutex);
+
+ return enabler;
}
static __always_inline __must_check
@@ -829,9 +875,6 @@ static int destroy_user_event(struct user_event *user)
return ret;
dyn_event_remove(&user->devent);
-
- user_event_register_clear(user);
- clear_bit(user->index, user->group->page_bitmap);
hash_del(&user->node);
user_event_destroy_validators(user);
@@ -977,9 +1020,9 @@ static void user_event_perf(struct user_event *user, struct iov_iter *i,
#endif
/*
- * Update the register page that is shared between user processes.
+ * Update the enabled bit among all user processes.
*/
-static void update_reg_page_for(struct user_event *user)
+static void update_enable_bit_for(struct user_event *user)
{
struct tracepoint *tp = &user->tracepoint;
char status = 0;
@@ -1010,12 +1053,9 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}
- if (status)
- user_event_register_set(user);
- else
- user_event_register_clear(user);
-
user->status = status;
+
+ user_event_enabler_update(user);
}
/*
@@ -1072,10 +1112,10 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
refcount_inc(&user->refcnt);
- update_reg_page_for(user);
+ update_enable_bit_for(user);
return 0;
dec:
- update_reg_page_for(user);
+ update_enable_bit_for(user);
refcount_dec(&user->refcnt);
return 0;
}
@@ -1269,7 +1309,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
struct user_event **newuser)
{
int ret;
- int index;
u32 key;
struct user_event *user;
@@ -1288,11 +1327,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
return 0;
}
- index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
-
- if (index == MAX_EVENTS)
- return -EMFILE;
-
user = kzalloc(sizeof(*user), GFP_KERNEL);
if (!user)
@@ -1301,6 +1335,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
INIT_LIST_HEAD(&user->class.fields);
INIT_LIST_HEAD(&user->fields);
INIT_LIST_HEAD(&user->validators);
+ INIT_LIST_HEAD(&user->enablers);
user->group = group;
user->tracepoint.name = name;
@@ -1338,14 +1373,11 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (ret)
goto put_user_lock;
- user->index = index;
-
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
- set_bit(user->index, group->page_bitmap);
hash_add(group->register_table, &user->node, key);
mutex_unlock(&event_mutex);
@@ -1561,6 +1593,14 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
+ /* Ensure natural alignment and sanity check on max bit */
+ if (kreg->enable_addr % sizeof(__u32) || kreg->enable_bit > 31)
+ return -EINVAL;
+
+ /* Ensure accessible */
+ if (!access_ok((const void __user *)kreg->enable_addr, sizeof(__u32)))
+ return -EFAULT;
+
kreg->size = size;
return 0;
@@ -1570,11 +1610,12 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
* Registers a user_event on behalf of a user process.
*/
static long user_events_ioctl_reg(struct user_event_file_info *info,
- unsigned long uarg)
+ struct file *file, unsigned long uarg)
{
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
struct user_reg reg;
struct user_event *user;
+ struct user_event_enabler *enabler;
char *name;
long ret;
@@ -1607,8 +1648,12 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
if (ret < 0)
return ret;
+ enabler = user_event_enabler_create(file, &reg, user);
+
+ if (!enabler)
+ return -ENOMEM;
+
put_user((u32)ret, &ureg->write_index);
- put_user(user->index, &ureg->status_bit);
return 0;
}
@@ -1651,7 +1696,7 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case DIAG_IOCSREG:
mutex_lock(&group->reg_mutex);
- ret = user_events_ioctl_reg(info, uarg);
+ ret = user_events_ioctl_reg(info, file, uarg);
mutex_unlock(&group->reg_mutex);
break;
@@ -1700,8 +1745,10 @@ static int user_events_release(struct inode *node, struct file *file)
for (i = 0; i < refs->count; ++i) {
user = refs->events[i];
- if (user)
+ if (user) {
+ user_event_enabler_remove(file, user);
refcount_dec(&user->refcnt);
+ }
}
out:
file->private_data = NULL;
@@ -1722,38 +1769,6 @@ static const struct file_operations user_data_fops = {
.release = user_events_release,
};
-static struct user_event_group *user_status_group(struct file *file)
-{
- struct seq_file *m = file->private_data;
-
- if (!m)
- return NULL;
-
- return m->private;
-}
-
-/*
- * Maps the shared page into the user process for checking if event is enabled.
- */
-static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
-{
- char *pages;
- struct user_event_group *group = user_status_group(file);
- unsigned long size = vma->vm_end - vma->vm_start;
-
- if (size != MAX_BYTES)
- return -EINVAL;
-
- if (!group)
- return -EINVAL;
-
- pages = group->register_page_data;
-
- return remap_pfn_range(vma, vma->vm_start,
- virt_to_phys(pages) >> PAGE_SHIFT,
- size, vm_get_page_prot(VM_READ));
-}
-
static void *user_seq_start(struct seq_file *m, loff_t *pos)
{
if (*pos)
@@ -1788,7 +1803,7 @@ static int user_seq_show(struct seq_file *m, void *p)
status = user->status;
flags = user->flags;
- seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_NAME(user));
if (flags != 0 || status != 0)
seq_puts(m, " #");
@@ -1813,7 +1828,6 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_puts(m, "\n");
seq_printf(m, "Active: %d\n", active);
seq_printf(m, "Busy: %d\n", busy);
- seq_printf(m, "Max: %ld\n", MAX_EVENTS);
return 0;
}
@@ -1849,7 +1863,6 @@ static int user_status_open(struct inode *node, struct file *file)
static const struct file_operations user_status_fops = {
.open = user_status_open,
- .mmap = user_status_mmap,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,