[1/2] ring-buffer: Introducing ring-buffer mapping functions
Commit Message
In preparation for allowing user-space to map a ring-buffer, add
a set of mapping functions:
ring_buffer_{map,unmap}()
ring_buffer_map_fault()
And controls on the ring-buffer:
ring_buffer_get_reader_page() /* swap reader and head */
ring_buffer_update_meta_page()
Mapping the ring-buffer also involves:
A unique ID for each page of the ring-buffer, as currently the pages
are only identified through their in-kernel VA.
A meta-page, which stores statistics about the ring-buffer and an
ordered list of page IDs. One field indicates which page is the
reader page, and another gives where the ring-buffer starts in the
list of data pages.
The linear mapping exposes the meta-page, and each page of the
ring-buffer, ordered following their unique ID, assigned during the
first mapping.
Once mapped, no page can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice enabling functions will in
reality simply memcpy the data instead of swapping pages.
Also, because the meta-page is a single page, this currently limits the
number of ring-buffer pages that can be mapped: ~3MB on a system with 4K
pages.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Comments
On Fri, 17 Mar 2023 14:33:09 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:
> Also, the meta-page being... a single page, this limits at the moment the
> number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> system.
I hate this limitation, so I fixed it ;-)
I added a meta_page_size field to the meta page, and user space can do:
meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
if (meta == MAP_FAILED)
pdie("mmap");
map = meta;
meta_len = map->meta_page_size;
if (meta_len > page_size) {
munmap(meta, page_size);
meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
if (meta == MAP_FAILED)
pdie("mmap");
map = meta;
}
This appears to work (but I'm still testing it).
-- Steve
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
+ __u32 meta_page_size; /* size of the meta page */
__u32 data_pages[];
};
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..77c92e4a7adc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
int mapped;
+ int meta_order;
struct mutex mapping_lock;
unsigned long *page_ids; /* ID to addr */
struct ring_buffer_meta_page *meta_page;
@@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
#define META_PAGE_MAX_PAGES \
- ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5908,22 +5909,34 @@ static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct page *meta_pages;
+ int pages;
+ int order = 0;
+
if (cpu_buffer->meta_page)
return 0;
- if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
- return -E2BIG;
-
- cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
- if (!cpu_buffer->meta_page)
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+ /* Calculate how many more pages we need to hold indexes */
+ pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+ PAGE_SIZE / sizeof(u32));
+ /* Add back the meta_page itself */
+ pages++;
+ order = fls(pages) - 1;
+ }
+ meta_pages = alloc_pages(GFP_USER, order);
+ if (!meta_pages)
return -ENOMEM;
+ cpu_buffer->meta_page = page_to_virt(meta_pages);
+ cpu_buffer->meta_order = order;
+
return 0;
}
static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
- free_page((unsigned long)cpu_buffer->meta_page);
+ free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
cpu_buffer->meta_page = NULL;
}
@@ -5932,14 +5945,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
{
struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
struct buffer_page *first_page, *bpage;
+ int data_page_end;
int id = 0;
page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
cpu_buffer->reader_page->id = id++;
+ /* Calculate the last index of data_pages[] */
+ data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+ offsetof(struct ring_buffer_meta_page, data_pages);
+ data_page_end /= sizeof(u32);
+
first_page = bpage = rb_set_head_page(cpu_buffer);
do {
- if (id > META_PAGE_MAX_PAGES) {
+ if (id > data_page_end) {
WARN_ON(1);
break;
}
@@ -5960,6 +5979,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
meta->pages_touched = 0;
meta->reader_page = cpu_buffer->reader_page->id;
meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
meta->data_page_head = 0;
}
@@ -6092,10 +6112,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
/*
* +--------------+
* | meta page | pgoff=0
+ * | ... |
+ * | | pgoff=(1 << cpu_buffer->meta_order) - 1
* +--------------+
- * | data page1 | pgoff=1 page_ids=0
+ * | data page1 | page_ids=0
* +--------------+
- * | data page2 | pgoff=2 page_ids=1
+ * | data page2 | page_ids=1
* ...
*/
struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6125,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- if (!pgoff)
- return virt_to_page(cpu_buffer->meta_page);
+ if (pgoff < (1 << cpu_buffer->meta_order) + 1)
+ return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+ pgoff -= (1 << cpu_buffer->meta_order);
- pgoff--;
if (pgoff > cpu_buffer->nr_pages)
return NULL;
On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> On Fri, 17 Mar 2023 14:33:09 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
>
> > Also, the meta-page being... a single page, this limits at the moment the
> > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > system.
>
> I hate this limitation, so I fixed it ;-)
Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> I added a meta_page_size field to the meta page, and user space can do:
>
> meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> if (meta == MAP_FAILED)
> pdie("mmap");
>
> map = meta;
> meta_len = map->meta_page_size;
>
> if (meta_len > page_size) {
> munmap(meta, page_size);
> meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> if (meta == MAP_FAILED)
> pdie("mmap");
> map = meta;
> }
>
> This appears to work (but I'm still testing it).
>
> -- Steve
>
> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> index 24bcec754a35..12f3f7ee33d9 100644
> --- a/include/uapi/linux/trace_mmap.h
> +++ b/include/uapi/linux/trace_mmap.h
> @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> __u32 reader_page;
> __u32 nr_data_pages; /* doesn't take into account the reader_page */
> __u32 data_page_head; /* index of data_pages[] */
> + __u32 meta_page_size; /* size of the meta page */
Do we want a specific field here? Couldn't that be deduced from nr_data_pages
quite easily?
> __u32 data_pages[];
> };
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 10a17e78cfe6..77c92e4a7adc 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
> u64 read_stamp;
>
> int mapped;
> + int meta_order;
> struct mutex mapping_lock;
> unsigned long *page_ids; /* ID to addr */
> struct ring_buffer_meta_page *meta_page;
> @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> EXPORT_SYMBOL_GPL(ring_buffer_read_page);
>
> #define META_PAGE_MAX_PAGES \
> - ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> + ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
>
[...]
On Tue, 21 Mar 2023 15:17:15 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:
> On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > On Fri, 17 Mar 2023 14:33:09 +0000
> > Vincent Donnefort <vdonnefort@google.com> wrote:
> >
> > > Also, the meta-page being... a single page, this limits at the moment the
> > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > system.
> >
> > I hate this limitation, so I fixed it ;-)
>
> Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
Hold off, I found some bugs that I'm fixing ;-)
>
> >
> > I added a meta_page_size field to the meta page, and user space can do:
> >
> > meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > if (meta == MAP_FAILED)
> > pdie("mmap");
> >
> > map = meta;
> > meta_len = map->meta_page_size;
> >
> > if (meta_len > page_size) {
> > munmap(meta, page_size);
> > meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > if (meta == MAP_FAILED)
> > pdie("mmap");
> > map = meta;
> > }
> >
> > This appears to work (but I'm still testing it).
> >
> > -- Steve
> >
> > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > index 24bcec754a35..12f3f7ee33d9 100644
> > --- a/include/uapi/linux/trace_mmap.h
> > +++ b/include/uapi/linux/trace_mmap.h
> > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> > __u32 reader_page;
> > __u32 nr_data_pages; /* doesn't take into account the reader_page */
> > __u32 data_page_head; /* index of data_pages[] */
> > + __u32 meta_page_size; /* size of the meta page */
>
> Do we want a specific field here? That could be deduced from nr_data_pages()
> quite easily?
I'd rather not require too much knowledge of implementation details in user
space. Deducing it would only save a single entry, whereas the field makes
user space simpler. In fact,
I'm thinking we should not include "__u32 data_pages[]" but instead add a:
"__u32 data_start" where user space does:
__u32 *data_pages = (__u32 *)meta_page + meta_page->data_start;
That way we could extend the data provided by the meta_page in the future.
-- Steve
>
>
> > __u32 data_pages[];
> > };
> >
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 10a17e78cfe6..77c92e4a7adc 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
> > u64 read_stamp;
> >
> > int mapped;
> > + int meta_order;
> > struct mutex mapping_lock;
> > unsigned long *page_ids; /* ID to addr */
> > struct ring_buffer_meta_page *meta_page;
> > @@ -5898,7 +5899,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> > EXPORT_SYMBOL_GPL(ring_buffer_read_page);
> >
> > #define META_PAGE_MAX_PAGES \
> > - ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
> > + ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
> >
>
> [...]
On Tue, Mar 21, 2023 at 11:40:47AM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 15:17:15 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
>
> > On Mon, Mar 20, 2023 at 09:45:16PM -0400, Steven Rostedt wrote:
> > > On Fri, 17 Mar 2023 14:33:09 +0000
> > > Vincent Donnefort <vdonnefort@google.com> wrote:
> > >
> > > > Also, the meta-page being... a single page, this limits at the moment the
> > > > number of pages in the ring-buffer that can be mapped: ~3MB on a 4K pages
> > > > system.
> > >
> > > I hate this limitation, so I fixed it ;-)
> >
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> Hold off, I found some bugs that I'm fixing ;-)
>
> >
> > >
> > > I added a meta_page_size field to the meta page, and user space can do:
> > >
> > > meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > > if (meta == MAP_FAILED)
> > > pdie("mmap");
> > >
> > > map = meta;
> > > meta_len = map->meta_page_size;
> > >
> > > if (meta_len > page_size) {
> > > munmap(meta, page_size);
> > > meta = mmap(NULL, meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > > if (meta == MAP_FAILED)
> > > pdie("mmap");
> > > map = meta;
> > > }
> > >
> > > This appears to work (but I'm still testing it).
> > >
> > > -- Steve
> > >
> > > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
> > > index 24bcec754a35..12f3f7ee33d9 100644
> > > --- a/include/uapi/linux/trace_mmap.h
> > > +++ b/include/uapi/linux/trace_mmap.h
> > > @@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
> > > __u32 reader_page;
> > > __u32 nr_data_pages; /* doesn't take into account the reader_page */
> > > __u32 data_page_head; /* index of data_pages[] */
> > > + __u32 meta_page_size; /* size of the meta page */
> >
> > Do we want a specific field here? That could be deduced from nr_data_pages()
> > quite easily?
>
> I rather not have too much implementation detail knowledge in user space.
> It only removes a single entry, and it makes user space easier. In fact,
Ack.
> I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> "__u32 data_start" where user space does:
>
> __u32 *data_pages = (_u32 *)meta_page + meta_page->data_start;
>
> That way we could extend the data provided by the meta_page in the future.
That'd be nice. Couldn't we keep both to simplify the code for the kernel side?
>
> -- Steve
>
On Tue, 21 Mar 2023 11:40:47 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
>
> Hold off, I found some bugs that I'm fixing ;-)
OK, you can fold this in. I also fixed an issue with your patch where it
was neither setting page->mapping nor clearing it afterwards.
I haven't yet replaced "__u32 data_pages[]" with a "__u32 data_start",
but I think that should still be done.
-- Steve
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index 24bcec754a35..12f3f7ee33d9 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -18,6 +18,7 @@ struct ring_buffer_meta_page {
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
+ __u32 meta_page_size; /* size of the meta page */
__u32 data_pages[];
};
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10a17e78cfe6..d546fdd14fc3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -526,6 +526,7 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
int mapped;
+ int meta_order;
struct mutex mapping_lock;
unsigned long *page_ids; /* ID to addr */
struct ring_buffer_meta_page *meta_page;
@@ -5898,32 +5899,63 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
#define META_PAGE_MAX_PAGES \
- ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_pages))) >> 2)
+
+static void unmap_page(unsigned long addr)
+{
+ struct page *page = virt_to_page(addr);
+
+ page->mapping = NULL;
+}
static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
{
+ int i;
+
+ for (i = 0; i < cpu_buffer->nr_pages; i++)
+ unmap_page(cpu_buffer->page_ids[i]);
+
kfree(cpu_buffer->page_ids);
cpu_buffer->page_ids = NULL;
}
static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
+ struct page *meta_pages;
+ int pages;
+ int order = 0;
+
if (cpu_buffer->meta_page)
return 0;
- if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
- return -E2BIG;
-
- cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
- if (!cpu_buffer->meta_page)
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES) {
+ /* Calculate how many more pages we need to hold indexes */
+ pages = DIV_ROUND_UP(cpu_buffer->nr_pages - META_PAGE_MAX_PAGES,
+ PAGE_SIZE / sizeof(u32));
+ /* Add back the meta_page itself */
+ pages++;
+ order = fls(pages) - 1;
+ }
+ meta_pages = alloc_pages(GFP_USER, order);
+ if (!meta_pages)
return -ENOMEM;
+ cpu_buffer->meta_page = page_to_virt(meta_pages);
+ cpu_buffer->meta_order = order;
+
return 0;
}
static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
- free_page((unsigned long)cpu_buffer->meta_page);
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+ int i;
+
+ for (i = 0; i < (1 << cpu_buffer->meta_order); i++) {
+ unmap_page(addr);
+ addr += PAGE_SIZE;
+ }
+ free_pages((unsigned long)cpu_buffer->meta_page, cpu_buffer->meta_order);
cpu_buffer->meta_page = NULL;
}
@@ -5932,14 +5964,20 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
{
struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
struct buffer_page *first_page, *bpage;
+ int data_page_end;
int id = 0;
page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
cpu_buffer->reader_page->id = id++;
+ /* Calculate the last index of data_pages[] */
+ data_page_end = (1 << (cpu_buffer->meta_order + PAGE_SHIFT)) -
+ offsetof(struct ring_buffer_meta_page, data_pages);
+ data_page_end /= sizeof(u32);
+
first_page = bpage = rb_set_head_page(cpu_buffer);
do {
- if (id > META_PAGE_MAX_PAGES) {
+ if (id > data_page_end) {
WARN_ON(1);
break;
}
@@ -5960,6 +5998,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
meta->pages_touched = 0;
meta->reader_page = cpu_buffer->reader_page->id;
meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->meta_page_size = 1 << (cpu_buffer->meta_order + PAGE_SHIFT);
meta->data_page_head = 0;
}
@@ -6092,10 +6131,12 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
/*
* +--------------+
* | meta page | pgoff=0
+ * | ... |
+ * | | pgoff=(1 << cpu_buffer->meta_order) - 1
* +--------------+
- * | data page1 | pgoff=1 page_ids=0
+ * | data page1 | page_ids=0
* +--------------+
- * | data page2 | pgoff=2 page_ids=1
+ * | data page2 | page_ids=1
* ...
*/
struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
@@ -6103,10 +6144,11 @@ struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- if (!pgoff)
- return virt_to_page(cpu_buffer->meta_page);
+ if (pgoff < (1 << cpu_buffer->meta_order))
+ return virt_to_page((void *)cpu_buffer->meta_page + (pgoff << PAGE_SHIFT));
+
+ pgoff -= (1 << cpu_buffer->meta_order);
- pgoff--;
if (pgoff > cpu_buffer->nr_pages)
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea48eabce7b7..2f43e4a842e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8479,9 +8479,12 @@ static vm_fault_t tracing_buffers_mmap_fault(struct vm_fault *vmf)
if (!page)
return ret;
- get_page(page);
vmf->page = page;
+ get_page(vmf->page);
+ vmf->page->mapping = vmf->vma->vm_file->f_mapping;
+ vmf->page->index = vmf->pgoff;
+
return 0;
}
On Tue, Mar 21, 2023 at 12:44:25PM -0400, Steven Rostedt wrote:
> On Tue, 21 Mar 2023 11:40:47 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > >
> > > Thanks a lot for having a look. Do you mind if I fold this in my patch for a V2?
> >
> > Hold off, I found some bugs that I'm fixing ;-)
>
> OK, you can fold this in. I also fixed an issue with your patch where it
> was missing setting a page->mapping and also clearing it.
>
> I haven't updated to replace "__u32 *data_pages[]" with an "__u32 data_start"
> But I think that should still be done.
>
> -- Steve
>
[...]
Thanks! I'll prepare a v2 with all that!
On Tue, 21 Mar 2023 16:20:42 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:
> > > Do we want a specific field here? That could be deduced from nr_data_pages()
> > > quite easily?
> >
> > I rather not have too much implementation detail knowledge in user space.
> > It only removes a single entry, and it makes user space easier. In fact,
>
> Ack.
>
> > I'm thinking we should not include "__u32 data_pages[]" but instead add a:
> > "__u32 data_start" where user space does:
> >
> > __u32 *data_pages = (_u32 *)meta_page + meta_page->data_start;
> >
> > That way we could extend the data provided by the meta_page in the future.
>
> That'd be nice. Couldn't we keep both to simplify the code for the kernel side?
I would not expose the data_pages[] to user space, because then they'll use
it, and that *will* become an API.
But we could expose it to the kernel side with:
include/uapi/linux/trace_mmap.h:
struct ring_buffer_meta_page {
#if __BITS_PER_LONG == 64
__u64 entries;
__u64 overrun;
#else
__u32 entries;
__u32 overrun;
#endif
__u32 pages_touched;
__u32 reader_page;
__u32 nr_data_pages; /* doesn't take into account the reader_page */
__u32 data_page_head; /* index of data_pages[] */
__u32 meta_page_size; /* size of the meta page */
__u32 data_start; /* offset to where data_pages are */
};
kernel/trace/ring_buffer.c:
struct ring_buffer_meta {
struct ring_buffer_meta_page meta;
u32 data_pages[];
};
Then we can start each function with:
struct ring_buffer_meta_page *meta = &cpu_buffer->meta_page.meta;
u32 *data_pages = cpu_buffer->meta_page.data_pages;
-- Steve
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
#include <linux/poll.h>
+#include <uapi/linux/trace_mmap.h>
+
struct trace_buffer;
struct ring_buffer_iter;
@@ -211,4 +213,10 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
#define trace_rb_cpu_prepare NULL
#endif
+int ring_buffer_map(struct trace_buffer *buffer, int cpu);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+ unsigned long pgoff);
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu);
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu);
#endif /* _LINUX_RING_BUFFER_H */
new file mode 100644
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_TRACE_MMAP_H_
+#define _UAPI_TRACE_MMAP_H_
+
+#include <asm/bitsperlong.h>
+
+#include <linux/types.h>
+
+struct ring_buffer_meta_page {
+#if __BITS_PER_LONG == 64
+ __u64 entries;
+ __u64 overrun;
+#else
+ __u32 entries;
+ __u32 overrun;
+#endif
+ __u32 pages_touched;
+ __u32 reader_page;
+ __u32 nr_data_pages; /* doesn't take into account the reader_page */
+ __u32 data_page_head; /* index of data_pages[] */
+ __u32 data_pages[];
+};
+
+#endif /* _UAPI_TRACE_MMAP_H_ */
@@ -332,6 +332,7 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
struct buffer_data_page *page; /* Actual data page */
+ u32 id; /* ID for external mapping */
};
/*
@@ -529,6 +530,12 @@ struct ring_buffer_per_cpu {
rb_time_t before_stamp;
u64 event_stamp[MAX_NEST];
u64 read_stamp;
+
+ int mapped;
+ struct mutex mapping_lock;
+ unsigned long *page_ids; /* ID to addr */
+ struct ring_buffer_meta_page *meta_page;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -1452,12 +1459,37 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void
+rb_meta_page_head_move(struct ring_buffer_per_cpu *cpu_buffer, unsigned long num)
+{
+ unsigned long head_id;
+
+ if (!READ_ONCE(cpu_buffer->mapped))
+ return;
+
+ head_id = cpu_buffer->meta_page->data_page_head;
+ cpu_buffer->meta_page->data_page_head = (head_id + num) % cpu_buffer->nr_pages;
+}
+
+static inline void
+rb_meta_page_head_swap(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+
+ if (!READ_ONCE(cpu_buffer->mapped))
+ return;
+
+ meta->reader_page = cpu_buffer->head_page->id;
+ meta->data_pages[meta->data_page_head] = cpu_buffer->reader_page->id;
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *head;
struct buffer_page *page;
struct list_head *list;
+ unsigned long cnt = 0;
int i;
if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
@@ -1479,9 +1511,12 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
do {
if (rb_is_head_page(page, page->list.prev)) {
cpu_buffer->head_page = page;
+ rb_meta_page_head_move(cpu_buffer, cnt);
+
return page;
}
rb_inc_page(&page);
+ cnt++;
} while (page != head);
}
@@ -1567,6 +1602,13 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
/* Again, either we update tail_page or an interrupt does */
(void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
}
+
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->pages_touched,
+ local_read(&cpu_buffer->pages_touched));
+ }
}
static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1735,6 +1777,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -2173,7 +2216,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
-
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
* Don't succeed if resizing is disabled, as a reader might be
@@ -2523,6 +2565,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->overrun,
+ local_read(&cpu_buffer->overrun));
+ }
+
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -3179,6 +3228,14 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
+
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ /* Ensure the meta_page is ready */
+ smp_rmb();
+ WRITE_ONCE(cpu_buffer->meta_page->entries,
+ local_read(&cpu_buffer->entries));
+ }
+
rb_end_commit(cpu_buffer);
}
@@ -3482,7 +3539,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
return;
/*
- * If this interrupted another event,
+ * If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
@@ -4643,7 +4700,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
+ rb_meta_page_head_swap(cpu_buffer);
rb_inc_page(&cpu_buffer->head_page);
+ rb_meta_page_head_move(cpu_buffer, 1);
local_inc(&cpu_buffer->pages_read);
@@ -5285,6 +5344,12 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
+ if (READ_ONCE(cpu_buffer->mapped)) {
+ WRITE_ONCE(cpu_buffer->meta_page->entries, 0);
+ WRITE_ONCE(cpu_buffer->meta_page->pages_touched, 0);
+ WRITE_ONCE(cpu_buffer->meta_page->overrun, 0);
+ }
+
rb_head_page_activate(cpu_buffer);
}
@@ -5489,6 +5554,11 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ if (READ_ONCE(cpu_buffer_a->mapped) || READ_ONCE(cpu_buffer_b->mapped)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
@@ -5722,7 +5792,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ READ_ONCE(cpu_buffer->mapped)) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5839,6 +5910,263 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+#define META_PAGE_MAX_PAGES \
+ ((PAGE_SIZE - (offsetof(struct ring_buffer_meta_page, data_page_head))) >> 2)
+
+static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ kfree(cpu_buffer->page_ids);
+ cpu_buffer->page_ids = NULL;
+}
+
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ if (cpu_buffer->nr_pages > META_PAGE_MAX_PAGES)
+ return -E2BIG;
+
+ cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
+ if (!cpu_buffer->meta_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ free_page((unsigned long)cpu_buffer->meta_page);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *page_ids)
+{
+ struct ring_buffer_meta_page *meta = cpu_buffer->meta_page;
+ struct buffer_page *first_page, *bpage;
+ int id = 0;
+
+ page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+ cpu_buffer->reader_page->id = id++;
+
+ first_page = bpage = rb_set_head_page(cpu_buffer);
+ do {
+ if (id > META_PAGE_MAX_PAGES) {
+ WARN_ON(1);
+ break;
+ }
+
+ page_ids[id] = (unsigned long)bpage->page;
+ bpage->id = id;
+ meta->data_pages[id - 1] = id;
+
+ rb_inc_page(&bpage);
+ id++;
+ } while (bpage != first_page);
+
+ /* install page ID to kern VA translation */
+ cpu_buffer->page_ids = page_ids;
+
+ meta->entries = 0;
+ meta->overrun = 0;
+ meta->pages_touched = 0;
+ meta->reader_page = cpu_buffer->reader_page->id;
+ meta->nr_data_pages = cpu_buffer->nr_pages;
+ meta->data_page_head = 0;
+}
+
+static inline struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static inline void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *page_ids;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->mapped) {
+ WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped + 1);
+ goto unlock;
+ }
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&cpu_buffer->resize_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err) {
+ atomic_dec(&cpu_buffer->resize_disabled);
+ goto unlock;
+ }
+
+ /* page_ids include the reader page while nr_pages does not */
+ page_ids = kzalloc(sizeof(*page_ids) * (cpu_buffer->nr_pages + 1),
+ GFP_KERNEL);
+ if (!page_ids) {
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ /*
+ * Lock all readers to block any page swap until the page IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ rb_setup_ids_meta_page(cpu_buffer, page_ids);
+ /*
+ * Ensure the writer will observe the meta-page before
+ * cpu_buffer->mapped.
+ */
+ smp_wmb();
+ WRITE_ONCE(cpu_buffer->mapped, 1);
+
+ /* Init meta_page values unless the writer did it already */
+ cmpxchg(&cpu_buffer->meta_page->entries, 0,
+ local_read(&cpu_buffer->entries));
+ cmpxchg(&cpu_buffer->meta_page->overrun, 0,
+ local_read(&cpu_buffer->overrun));
+ cmpxchg(&cpu_buffer->meta_page->pages_touched, 0,
+ local_read(&cpu_buffer->pages_touched));
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+unlock:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ err = -ENODEV;
+ goto unlock;
+ }
+
+ WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped - 1);
+ if (!cpu_buffer->mapped) {
+ /* Wait for the writer and readers to observe !mapped */
+ synchronize_rcu();
+
+ rb_free_page_ids(cpu_buffer);
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+unlock:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+/*
+ * +--------------+
+ * | meta page | pgoff=0
+ * +--------------+
+ * | data page1 | pgoff=1 page_ids=0
+ * +--------------+
+ * | data page2 | pgoff=2 page_ids=1
+ * ...
+ */
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+ unsigned long pgoff)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+
+ if (!pgoff)
+ return virt_to_page(cpu_buffer->meta_page);
+
+ pgoff--;
+ if (pgoff > cpu_buffer->nr_pages)
+ return NULL;
+
+ return virt_to_page(cpu_buffer->page_ids[pgoff]);
+}
+
+int ring_buffer_get_reader_page(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ reader = cpu_buffer->reader_page;
+ reader->read = rb_page_size(reader);
+ if (!rb_per_cpu_empty(cpu_buffer))
+ WARN_ON(!rb_get_reader_page(cpu_buffer));
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
+int ring_buffer_update_meta_page(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return PTR_ERR(cpu_buffer);
+
+ /* Update the head page if the writer moved it */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_set_head_page(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in