Message ID | 20240212213922.783301-32-surenb@google.com |
---|---|
State | New |
Subject | [PATCH v3 31/35] lib: add memory allocations report in show_mem() |
From | Suren Baghdasaryan <surenb@google.com> |
Date | Mon, 12 Feb 2024 13:39:17 -0800 |
Series | Memory allocation profiling |
Commit Message
Suren Baghdasaryan
Feb. 12, 2024, 9:39 p.m. UTC
Include allocations in show_mem reports.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/alloc_tag.h |  2 ++
 lib/alloc_tag.c           | 38 ++++++++++++++++++++++++++++++++++++++
 mm/show_mem.c             | 15 +++++++++++++++
 3 files changed, 55 insertions(+)
Comments
On Mon, Feb 12, 2024 at 01:39:17PM -0800, Suren Baghdasaryan wrote:
> Include allocations in show_mem reports.
>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/alloc_tag.h |  2 ++
>  lib/alloc_tag.c           | 38 ++++++++++++++++++++++++++++++++++++++
>  mm/show_mem.c             | 15 +++++++++++++++
>  3 files changed, 55 insertions(+)
[...]
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index 8dcfafbd283c..d514c15ca076 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
[...]
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	{
> +		struct seq_buf s;
> +		char *buf = kmalloc(4096, GFP_ATOMIC);

Why 4096? Maybe use PAGE_SIZE instead?

> +
> +		if (buf) {
> +			printk("Memory allocations:\n");

This needs a printk prefix, or better yet, just use pr_info() or
similar.

> +			seq_buf_init(&s, buf, 4096);
> +			alloc_tags_show_mem_report(&s);
> +			printk("%s", buf);

Once a seq_buf "consumes" a char *, please don't use any directly any
more. This should be:

	pr_info("%s", seq_buf_str(s));

Otherwise %NUL termination isn't certain. Very likely, given the use
case here, but let's use good hygiene. :)

> +			kfree(buf);
> +		}
> +	}
> +#endif
> }
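For reference, a minimal sketch (not the posted patch) of the pattern Kees asks for, assuming the alloc_tags_show_mem_report() helper this patch adds; the function name is illustrative:

    #include <linux/alloc_tag.h>
    #include <linux/printk.h>
    #include <linux/seq_buf.h>
    #include <linux/slab.h>

    static void show_mem_report_sketch(void)
    {
        struct seq_buf s;
        char *buf = kmalloc(4096, GFP_ATOMIC);

        if (!buf)
            return;

        seq_buf_init(&s, buf, 4096);
        alloc_tags_show_mem_report(&s);
        pr_info("Memory allocations:\n");
        /* seq_buf_str() NUL-terminates the buffer before it is read */
        pr_info("%s", seq_buf_str(&s));
        kfree(buf);
    }

seq_buf_str() guarantees termination regardless of how much the report wrote, which is what makes printing the raw char * unnecessary.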
On Mon, 12 Feb 2024 16:10:02 -0800
Kees Cook <keescook@chromium.org> wrote:

> >  #endif
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +	{
> > +		struct seq_buf s;
> > +		char *buf = kmalloc(4096, GFP_ATOMIC);
>
> Why 4096? Maybe use PAGE_SIZE instead?

Will it make a difference for architectures that don't have 4096
PAGE_SIZE? Like PowerPC which has PAGE_SIZE of anywhere between 4K to
256K!

-- Steve
On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote:
> On Mon, 12 Feb 2024 16:10:02 -0800
> Kees Cook <keescook@chromium.org> wrote:
>
> > >  #endif
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +	{
> > > +		struct seq_buf s;
> > > +		char *buf = kmalloc(4096, GFP_ATOMIC);
> >
> > Why 4096? Maybe use PAGE_SIZE instead?
>
> Will it make a difference for architectures that don't have 4096 PAGE_SIZE?
> Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K!

it's just a string buffer
On Mon, Feb 12, 2024 at 8:33 PM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote:
> > On Mon, 12 Feb 2024 16:10:02 -0800
> > Kees Cook <keescook@chromium.org> wrote:
> > [...]
> > > Why 4096? Maybe use PAGE_SIZE instead?
> >
> > Will it make a difference for architectures that don't have 4096 PAGE_SIZE?
> > Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K!
>
> it's just a string buffer

We should document that __show_mem() prints only the top 10 largest
allocations, therefore as long as this buffer is large enough to hold
10 records we should be good. Technically we could simply print one
record at a time and then the buffer can be smaller.
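A hedged sketch of that one-record-at-a-time idea; alloc_tag_to_text() is the helper from this series, while the function name and the 256-byte line size are assumptions:

    /* Emit a single tag's report line through a small on-stack buffer. */
    static void print_one_tag(struct codetag *ct)
    {
        char line[256];	/* assumed big enough for one report line */
        struct seq_buf s;

        seq_buf_init(&s, line, sizeof(line));
        alloc_tag_to_text(&s, ct);
        pr_notice("%s", seq_buf_str(&s));
    }

The top-10 selection loop in the patch could then call print_one_tag(tags[i].tag) for each entry instead of appending everything into one 4K buffer.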
On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
[...]
> @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>  #ifdef CONFIG_MEMORY_FAILURE
>  	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
>  #endif
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	{
> +		struct seq_buf s;
> +		char *buf = kmalloc(4096, GFP_ATOMIC);
> +
> +		if (buf) {
> +			printk("Memory allocations:\n");
> +			seq_buf_init(&s, buf, 4096);
> +			alloc_tags_show_mem_report(&s);
> +			printk("%s", buf);
> +			kfree(buf);
> +		}
> +	}
> +#endif

I am pretty sure I have already objected to this. Memory allocations in
the oom path are simply no go unless there is absolutely no other way
around that. In this case the buffer could be preallocated.
On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> [...]
>
> I am pretty sure I have already objected to this. Memory allocations in
> the oom path are simply no go unless there is absolutely no other way
> around that. In this case the buffer could be preallocated.

Good point. We will change this to a smaller buffer allocated on the
stack and will print records one-by-one. Thanks!

> --
> Michal Hocko
> SUSE Labs
On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > I am pretty sure I have already objected to this. Memory allocations in
> > the oom path are simply no go unless there is absolutely no other way
> > around that. In this case the buffer could be preallocated.
>
> Good point. We will change this to a smaller buffer allocated on the
> stack and will print records one-by-one. Thanks!

__show_mem could be called with a very deep call chains. A single
pre-allocated buffer should just do ok.
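A sketch of the single pre-allocated buffer Michal suggests; the names are illustrative, and a real version would also need to serialize concurrent OOM reporters:

    /* One boot-time buffer; the OOM path then allocates nothing. */
    static char alloc_tags_oom_buf[4096];

    static void show_mem_report_prealloc(void)
    {
        struct seq_buf s;

        seq_buf_init(&s, alloc_tags_oom_buf, sizeof(alloc_tags_oom_buf));
        alloc_tags_show_mem_report(&s);
        pr_notice("Memory allocations:\n%s", seq_buf_str(&s));
    }

The trade-off being argued in this thread: the 4K is permanently reserved, but the report can never fail for lack of memory.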
On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> [...]
> > Good point. We will change this to a smaller buffer allocated on the
> > stack and will print records one-by-one. Thanks!
>
> __show_mem could be called with a very deep call chains. A single
> pre-allocated buffer should just do ok.

Ack. Will do.

> --
> Michal Hocko
> SUSE Labs
On Thu, Feb 15, 2024 at 10:33:53AM -0800, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 10:29 AM Kent Overstreet
> <kent.overstreet@linux.dev> wrote:
> >
> > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > > [...]
> > > > __show_mem could be called with a very deep call chains. A single
> > > > pre-allocated buffer should just do ok.
> > >
> > > Ack. Will do.
> >
> > No, we're not going to permanently burn 4k here.
>
> We don't need 4K here. Just enough to store one line and then print
> these 10 highest allocations one line at a time. This way we can also
> change that 10 to any higher number we like without any side effects.

There's no reason to make the change at all. If Michal thinks there's
something "dangerous" about allocating a buffer here, he needs to able
to explain what it is.
On Thu 15-02-24 13:29:40, Kent Overstreet wrote:
> On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > [...]
> > > __show_mem could be called with a very deep call chains. A single
> > > pre-allocated buffer should just do ok.
> >
> > Ack. Will do.
>
> No, we're not going to permanently burn 4k here.
>
> It's completely fine if the allocation fails, there's nothing "unsafe"
> about doing a GFP_ATOMIC allocation here.

Nobody is talking about safety. This is just a wrong thing to do when
you are likely under OOM situation. This is a situation when you
GFP_ATOMIC allocation is _likely_ to fail. Yes, yes you will get some
additional memory reservers head room, but you shouldn't rely on that
because that will make the output unreliable. Not something you want in
situation when you really want to know that information.

More over you do not need to preallocate a full page.
On Thu 15-02-24 15:33:30, Kent Overstreet wrote:
> On Thu, Feb 15, 2024 at 09:22:07PM +0100, Vlastimil Babka wrote:
> > On 2/15/24 19:29, Kent Overstreet wrote:
> > [...]
> > > It's completely fine if the allocation fails, there's nothing "unsafe"
> > > about doing a GFP_ATOMIC allocation here.
> >
> > Well, I think without __GFP_NOWARN it will cause a warning and thus
> > recursion into __show_mem(), potentially infinite? Which is of course
> > trivial to fix, but I'd myself rather sacrifice a bit of memory to get
> > this potentially very useful output, if I enabled the profiling. The
> > necessary memory overhead of page_ext and slabobj_ext makes the
> > printing buffer overhead negligible in comparison?
>
> __GFP_NOWARN is a good point, we should have that.
>
> But - and correct me if I'm wrong here - doesn't an OOM kick in well
> before GFP_ATOMIC 4k allocations are failing?

Not really, GFP_ATOMIC users can compete with reclaimers and consume
those reserves.

> I'd expect the system to
> be well and truly hosed at that point.

It is OOMed...

> If we want this report to be 100% reliable, then yes the preallocated
> buffer makes sense - but I don't think 100% makes sense here; I think we
> can accept ~99% and give back that 4k.

Think about that from the memory reserves consumers. The atomic reserve
is a scarse resource and now you want to use it for debugging purposes
for which you could have preallocated.
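For context, the recursion Vlastimil points out comes from the allocation-failure warning itself calling __show_mem(); a sketch of the one-flag fix (helper name illustrative):

    static char *alloc_report_buf(void)
    {
        /*
         * __GFP_NOWARN matters here: a failed allocation would otherwise
         * print an allocation-failure report, which itself calls
         * __show_mem() -- potentially recursing forever.
         */
        return kmalloc(4096, GFP_ATOMIC | __GFP_NOWARN);
    }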
On Thu, Feb 15, 2024 at 10:54:53PM +0100, Michal Hocko wrote:
> On Thu 15-02-24 15:33:30, Kent Overstreet wrote:
> > If we want this report to be 100% reliable, then yes the preallocated
> > buffer makes sense - but I don't think 100% makes sense here; I think we
> > can accept ~99% and give back that 4k.
>
> Think about that from the memory reserves consumers. The atomic reserve
> is a scarse resource and now you want to use it for debugging purposes
> for which you could have preallocated.

_Memory_ is a finite resource that we shouldn't be using unnecessarily.
We don't need this for the entire time we're under memory pressure;
just the short duration it takes to generate the report, then it's back
available for other users.

You would have us dedicate 4k, from system bootup, that can never be
used by other users.

Again: this makes no sense. The whole point of having watermarks and
shared reserves is so that every codepath doesn't have to have its own
dedicated, private reserve, so that we can make better use of a shared
finite resource.
On 2/15/24 15:07, Steven Rostedt wrote:
> Just adding the patches increases the size by 5k. But the rest shows an
> increase of 259k, and you are worried about 4k (and possibly less?)???

Doesn't the new page_ext thingy add a pointer per 'struct page', or
~0.2% of RAM, or ~32MB on a 16GB laptop?

I, too, am confused why 4k is even remotely an issue.
On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote:
[...]
> For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> becomes safe and the only risk is to lose this report. If we get cases
> with reports missing this data, we can easily change to reserved
> memory.

This is not just about missing part of the oom report. This is annoying
but not earth shattering. Eating into very small reserves (that might
be the only usable memory while the system is struggling in OOM
situation) could cause functional problems that would be non trivial to
test for. All that for debugging purposes is just lame. If you want to
reuse the code for a different purpose then abstract it and allocate
the buffer when you can afford that and use preallocated on when in OOM
situation.

We have always went extra mile to avoid potentially disruptive
operations from the oom handling code and I do not see any good reason
to diverge from that principle.
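A sketch of the split Michal describes: callers that can afford an allocation pass their own buffer, while the OOM path passes one reserved at boot. All names here are illustrative, not from the posted series:

    static char oom_report_buf[4096];	/* reserved once at boot */

    /* Shared report logic, independent of where the buffer came from. */
    static void alloc_tags_report(char *buf, size_t size)
    {
        struct seq_buf s;

        seq_buf_init(&s, buf, size);
        alloc_tags_show_mem_report(&s);
        pr_notice("%s", seq_buf_str(&s));
    }

    /* OOM path: must not allocate, so use the boot-time buffer. */
    static void oom_path_report(void)
    {
        alloc_tags_report(oom_report_buf, sizeof(oom_report_buf));
    }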
On Tue 20-02-24 12:18:49, Kent Overstreet wrote:
> On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote:
> [...]
> > We have always went extra mile to avoid potentially disruptive
> > operations from the oom handling code and I do not see any good reason
> > to diverge from that principle.
>
> Michal, I gave you the logic between dedicated reserves and system
> reserves. Please stop repeating these vague what-ifs.

Your argument makes little sense and it seems that it is impossible to
explain that to you. I gave up on discussing this further with you.

Consider NAK to any additional allocation from oom path unless you can
give very _solid_ arguments this is absolutely necessary. "It's gona be
fine and work most of the time" is not a solid argument.
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 3fe51e67e231..0a5973c4ad77 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -30,6 +30,8 @@ struct alloc_tag {
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING
 
+void alloc_tags_show_mem_report(struct seq_buf *s);
+
 static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 {
 	return container_of(ct, struct alloc_tag, ct);
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 2d5226d9262d..54312c213860 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -96,6 +96,44 @@ static const struct seq_operations allocinfo_seq_op = {
 	.show = allocinfo_show,
 };
 
+void alloc_tags_show_mem_report(struct seq_buf *s)
+{
+	struct codetag_iterator iter;
+	struct codetag *ct;
+	struct {
+		struct codetag *tag;
+		size_t bytes;
+	} tags[10], n;
+	unsigned int i, nr = 0;
+
+	codetag_lock_module_list(alloc_tag_cttype, true);
+	iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(&iter))) {
+		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
+
+		n.tag = ct;
+		n.bytes = counter.bytes;
+
+		for (i = 0; i < nr; i++)
+			if (n.bytes > tags[i].bytes)
+				break;
+
+		if (i < ARRAY_SIZE(tags)) {
+			nr -= nr == ARRAY_SIZE(tags);
+			memmove(&tags[i + 1],
+				&tags[i],
+				sizeof(tags[0]) * (nr - i));
+			nr++;
+			tags[i] = n;
+		}
+	}
+
+	for (i = 0; i < nr; i++)
+		alloc_tag_to_text(s, tags[i].tag);
+
+	codetag_lock_module_list(alloc_tag_cttype, false);
+}
+
 static void __init procfs_init(void)
 {
 	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 8dcfafbd283c..d514c15ca076 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -12,6 +12,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/seq_buf.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
 
@@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	{
+		struct seq_buf s;
+		char *buf = kmalloc(4096, GFP_ATOMIC);
+
+		if (buf) {
+			printk("Memory allocations:\n");
+			seq_buf_init(&s, buf, 4096);
+			alloc_tags_show_mem_report(&s);
+			printk("%s", buf);
+			kfree(buf);
+		}
+	}
+#endif
 }