mm/slub: Reduce memory consumption in extreme scenarios

Message ID 20230314123403.100158-1-chenjun102@huawei.com
State New
Series: mm/slub: Reduce memory consumption in extreme scenarios

Commit Message

Chen Jun March 14, 2023, 12:34 p.m. UTC
When kmalloc_node() is called without __GFP_THISNODE and the target node
lacks sufficient memory, SLUB allocates a new folio, which may come from a
node other than the requested one, instead of reusing a partial slab that
already exists on another node.

However, since the allocated folio does not belong to the requested
node, it is deactivated and added to the partial slab list of the node
it belongs to.

This behavior can result in excessive memory usage when the requested
node has insufficient memory, as SLUB will repeatedly allocate folios
from other nodes without reusing the previously allocated ones.

To prevent this memory wastage,
when (node != NUMA_NO_NODE) && !(gfpflags & __GFP_THISNODE):
1) try to get a partial slab from the target node with __GFP_THISNODE set.
2) if 1) fails, try to allocate a new slab from the target node with
   __GFP_THISNODE set.
3) if 2) fails, retry 1) and 2) without the __GFP_THISNODE constraint.

When node == NUMA_NO_NODE, or __GFP_THISNODE is already set in gfpflags, the
behavior remains unchanged.

Tested on qemu with 4 NUMA nodes, each with 1G of memory, using a test
module that calls kmalloc_node(196, GFP_KERNEL, 3) (4 * 1024 + 4) * 1024
times.
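
The test module itself is not included in this posting; a minimal sketch of
what such a module could look like (module name and boilerplate are only
illustrative):

#include <linux/module.h>
#include <linux/slab.h>

static int __init kmalloc_node_stress_init(void)
{
	long i;

	/* 196-byte requests land in kmalloc-256; always target node 3 */
	for (i = 0; i < (4 * 1024 + 4) * 1024; i++) {
		if (!kmalloc_node(196, GFP_KERNEL, 3))
			return -ENOMEM;
		/* objects are deliberately leaked so the slabs stay populated */
	}
	return 0;
}
module_init(kmalloc_node_stress_init);
MODULE_LICENSE("GPL");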

Without this patch, cat /proc/slabinfo shows:
kmalloc-256       4200530 13519712    256   32    2 : tunables..

After this patch, cat /proc/slabinfo shows:
kmalloc-256       4200558 4200768    256   32    2 : tunables..

(The second and third columns are <active_objs> and <num_objs>: without the
patch roughly 13.5M objects worth of slabs are kept around for ~4.2M live
objects; with it the two figures nearly match.)

Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 mm/slub.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)
  

Comments

Vlastimil Babka March 14, 2023, 2:41 p.m. UTC | #1
On 3/14/23 13:34, Chen Jun wrote:
> When kmalloc_node() is called without __GFP_THISNODE and the target node
> lacks sufficient memory, SLUB allocates a folio from a different node
> other than the requested node, instead of taking a partial slab from it.
> 
> However, since the allocated folio does not belong to the requested
> node, it is deactivated and added to the partial slab list of the node
> it belongs to.
> 
> This behavior can result in excessive memory usage when the requested
> node has insufficient memory, as SLUB will repeatedly allocate folios
> from other nodes without reusing the previously allocated ones.
> 
> To prevent memory wastage,
> when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE) is:
> 1) try to get a partial slab from target node with __GFP_THISNODE.
> 2) if 1) failed, try to allocate a new slab from target node with
>    __GFP_THISNODE.
> 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
> 
> when node != NUMA_NO_NODE || (gfpflags & __GFP_THISNODE), the behavior
> remains unchanged.
> 
> On qemu with 4 numa nodes and each numa has 1G memory. Write a test ko
> to call kmalloc_node(196, GFP_KERNEL, 3) for (4 * 1024 + 4) * 1024 times.
> 
> cat /proc/slabinfo shows:
> kmalloc-256       4200530 13519712    256   32    2 : tunables..
> 
> after this patch,
> cat /proc/slabinfo shows:
> kmalloc-256       4200558 4200768    256   32    2 : tunables..
> 
> Signed-off-by: Chen Jun <chenjun102@huawei.com>
> ---
>  mm/slub.c | 22 +++++++++++++++++++---
>  1 file changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 39327e98fce3..32e436957e03 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2384,7 +2384,7 @@ static void *get_partial(struct kmem_cache *s, int node, struct partial_context
>  		searchnode = numa_mem_id();
>  
>  	object = get_partial_node(s, get_node(s, searchnode), pc);
> -	if (object || node != NUMA_NO_NODE)
> +	if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
>  		return object;
>  
>  	return get_any_partial(s, pc);
> @@ -3069,6 +3069,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	struct slab *slab;
>  	unsigned long flags;
>  	struct partial_context pc;
> +	bool try_thisnode = true;
>  
>  	stat(s, ALLOC_SLOWPATH);
>  
> @@ -3181,8 +3182,18 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	}
>  
>  new_objects:
> -
>  	pc.flags = gfpflags;
> +
> +	/*
> +	 * when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE)
> +	 * 1) try to get a partial slab from target node with __GFP_THISNODE.
> +	 * 2) if 1) failed, try to allocate a new slab from target node with
> +	 *    __GFP_THISNODE.
> +	 * 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
> +	 */
> +	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
> +			pc.flags |= __GFP_THISNODE;

Hmm I'm thinking we should also perhaps remove direct reclaim possibilities
from the attempt 2). In your qemu test it should make no difference, as it
fills everything with kernel memory that is not reclaimable. But in practice
the target node might be filled with user memory, and I think it's better to
quickly allocate on a different node than spend time in direct reclaim. So
the following should work I think?

pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE

> +
>  	pc.slab = &slab;
>  	pc.orig_size = orig_size;
>  	freelist = get_partial(s, node, &pc);
> @@ -3190,10 +3201,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		goto check_new_slab;
>  
>  	slub_put_cpu_ptr(s->cpu_slab);
> -	slab = new_slab(s, gfpflags, node);
> +	slab = new_slab(s, pc.flags, node);
>  	c = slub_get_cpu_ptr(s->cpu_slab);
>  
>  	if (unlikely(!slab)) {
> +		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode) {
> +			try_thisnode = false;
> +			goto new_objects;
> +		}
> +
>  		slab_out_of_memory(s, gfpflags, node);
>  		return NULL;
>  	}
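
For illustration, a sketch of the hunk above with that GFP_NOWAIT suggestion
folded in (one reading of the proposal, not posted code):

	/* first attempt only: stay on the requested node, but never block in
	 * direct reclaim on its behalf */
	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
		pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE;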
  
Vlastimil Babka March 17, 2023, 12:06 p.m. UTC | #2
On 3/17/23 12:32, chenjun (AM) wrote:
> On 2023/3/14 22:41, Vlastimil Babka wrote:
>>>   	pc.flags = gfpflags;
>>> +
>>> +	/*
>>> +	 * when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE)
>>> +	 * 1) try to get a partial slab from target node with __GFP_THISNODE.
>>> +	 * 2) if 1) failed, try to allocate a new slab from target node with
>>> +	 *    __GFP_THISNODE.
>>> +	 * 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
>>> +	 */
>>> +	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
>>> +			pc.flags |= __GFP_THISNODE;
>> 
>> Hmm I'm thinking we should also perhaps remove direct reclaim possibilities
>> from the attempt 2). In your qemu test it should make no difference, as it
>> fills everything with kernel memory that is not reclaimable. But in practice
>> the target node might be filled with user memory, and I think it's better to
>> quickly allocate on a different node than spend time in direct reclaim. So
>> the following should work I think?
>> 
>> pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE
>> 
> 
> Hmm, Should it be that:
> 
> pc.flags |= GFP_NOWAIT | __GFP_NOWARN |__GFP_THISNODE

No, we need to ignore the other reclaim-related flags that the caller
passed, or it wouldn't work as intended.
The danger is that we ignore some flag that would be necessary to pass, but
I don't think there's any?
  
Vlastimil Babka March 20, 2023, 8:05 a.m. UTC | #3
On 3/19/23 08:22, chenjun (AM) wrote:
> On 2023/3/17 20:06, Vlastimil Babka wrote:
>> On 3/17/23 12:32, chenjun (AM) wrote:
>>> On 2023/3/14 22:41, Vlastimil Babka wrote:
>>>>>    	pc.flags = gfpflags;
>>>>> +
>>>>> +	/*
>>>>> +	 * when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE)
>>>>> +	 * 1) try to get a partial slab from target node with __GFP_THISNODE.
>>>>> +	 * 2) if 1) failed, try to allocate a new slab from target node with
>>>>> +	 *    __GFP_THISNODE.
>>>>> +	 * 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
>>>>> +	 */
>>>>> +	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
>>>>> +			pc.flags |= __GFP_THISNODE;
>>>>
>>>> Hmm I'm thinking we should also perhaps remove direct reclaim possibilities
>>>> from the attempt 2). In your qemu test it should make no difference, as it
>>>> fills everything with kernel memory that is not reclaimable. But in practice
>>>> the target node might be filled with user memory, and I think it's better to
>>>> quickly allocate on a different node than spend time in direct reclaim. So
>>>> the following should work I think?
>>>>
>>>> pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE
>>>>
>>>
>>> Hmm, Should it be that:
>>>
>>> pc.flags |= GFP_NOWAIT | __GFP_NOWARN |__GFP_THISNODE
>> 
>> No, we need to ignore the other reclaim-related flags that the caller
>> passed, or it wouldn't work as intended.
>> The danger is that we ignore some flag that would be necessary to pass, but
>> I don't think there's any?
>> 
>> 
> 
> If we ignore __GFP_ZERO passed by kzalloc, kzalloc will not work.
> Could we just unmask __GFP_RECLAIMABLE | __GFP_RECLAIM?
> 
> pc.flags &= ~(__GFP_RECLAIMABLE | __GFP_RECLAIM)
> pc.flags |= __GFP_THISNODE

__GFP_RECLAIMABLE would be wrong, but also ignored as new_slab() does:
	flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK)

which would filter out __GFP_ZERO as well. That's not a problem as kzalloc()
will zero out the individual allocated objects, so it doesn't matter if we
don't zero out the whole slab page.
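
For reference, the masking mentioned above sits on the new_slab() path,
roughly like this (paraphrasing mm/slub.c of this era; details may differ
between kernel versions):

static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	if (unlikely(flags & GFP_SLAB_BUG_MASK))
		flags = kmalloc_fix_flags(flags);

	/* only reclaim and constraint bits are passed on, so e.g. __GFP_ZERO
	 * is dropped before the slab page is allocated */
	return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}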

But I wonder, if we're not past due time for a helper e.g.
gfp_opportunistic(flags) that would turn any allocation flags to a
GFP_NOWAIT while keeping the rest of relevant flags intact, and thus there
would be one canonical way to do it - I'm sure there's a number of places
with their own variants now?
With such helper we'd just add __GFP_THISNODE to the result here as that's
specific to this particular opportunistic allocation.
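
One possible shape for such a helper, purely as a sketch (gfp_opportunistic()
does not exist, and the exact set of masked bits would be a judgment call):

#include <linux/gfp.h>

/* hypothetical helper, not an existing API: keep the caller's flags but
 * never enter direct reclaim, never insist on success, never warn */
static inline gfp_t gfp_opportunistic(gfp_t flags)
{
	return (flags & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL)) | __GFP_NOWARN;
}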
  
Mike Rapoport March 20, 2023, 9:12 a.m. UTC | #4
On Mon, Mar 20, 2023 at 09:05:57AM +0100, Vlastimil Babka wrote:
> On 3/19/23 08:22, chenjun (AM) wrote:
> > On 2023/3/17 20:06, Vlastimil Babka wrote:
> >> On 3/17/23 12:32, chenjun (AM) wrote:
> >>> On 2023/3/14 22:41, Vlastimil Babka wrote:
> >>>>>    	pc.flags = gfpflags;
> >>>>> +
> >>>>> +	/*
> >>>>> +	 * when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE)
> >>>>> +	 * 1) try to get a partial slab from target node with __GFP_THISNODE.
> >>>>> +	 * 2) if 1) failed, try to allocate a new slab from target node with
> >>>>> +	 *    __GFP_THISNODE.
> >>>>> +	 * 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
> >>>>> +	 */
> >>>>> +	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
> >>>>> +			pc.flags |= __GFP_THISNODE;
> >>>>
> >>>> Hmm I'm thinking we should also perhaps remove direct reclaim possibilities
> >>>> from the attempt 2). In your qemu test it should make no difference, as it
> >>>> fills everything with kernel memory that is not reclaimable. But in practice
> >>>> the target node might be filled with user memory, and I think it's better to
> >>>> quickly allocate on a different node than spend time in direct reclaim. So
> >>>> the following should work I think?
> >>>>
> >>>> pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE
> >>>>
> >>>
> >>> Hmm, Should it be that:
> >>>
> >>> pc.flags |= GFP_NOWAIT | __GFP_NOWARN |__GFP_THISNODE
> >> 
> >> No, we need to ignore the other reclaim-related flags that the caller
> >> passed, or it wouldn't work as intended.
> >> The danger is that we ignore some flag that would be necessary to pass, but
> >> I don't think there's any?
> >> 
> >> 
> > 
> > If we ignore __GFP_ZERO passed by kzalloc, kzalloc will not work.
> > Could we just unmask __GFP_RECLAIMABLE | __GFP_RECLAIM?
> > 
> > pc.flags &= ~(__GFP_RECLAIMABLE | __GFP_RECLAIM)
> > pc.flags |= __GFP_THISNODE
> 
> __GFP_RECLAIMABLE would be wrong, but also ignored as new_slab() does:
> 	flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK)
> 
> which would filter out __GFP_ZERO as well. That's not a problem as kzalloc()
> will zero out the individual allocated objects, so it doesn't matter if we
> don't zero out the whole slab page.
> 
> But I wonder, if we're not past due time for a helper e.g.
> gfp_opportunistic(flags) that would turn any allocation flags to a
> GFP_NOWAIT while keeping the rest of relevant flags intact, and thus there
> would be one canonical way to do it - I'm sure there's a number of places
> with their own variants now?
> With such helper we'd just add __GFP_THISNODE to the result here as that's
> specific to this particular opportunistic allocation.

I like the idea, but maybe gfp_no_reclaim() would be clearer?
  
Vlastimil Babka March 21, 2023, 9:41 a.m. UTC | #5
On 3/20/23 10:12, Mike Rapoport wrote:
> On Mon, Mar 20, 2023 at 09:05:57AM +0100, Vlastimil Babka wrote:
>> On 3/19/23 08:22, chenjun (AM) wrote:
>> > On 2023/3/17 20:06, Vlastimil Babka wrote:
>> > 
>> > If we ignore __GFP_ZERO passed by kzalloc, kzalloc will not work.
>> > Could we just unmask __GFP_RECLAIMABLE | __GFP_RECLAIM?
>> > 
>> > pc.flags &= ~(__GFP_RECLAIMABLE | __GFP_RECLAIM)
>> > pc.flags |= __GFP_THISNODE
>> 
>> __GFP_RECLAIMABLE would be wrong, but also ignored as new_slab() does:
>> 	flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK)
>> 
>> which would filter out __GFP_ZERO as well. That's not a problem as kzalloc()
>> will zero out the individual allocated objects, so it doesn't matter if we
>> don't zero out the whole slab page.
>> 
>> But I wonder, if we're not past due time for a helper e.g.
>> gfp_opportunistic(flags) that would turn any allocation flags to a
>> GFP_NOWAIT while keeping the rest of relevant flags intact, and thus there
>> would be one canonical way to do it - I'm sure there's a number of places
>> with their own variants now?
>> With such helper we'd just add __GFP_THISNODE to the result here as that's
>> specific to this particular opportunistic allocation.
> 
> I like the idea, but maybe gfp_no_reclaim() would be clearer?

Well, that name would say how it's implemented, but not exactly, as we also
want to add __GFP_NOWARN. "gfp_opportunistic()" or a better name with a
similar meaning was meant to convey the intention of what this allocation is
trying to do, and I think that's better from the API user's POV?
  
Vlastimil Babka March 29, 2023, 8:41 a.m. UTC | #6
On 3/21/23 10:30, chenjun (AM) wrote:
> On 2023/3/20 17:12, Mike Rapoport wrote:
>>>>
>>>> If we ignore __GFP_ZERO passed by kzalloc, kzalloc will not work.
>>>> Could we just unmask __GFP_RECLAIMABLE | __GFP_RECLAIM?
>>>>
>>>> pc.flags &= ~(__GFP_RECLAIMABLE | __GFP_RECLAIM)
>>>> pc.flags |= __GFP_THISNODE
>>>
>>> __GFP_RECLAIMABLE would be wrong, but also ignored as new_slab() does:
>>> 	flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK)
>>>
>>> which would filter out __GFP_ZERO as well. That's not a problem as kzalloc()
>>> will zero out the individual allocated objects, so it doesn't matter if we
>>> don't zero out the whole slab page.
>>>
>>> But I wonder, if we're not past due time for a helper e.g.
>>> gfp_opportunistic(flags) that would turn any allocation flags to a
>>> GFP_NOWAIT while keeping the rest of relevant flags intact, and thus there
>>> would be one canonical way to do it - I'm sure there's a number of places
>>> with their own variants now?
>>> With such helper we'd just add __GFP_THISNODE to the result here as that's
>>> specific to this particular opportunistic allocation.
>> 
>> I like the idea, but maybe gfp_no_reclaim() would be clearer?
>> 
> 
> #define gfp_no_reclaim(gfpflag) (gfpflag & ~__GFP_DIRECT_RECLAIM)

I hoped for more feedback on the idea, but it's probably best proposed
outside of this slub-specific thread, so we could go for an open-coded
solution in slub for now.

Also, just masking out __GFP_DIRECT_RECLAIM wouldn't be sufficient in any
case for the general solution.

> And here,
> 
> pc.flags = gfp_no_reclaim(gfpflags) | __GFP_THISNODE.

I'd still suggest as earlier:

pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE;

> Do I get it right?
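
To summarize where the discussion converged, the slow path in a respin might
read roughly as follows (a sketch combining the posted patch with the
suggestion above, not an actual posted version):

new_objects:
	pc.flags = gfpflags;
	/* first pass: restrict to the requested node without blocking */
	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
		pc.flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_THISNODE;
	...
	slab = new_slab(s, pc.flags, node);
	...
	if (unlikely(!slab)) {
		/* second pass: drop the node preference and retry */
		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode) {
			try_thisnode = false;
			goto new_objects;
		}
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}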
  

Patch

diff --git a/mm/slub.c b/mm/slub.c
index 39327e98fce3..32e436957e03 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2384,7 +2384,7 @@  static void *get_partial(struct kmem_cache *s, int node, struct partial_context
 		searchnode = numa_mem_id();
 
 	object = get_partial_node(s, get_node(s, searchnode), pc);
-	if (object || node != NUMA_NO_NODE)
+	if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
 		return object;
 
 	return get_any_partial(s, pc);
@@ -3069,6 +3069,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	struct slab *slab;
 	unsigned long flags;
 	struct partial_context pc;
+	bool try_thisnode = true;
 
 	stat(s, ALLOC_SLOWPATH);
 
@@ -3181,8 +3182,18 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	}
 
 new_objects:
-
 	pc.flags = gfpflags;
+
+	/*
+	 * when (node != NUMA_NO_NODE) && (gfpflags & __GFP_THISNODE)
+	 * 1) try to get a partial slab from target node with __GFP_THISNODE.
+	 * 2) if 1) failed, try to allocate a new slab from target node with
+	 *    __GFP_THISNODE.
+	 * 3) if 2) failed, retry 1) and 2) without __GFP_THISNODE constraint.
+	 */
+	if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)
+			pc.flags |= __GFP_THISNODE;
+
 	pc.slab = &slab;
 	pc.orig_size = orig_size;
 	freelist = get_partial(s, node, &pc);
@@ -3190,10 +3201,15 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto check_new_slab;
 
 	slub_put_cpu_ptr(s->cpu_slab);
-	slab = new_slab(s, gfpflags, node);
+	slab = new_slab(s, pc.flags, node);
 	c = slub_get_cpu_ptr(s->cpu_slab);
 
 	if (unlikely(!slab)) {
+		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode) {
+			try_thisnode = false;
+			goto new_objects;
+		}
+
 		slab_out_of_memory(s, gfpflags, node);
 		return NULL;
 	}