slub: avoid scanning all partial slabs in get_slabinfo()

Message ID 20240215211457.32172-1-jianfeng.w.wang@oracle.com
State New
Series slub: avoid scanning all partial slabs in get_slabinfo()

Commit Message

Jianfeng Wang Feb. 15, 2024, 9:14 p.m. UTC
  When reading "/proc/slabinfo", the kernel needs to report the number of
free objects for each kmem_cache. The current implementation relies on
count_partial() that counts the number of free objects by scanning each
kmem_cache_node's partial slab list and summing free objects from all
partial slabs in the list. This process must hold the per-kmem_cache_node
spinlock and disable IRQs. Consequently, it can block slab allocation
requests on other CPU cores and cause timeouts for network devices etc.,
if the partial slab list is long. In production, even the NMI watchdog can
be triggered because some slab caches have a long partial list: e.g.,
for "buffer_head", the number of partial slabs was observed to be ~1M
in one kmem_cache_node. This problem was also observed by several
others [1-2] in the past.

The fix is to maintain a counter of free objects for each kmem_cache.
Then, in get_slabinfo(), use the counter rather than count_partial()
when reporting the number of free objects for a slab cache. A per-cpu
counter is used to minimize atomic and locking operations.
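
For reference, this builds on the existing percpu_counter API; a minimal
sketch of the pattern (the real hooks and CONFIG_SLUB_DEBUG guards are in
the diff below):

```
#include <linux/gfp.h>
#include <linux/limits.h>
#include <linux/percpu_counter.h>

static struct percpu_counter free_objects;

static int counter_setup(void)
{
	/* allocates the per-cpu deltas; paired with percpu_counter_destroy() */
	return percpu_counter_init(&free_objects, 0, GFP_KERNEL);
}

static void counter_update(s64 delta)
{
	/* with a batch of INT_MAX, updates stay in the per-cpu deltas */
	percpu_counter_add_batch(&free_objects, delta, INT_MAX);
}

static s64 counter_read(void)
{
	/* sums the per-cpu deltas once per read; no partial-list scan */
	return percpu_counter_sum_positive(&free_objects);
}
```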

Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
The command is "hackbench 18 thread 20000". Each group gets 10 runs.

Results:
- Mainline:
21.0381 +- 0.0325 seconds time elapsed  ( +-  0.15% )
- Mainline w/ this patch:
21.1878 +- 0.0239 seconds time elapsed  ( +-  0.11% )

[1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/
[2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/

Signed-off-by: Jianfeng Wang <jianfeng.w.wang@oracle.com>
---
 mm/slab.h |  4 ++++
 mm/slub.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)
  

Comments

David Rientjes Feb. 18, 2024, 7:25 p.m. UTC | #1
On Thu, 15 Feb 2024, Jianfeng Wang wrote:

> When reading "/proc/slabinfo", the kernel needs to report the number of
> free objects for each kmem_cache. The current implementation relies on
> count_partial() that counts the number of free objects by scanning each
> kmem_cache_node's partial slab list and summing free objects from all
> partial slabs in the list. This process must hold per kmem_cache_node
> spinlock and disable IRQ. Consequently, it can block slab allocation
> requests on other CPU cores and cause timeouts for network devices etc.,
> if the partial slab list is long. In production, even NMI watchdog can
> be triggered because some slab caches have a long partial list: e.g.,
> for "buffer_head", the number of partial slabs was observed to be ~1M
> in one kmem_cache_node. This problem was also observed by several
> others [1-2] in the past.
> 
> The fix is to maintain a counter of free objects for each kmem_cache.
> Then, in get_slabinfo(), use the counter rather than count_partial()
> when reporting the number of free objects for a slab cache. per-cpu
> counter is used to minimize atomic or lock operation.
> 
> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
> 

This seems particularly intrusive for the common path to optimize for 
reading of /proc/slabinfo, and that's shown in the benchmark result.

Could you discuss the /proc/slabinfo usage model a bit?  It's not clear if 
this is being continuously read, or whether even a single read in 
isolation is problematic.

That said, optimizing for reading /proc/slabinfo at the cost of runtime 
performance degradation doesn't sound like the right trade-off.

> Results:
> - Mainline:
> 21.0381 +- 0.0325 seconds time elapsed  ( +-  0.15% )
> - Mainline w/ this patch:
> 21.1878 +- 0.0239 seconds time elapsed  ( +-  0.11% )
> 
> [1] https://lore.kernel.org/linux-mm/
> alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/
> [2] https://lore.kernel.org/lkml/
> alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/
> 
> Signed-off-by: Jianfeng Wang <jianfeng.w.wang@oracle.com>
> ---
>  mm/slab.h |  4 ++++
>  mm/slub.c | 31 +++++++++++++++++++++++++++++--
>  2 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 54deeb0428c6..a0e7672ba648 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -11,6 +11,7 @@
>  #include <linux/memcontrol.h>
>  #include <linux/kfence.h>
>  #include <linux/kasan.h>
> +#include <linux/percpu_counter.h>
>  
>  /*
>   * Internal slab definitions
> @@ -277,6 +278,9 @@ struct kmem_cache {
>  	unsigned int red_left_pad;	/* Left redzone padding size */
>  	const char *name;		/* Name (only for display!) */
>  	struct list_head list;		/* List of slab caches */
> +#ifdef CONFIG_SLUB_DEBUG
> +	struct percpu_counter free_objects;
> +#endif
>  #ifdef CONFIG_SYSFS
>  	struct kobject kobj;		/* For sysfs */
>  #endif
> diff --git a/mm/slub.c b/mm/slub.c
> index 2ef88bbf56a3..44f8ded96574 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -736,6 +736,12 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
>  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
>  static DEFINE_SPINLOCK(object_map_lock);
>  
> +static inline void
> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta)
> +{
> +	percpu_counter_add_batch(&s->free_objects, delta, INT_MAX);
> +}
> +
>  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
>  		       struct slab *slab)
>  {
> @@ -1829,6 +1835,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
>  	return flags | slub_debug_local;
>  }
>  #else /* !CONFIG_SLUB_DEBUG */
> +static inline void
> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta) {}
> +
>  static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
>  static inline
>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
> @@ -2369,6 +2378,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  	slab->inuse = 0;
>  	slab->frozen = 0;
>  
> +	__update_kmem_cache_free_objs(s, slab->objects);
>  	account_slab(slab, oo_order(oo), s, flags);
>  
>  	slab->slab_cache = s;
> @@ -2445,6 +2455,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
>  		call_rcu(&slab->rcu_head, rcu_free_slab);
>  	else
>  		__free_slab(s, slab);
> +	__update_kmem_cache_free_objs(s, -slab->objects);
>  }
>  
>  static void discard_slab(struct kmem_cache *s, struct slab *slab)
> @@ -3859,6 +3870,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>  	 */
>  	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
>  
> +	if (object)
> +		__update_kmem_cache_free_objs(s, -1);
>  	return object;
>  }
>  
> @@ -4235,6 +4248,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>  	unsigned long tid;
>  	void **freelist;
>  
> +	__update_kmem_cache_free_objs(s, cnt);
>  redo:
>  	/*
>  	 * Determine the currently cpus per cpu slab.
> @@ -4286,6 +4300,7 @@ static void do_slab_free(struct kmem_cache *s,
>  				struct slab *slab, void *head, void *tail,
>  				int cnt, unsigned long addr)
>  {
> +	__update_kmem_cache_free_objs(s, cnt);
>  	__slab_free(s, slab, head, tail, cnt, addr);
>  }
>  #endif /* CONFIG_SLUB_TINY */
> @@ -4658,6 +4673,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>  		memcg_slab_alloc_error_hook(s, size, objcg);
>  	}
>  
> +	__update_kmem_cache_free_objs(s, -i);
>  	return i;
>  }
>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
> @@ -4899,6 +4915,9 @@ void __kmem_cache_release(struct kmem_cache *s)
>  	cache_random_seq_destroy(s);
>  #ifndef CONFIG_SLUB_TINY
>  	free_percpu(s->cpu_slab);
> +#endif
> +#ifdef CONFIG_SLUB_DEBUG
> +	percpu_counter_destroy(&s->free_objects);
>  #endif
>  	free_kmem_cache_nodes(s);
>  }
> @@ -5109,6 +5128,14 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>  	s->random = get_random_long();
>  #endif
>  
> +#ifdef CONFIG_SLUB_DEBUG
> +	int ret;
> +
> +	ret = percpu_counter_init(&s->free_objects, 0, GFP_KERNEL);
> +	if (ret)
> +		return ret;
> +#endif
> +
>  	if (!calculate_sizes(s))
>  		goto error;
>  	if (disable_higher_order_debug) {
> @@ -7100,15 +7127,15 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
>  {
>  	unsigned long nr_slabs = 0;
>  	unsigned long nr_objs = 0;
> -	unsigned long nr_free = 0;
> +	unsigned long nr_free;
>  	int node;
>  	struct kmem_cache_node *n;
>  
>  	for_each_kmem_cache_node(s, node, n) {
>  		nr_slabs += node_nr_slabs(n);
>  		nr_objs += node_nr_objs(n);
> -		nr_free += count_partial(n, count_free);
>  	}
> +	nr_free = percpu_counter_sum_positive(&s->free_objects);
>  
>  	sinfo->active_objs = nr_objs - nr_free;
>  	sinfo->num_objs = nr_objs;
> -- 
> 2.42.1
> 
>
  
Vlastimil Babka Feb. 19, 2024, 8:30 a.m. UTC | #2
On 2/18/24 20:25, David Rientjes wrote:
> On Thu, 15 Feb 2024, Jianfeng Wang wrote:
> 
>> When reading "/proc/slabinfo", the kernel needs to report the number of
>> free objects for each kmem_cache. The current implementation relies on
>> count_partial() that counts the number of free objects by scanning each
>> kmem_cache_node's partial slab list and summing free objects from all
>> partial slabs in the list. This process must hold per kmem_cache_node
>> spinlock and disable IRQ. Consequently, it can block slab allocation
>> requests on other CPU cores and cause timeouts for network devices etc.,
>> if the partial slab list is long. In production, even NMI watchdog can
>> be triggered because some slab caches have a long partial list: e.g.,
>> for "buffer_head", the number of partial slabs was observed to be ~1M
>> in one kmem_cache_node. This problem was also observed by several
>> others [1-2] in the past.
>> 
>> The fix is to maintain a counter of free objects for each kmem_cache.
>> Then, in get_slabinfo(), use the counter rather than count_partial()
>> when reporting the number of free objects for a slab cache. per-cpu
>> counter is used to minimize atomic or lock operation.
>> 
>> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
>> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
>> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
>> 
> 
> This seems particularly intrusive for the common path to optimize for 
> reading of /proc/slabinfo, and that's shown in the benchmark result.
> 
> Could you discuss the /proc/slabinfo usage model a bit?  It's not clear if 
> this is being continuously read, or whether even a single read in 
> isolation is problematic.
> 
> That said, optimizing for reading /proc/slabinfo at the cost of runtime 
> performance degradation doesn't sound like the right trade-off.

It should be possible to make this overhead smaller by restricting the
counter only to partial list slabs, as [2] did. This would keep it out of
the fast paths, where it's really not acceptable.
Note [2] used atomic_long_t and the percpu counters used here should have
lower overhead. So basically try to get the best of both attempts.
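
Roughly something like this (a hypothetical sketch only; the per-node field
name below is made up, it is not from [2] or from this patch):

```
/*
 * "partial_free_objs" would be a new struct percpu_counter in
 * struct kmem_cache_node, updated only where the node's list_lock is
 * already held, so the allocation/free fast paths stay untouched.
 */
static inline void partial_free_add(struct kmem_cache_node *n, struct slab *slab)
{
	percpu_counter_add(&n->partial_free_objs, slab->objects - slab->inuse);
}

static inline void partial_free_del(struct kmem_cache_node *n, struct slab *slab)
{
	percpu_counter_sub(&n->partial_free_objs, slab->objects - slab->inuse);
}
```

Frees that hit a slab already sitting on the partial list would still bypass
such a counter, so it could only ever be approximate.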

>> Results:
>> - Mainline:
>> 21.0381 +- 0.0325 seconds time elapsed  ( +-  0.15% )
>> - Mainline w/ this patch:
>> 21.1878 +- 0.0239 seconds time elapsed  ( +-  0.11% )
>> 
>> [1] https://lore.kernel.org/linux-mm/
>> alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/
>> [2] https://lore.kernel.org/lkml/
>> alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/
>> 
>> Signed-off-by: Jianfeng Wang <jianfeng.w.wang@oracle.com>
>> ---
>>  mm/slab.h |  4 ++++
>>  mm/slub.c | 31 +++++++++++++++++++++++++++++--
>>  2 files changed, 33 insertions(+), 2 deletions(-)
>> 
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 54deeb0428c6..a0e7672ba648 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -11,6 +11,7 @@
>>  #include <linux/memcontrol.h>
>>  #include <linux/kfence.h>
>>  #include <linux/kasan.h>
>> +#include <linux/percpu_counter.h>
>>  
>>  /*
>>   * Internal slab definitions
>> @@ -277,6 +278,9 @@ struct kmem_cache {
>>  	unsigned int red_left_pad;	/* Left redzone padding size */
>>  	const char *name;		/* Name (only for display!) */
>>  	struct list_head list;		/* List of slab caches */
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	struct percpu_counter free_objects;
>> +#endif
>>  #ifdef CONFIG_SYSFS
>>  	struct kobject kobj;		/* For sysfs */
>>  #endif
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 2ef88bbf56a3..44f8ded96574 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -736,6 +736,12 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
>>  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
>>  static DEFINE_SPINLOCK(object_map_lock);
>>  
>> +static inline void
>> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta)
>> +{
>> +	percpu_counter_add_batch(&s->free_objects, delta, INT_MAX);
>> +}
>> +
>>  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
>>  		       struct slab *slab)
>>  {
>> @@ -1829,6 +1835,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
>>  	return flags | slub_debug_local;
>>  }
>>  #else /* !CONFIG_SLUB_DEBUG */
>> +static inline void
>> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta) {}
>> +
>>  static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
>>  static inline
>>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>> @@ -2369,6 +2378,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>>  	slab->inuse = 0;
>>  	slab->frozen = 0;
>>  
>> +	__update_kmem_cache_free_objs(s, slab->objects);
>>  	account_slab(slab, oo_order(oo), s, flags);
>>  
>>  	slab->slab_cache = s;
>> @@ -2445,6 +2455,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
>>  		call_rcu(&slab->rcu_head, rcu_free_slab);
>>  	else
>>  		__free_slab(s, slab);
>> +	__update_kmem_cache_free_objs(s, -slab->objects);
>>  }
>>  
>>  static void discard_slab(struct kmem_cache *s, struct slab *slab)
>> @@ -3859,6 +3870,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>>  	 */
>>  	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
>>  
>> +	if (object)
>> +		__update_kmem_cache_free_objs(s, -1);
>>  	return object;
>>  }
>>  
>> @@ -4235,6 +4248,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>>  	unsigned long tid;
>>  	void **freelist;
>>  
>> +	__update_kmem_cache_free_objs(s, cnt);
>>  redo:
>>  	/*
>>  	 * Determine the currently cpus per cpu slab.
>> @@ -4286,6 +4300,7 @@ static void do_slab_free(struct kmem_cache *s,
>>  				struct slab *slab, void *head, void *tail,
>>  				int cnt, unsigned long addr)
>>  {
>> +	__update_kmem_cache_free_objs(s, cnt);
>>  	__slab_free(s, slab, head, tail, cnt, addr);
>>  }
>>  #endif /* CONFIG_SLUB_TINY */
>> @@ -4658,6 +4673,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>>  		memcg_slab_alloc_error_hook(s, size, objcg);
>>  	}
>>  
>> +	__update_kmem_cache_free_objs(s, -i);
>>  	return i;
>>  }
>>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>> @@ -4899,6 +4915,9 @@ void __kmem_cache_release(struct kmem_cache *s)
>>  	cache_random_seq_destroy(s);
>>  #ifndef CONFIG_SLUB_TINY
>>  	free_percpu(s->cpu_slab);
>> +#endif
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	percpu_counter_destroy(&s->free_objects);
>>  #endif
>>  	free_kmem_cache_nodes(s);
>>  }
>> @@ -5109,6 +5128,14 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>>  	s->random = get_random_long();
>>  #endif
>>  
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	int ret;
>> +
>> +	ret = percpu_counter_init(&s->free_objects, 0, GFP_KERNEL);
>> +	if (ret)
>> +		return ret;
>> +#endif
>> +
>>  	if (!calculate_sizes(s))
>>  		goto error;
>>  	if (disable_higher_order_debug) {
>> @@ -7100,15 +7127,15 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
>>  {
>>  	unsigned long nr_slabs = 0;
>>  	unsigned long nr_objs = 0;
>> -	unsigned long nr_free = 0;
>> +	unsigned long nr_free;
>>  	int node;
>>  	struct kmem_cache_node *n;
>>  
>>  	for_each_kmem_cache_node(s, node, n) {
>>  		nr_slabs += node_nr_slabs(n);
>>  		nr_objs += node_nr_objs(n);
>> -		nr_free += count_partial(n, count_free);
>>  	}
>> +	nr_free = percpu_counter_sum_positive(&s->free_objects);
>>  
>>  	sinfo->active_objs = nr_objs - nr_free;
>>  	sinfo->num_objs = nr_objs;
>> -- 
>> 2.42.1
>> 
>>
  
Chengming Zhou Feb. 19, 2024, 9:29 a.m. UTC | #3
On 2024/2/19 16:30, Vlastimil Babka wrote:
> On 2/18/24 20:25, David Rientjes wrote:
>> On Thu, 15 Feb 2024, Jianfeng Wang wrote:
>>
>>> When reading "/proc/slabinfo", the kernel needs to report the number of
>>> free objects for each kmem_cache. The current implementation relies on
>>> count_partial() that counts the number of free objects by scanning each
>>> kmem_cache_node's partial slab list and summing free objects from all
>>> partial slabs in the list. This process must hold per kmem_cache_node
>>> spinlock and disable IRQ. Consequently, it can block slab allocation
>>> requests on other CPU cores and cause timeouts for network devices etc.,
>>> if the partial slab list is long. In production, even NMI watchdog can
>>> be triggered because some slab caches have a long partial list: e.g.,
>>> for "buffer_head", the number of partial slabs was observed to be ~1M
>>> in one kmem_cache_node. This problem was also observed by several

Not sure if this situation is normal? It may be very fragmented, right?

SLUB depends entirely on the timing order to place partial slabs in the node
list, which may be suboptimal in some cases. Maybe we could introduce an
anti-fragmentation mechanism like the fullness grouping in zsmalloc, with
multiple lists based on fullness? Just some random thoughts... :)

>>> others [1-2] in the past.
>>>
>>> The fix is to maintain a counter of free objects for each kmem_cache.
>>> Then, in get_slabinfo(), use the counter rather than count_partial()
>>> when reporting the number of free objects for a slab cache. per-cpu
>>> counter is used to minimize atomic or lock operation.
>>>
>>> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
>>> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
>>> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
>>>
>>
>> This seems particularly intrusive for the common path to optimize for 
>> reading of /proc/slabinfo, and that's shown in the benchmark result.
>>
>> Could you discuss the /proc/slabinfo usage model a bit?  It's not clear if 
>> this is being continuously read, or whether even a single read in 
>> isolation is problematic.
>>
>> That said, optimizing for reading /proc/slabinfo at the cost of runtime 
>> performance degradation doesn't sound like the right trade-off.
> 
> It should be possible to make this overhead smaller by restricting the
> counter only to partial list slabs, as [2] did. This would keep it out of
> the fast paths, where it's really not acceptable.
> Note [2] used atomic_long_t and the percpu counters used here should be
> lower overhead. So basically try to get the best of both attemps.

Right, the current count_partial() also only iterates over slabs on the
node partial list and doesn't include slabs on the cpu partial lists. So this
new percpu counter should likewise only include slabs on the node partial
list. Then the overhead should be lower.
  
Vlastimil Babka Feb. 19, 2024, 10:17 a.m. UTC | #4
On 2/19/24 10:29, Chengming Zhou wrote:
> On 2024/2/19 16:30, Vlastimil Babka wrote:
>> On 2/18/24 20:25, David Rientjes wrote:
>>> On Thu, 15 Feb 2024, Jianfeng Wang wrote:
>>>
>>>> When reading "/proc/slabinfo", the kernel needs to report the number of
>>>> free objects for each kmem_cache. The current implementation relies on
>>>> count_partial() that counts the number of free objects by scanning each
>>>> kmem_cache_node's partial slab list and summing free objects from all
>>>> partial slabs in the list. This process must hold per kmem_cache_node
>>>> spinlock and disable IRQ. Consequently, it can block slab allocation
>>>> requests on other CPU cores and cause timeouts for network devices etc.,
>>>> if the partial slab list is long. In production, even NMI watchdog can
>>>> be triggered because some slab caches have a long partial list: e.g.,
>>>> for "buffer_head", the number of partial slabs was observed to be ~1M
>>>> in one kmem_cache_node. This problem was also observed by several
> 
> Not sure if this situation is normal? It maybe very fragmented, right?
> 
> SLUB completely depend on the timing order to place partial slabs in node,
> which maybe suboptimal in some cases. Maybe we could introduce anti-fragment
> mechanism like fullness grouping in zsmalloc to have multiple lists based
> on fullness grouping? Just some random thoughts... :)

Most likely that wouldn't be feasible. Freeing to a slab on the partial
list is just a cmpxchg128 (unless the slab becomes empty), and the additional
list manipulation needed to maintain the grouping would kill the performance.
  
Jianfeng Wang Feb. 20, 2024, 6:41 p.m. UTC | #5
On 2/18/24 11:25 AM, David Rientjes wrote:
> On Thu, 15 Feb 2024, Jianfeng Wang wrote:
> 
>> When reading "/proc/slabinfo", the kernel needs to report the number of
>> free objects for each kmem_cache. The current implementation relies on
>> count_partial() that counts the number of free objects by scanning each
>> kmem_cache_node's partial slab list and summing free objects from all
>> partial slabs in the list. This process must hold per kmem_cache_node
>> spinlock and disable IRQ. Consequently, it can block slab allocation
>> requests on other CPU cores and cause timeouts for network devices etc.,
>> if the partial slab list is long. In production, even NMI watchdog can
>> be triggered because some slab caches have a long partial list: e.g.,
>> for "buffer_head", the number of partial slabs was observed to be ~1M
>> in one kmem_cache_node. This problem was also observed by several
>> others [1-2] in the past.
>>
>> The fix is to maintain a counter of free objects for each kmem_cache.
>> Then, in get_slabinfo(), use the counter rather than count_partial()
>> when reporting the number of free objects for a slab cache. per-cpu
>> counter is used to minimize atomic or lock operation.
>>
>> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
>> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
>> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
>>
> 
> This seems particularly intrusive for the common path to optimize for 
> reading of /proc/slabinfo, and that's shown in the benchmark result.
> 
> Could you discuss the /proc/slabinfo usage model a bit?  It's not clear if 
> this is being continuously read, or whether even a single read in 
> isolation is problematic.
> 
There are several use cases of /proc/slabinfo.

Some monitoring tools run in the background to collect system-level
traces for performance analysis and debugging. Collecting slabinfo is
one of the tasks these tools perform.

For example, I've seen an OS crash case: based on kdump, one core runs a
"cat" process which runs count_partial() on a kmem_cache_node of the
"buffer_head" slab cache; the kmem_cache_node has 1,103,484 partial
slabs; many application threads run on other cores and are in
__slab_alloc(), waiting for the kmem_cache_node's spinlock. This leads
to the node crashing.

Meanwhile, some engineers/customers may collect slabinfo when
conducting performance analysis, most likely when nodes are under
memory pressure.

Both cases can trigger soft lockups in production. There are several
similar reports. Thus, we'd like to resolve this type of issue completely
by getting rid of count_partial() in get_slabinfo(). I believe a proper
fix is wanted by several others who have also reported the problem.

> That said, optimizing for reading /proc/slabinfo at the cost of runtime 
> performance degradation doesn't sound like the right trade-off.
> 
>> Results:
>> - Mainline:
>> 21.0381 +- 0.0325 seconds time elapsed  ( +-  0.15% )
>> - Mainline w/ this patch:
>> 21.1878 +- 0.0239 seconds time elapsed  ( +-  0.11% )
>>
>> [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.21.2003031602460.1537@www.lameter.com/T/
>> [2] https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2008071258020.55871@www.lameter.com/T/
>>
>> Signed-off-by: Jianfeng Wang <jianfeng.w.wang@oracle.com>
>> ---
>>  mm/slab.h |  4 ++++
>>  mm/slub.c | 31 +++++++++++++++++++++++++++++--
>>  2 files changed, 33 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 54deeb0428c6..a0e7672ba648 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -11,6 +11,7 @@
>>  #include <linux/memcontrol.h>
>>  #include <linux/kfence.h>
>>  #include <linux/kasan.h>
>> +#include <linux/percpu_counter.h>
>>  
>>  /*
>>   * Internal slab definitions
>> @@ -277,6 +278,9 @@ struct kmem_cache {
>>  	unsigned int red_left_pad;	/* Left redzone padding size */
>>  	const char *name;		/* Name (only for display!) */
>>  	struct list_head list;		/* List of slab caches */
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	struct percpu_counter free_objects;
>> +#endif
>>  #ifdef CONFIG_SYSFS
>>  	struct kobject kobj;		/* For sysfs */
>>  #endif
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 2ef88bbf56a3..44f8ded96574 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -736,6 +736,12 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
>>  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
>>  static DEFINE_SPINLOCK(object_map_lock);
>>  
>> +static inline void
>> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta)
>> +{
>> +	percpu_counter_add_batch(&s->free_objects, delta, INT_MAX);
>> +}
>> +
>>  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
>>  		       struct slab *slab)
>>  {
>> @@ -1829,6 +1835,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
>>  	return flags | slub_debug_local;
>>  }
>>  #else /* !CONFIG_SLUB_DEBUG */
>> +static inline void
>> +__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta) {}
>> +
>>  static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
>>  static inline
>>  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>> @@ -2369,6 +2378,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>>  	slab->inuse = 0;
>>  	slab->frozen = 0;
>>  
>> +	__update_kmem_cache_free_objs(s, slab->objects);
>>  	account_slab(slab, oo_order(oo), s, flags);
>>  
>>  	slab->slab_cache = s;
>> @@ -2445,6 +2455,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
>>  		call_rcu(&slab->rcu_head, rcu_free_slab);
>>  	else
>>  		__free_slab(s, slab);
>> +	__update_kmem_cache_free_objs(s, -slab->objects);
>>  }
>>  
>>  static void discard_slab(struct kmem_cache *s, struct slab *slab)
>> @@ -3859,6 +3870,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
>>  	 */
>>  	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
>>  
>> +	if (object)
>> +		__update_kmem_cache_free_objs(s, -1);
>>  	return object;
>>  }
>>  
>> @@ -4235,6 +4248,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>>  	unsigned long tid;
>>  	void **freelist;
>>  
>> +	__update_kmem_cache_free_objs(s, cnt);
>>  redo:
>>  	/*
>>  	 * Determine the currently cpus per cpu slab.
>> @@ -4286,6 +4300,7 @@ static void do_slab_free(struct kmem_cache *s,
>>  				struct slab *slab, void *head, void *tail,
>>  				int cnt, unsigned long addr)
>>  {
>> +	__update_kmem_cache_free_objs(s, cnt);
>>  	__slab_free(s, slab, head, tail, cnt, addr);
>>  }
>>  #endif /* CONFIG_SLUB_TINY */
>> @@ -4658,6 +4673,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
>>  		memcg_slab_alloc_error_hook(s, size, objcg);
>>  	}
>>  
>> +	__update_kmem_cache_free_objs(s, -i);
>>  	return i;
>>  }
>>  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
>> @@ -4899,6 +4915,9 @@ void __kmem_cache_release(struct kmem_cache *s)
>>  	cache_random_seq_destroy(s);
>>  #ifndef CONFIG_SLUB_TINY
>>  	free_percpu(s->cpu_slab);
>> +#endif
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	percpu_counter_destroy(&s->free_objects);
>>  #endif
>>  	free_kmem_cache_nodes(s);
>>  }
>> @@ -5109,6 +5128,14 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
>>  	s->random = get_random_long();
>>  #endif
>>  
>> +#ifdef CONFIG_SLUB_DEBUG
>> +	int ret;
>> +
>> +	ret = percpu_counter_init(&s->free_objects, 0, GFP_KERNEL);
>> +	if (ret)
>> +		return ret;
>> +#endif
>> +
>>  	if (!calculate_sizes(s))
>>  		goto error;
>>  	if (disable_higher_order_debug) {
>> @@ -7100,15 +7127,15 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
>>  {
>>  	unsigned long nr_slabs = 0;
>>  	unsigned long nr_objs = 0;
>> -	unsigned long nr_free = 0;
>> +	unsigned long nr_free;
>>  	int node;
>>  	struct kmem_cache_node *n;
>>  
>>  	for_each_kmem_cache_node(s, node, n) {
>>  		nr_slabs += node_nr_slabs(n);
>>  		nr_objs += node_nr_objs(n);
>> -		nr_free += count_partial(n, count_free);
>>  	}
>> +	nr_free = percpu_counter_sum_positive(&s->free_objects);
>>  
>>  	sinfo->active_objs = nr_objs - nr_free;
>>  	sinfo->num_objs = nr_objs;
>> -- 
>> 2.42.1
>>
>>
  
Chengming Zhou Feb. 22, 2024, 1:20 p.m. UTC | #6
On 2024/2/19 16:30, Vlastimil Babka wrote:
> On 2/18/24 20:25, David Rientjes wrote:
>> On Thu, 15 Feb 2024, Jianfeng Wang wrote:
>>
>>> When reading "/proc/slabinfo", the kernel needs to report the number of
>>> free objects for each kmem_cache. The current implementation relies on
>>> count_partial() that counts the number of free objects by scanning each
>>> kmem_cache_node's partial slab list and summing free objects from all
>>> partial slabs in the list. This process must hold per kmem_cache_node
>>> spinlock and disable IRQ. Consequently, it can block slab allocation
>>> requests on other CPU cores and cause timeouts for network devices etc.,
>>> if the partial slab list is long. In production, even NMI watchdog can
>>> be triggered because some slab caches have a long partial list: e.g.,
>>> for "buffer_head", the number of partial slabs was observed to be ~1M
>>> in one kmem_cache_node. This problem was also observed by several
>>> others [1-2] in the past.
>>>
>>> The fix is to maintain a counter of free objects for each kmem_cache.
>>> Then, in get_slabinfo(), use the counter rather than count_partial()
>>> when reporting the number of free objects for a slab cache. per-cpu
>>> counter is used to minimize atomic or lock operation.
>>>
>>> Benchmark: run hackbench on a dual-socket 72-CPU bare metal machine
>>> with 256 GB memory and Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.3 GHz.
>>> The command is "hackbench 18 thread 20000". Each group gets 10 runs.
>>>
>>
>> This seems particularly intrusive for the common path to optimize for 
>> reading of /proc/slabinfo, and that's shown in the benchmark result.
>>
>> Could you discuss the /proc/slabinfo usage model a bit?  It's not clear if 
>> this is being continuously read, or whether even a single read in 
>> isolation is problematic.
>>
>> That said, optimizing for reading /proc/slabinfo at the cost of runtime 
>> performance degradation doesn't sound like the right trade-off.
> 
> It should be possible to make this overhead smaller by restricting the
> counter only to partial list slabs, as [2] did. This would keep it out of
> the fast paths, where it's really not acceptable.
> Note [2] used atomic_long_t and the percpu counters used here should be
> lower overhead. So basically try to get the best of both attemps.

I just tried to implement this; the performance numbers are below
(each test was run 5 times: perf bench sched messaging -g 5 -t -l 100000).

		slab-for-next	slab-pernode-counters
Total time:	7.495		7.5
		7.51		7.532
		7.54		7.514
		7.508		7.472
		7.42		7.527

The problem is that the counters include the inuse objects of all slabs,
so they can't distinguish how many free objects are on the node partial
list from how many are on the cpu partial lists.

But the current code only iterates over the slabs on the node partial list,
so this implementation is inconsistent with the current behavior.

IMHO, we can't easily record inuse/free objects of only the slabs on the
node partial list, since in __slab_free() we can't know whether the freed
object belongs to a slab on a cpu partial list or on the node partial list.

Anyway, I put the code below for discussion...

---
 mm/slub.c | 68 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 15 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 20641ad82508..284b751b3b64 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -42,6 +42,7 @@
 #include <kunit/test.h>
 #include <kunit/test-bug.h>
 #include <linux/sort.h>
+#include <linux/percpu_counter.h>

 #include <linux/debugfs.h>
 #include <trace/events/kmem.h>
@@ -431,6 +432,7 @@ struct kmem_cache_node {
 	atomic_long_t total_objects;
 	struct list_head full;
 #endif
+	struct percpu_counter inuse_objects;
 };

 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
@@ -1873,6 +1875,20 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 #endif
 #endif /* CONFIG_SLUB_DEBUG */

+static inline void add_inuse_objects(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	percpu_counter_add_local(&n->inuse_objects, objects);
+}
+
+static inline void sub_inuse_objects(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	percpu_counter_sub_local(&n->inuse_objects, objects);
+}
+
 static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
 {
 	return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -2526,6 +2542,8 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
 		add_full(s, n, slab);
 	}

+	add_inuse_objects(s, slab_nid(slab), 1);
+
 	return object;
 }

@@ -2563,6 +2581,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s,
 		add_partial(n, slab, DEACTIVATE_TO_HEAD);

 	inc_slabs_node(s, nid, slab->objects);
+	add_inuse_objects(s, nid, 1);
 	spin_unlock_irqrestore(&n->list_lock, flags);

 	return object;
@@ -2862,6 +2881,8 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
 		new.freelist, new.counters,
 		"unfreezing slab"));

+	sub_inuse_objects(s, slab_nid(slab), free_delta);
+
 	/*
 	 * Stage three: Manipulate the slab list based on the updated state.
 	 */
@@ -3313,26 +3334,27 @@ __update_cpu_freelist_fast(struct kmem_cache *s,
 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 {
 	struct slab new;
-	unsigned long counters;
-	void *freelist;
+	struct slab old;

 	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

 	do {
-		freelist = slab->freelist;
-		counters = slab->counters;
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;

-		new.counters = counters;
+		new.counters = old.counters;

 		new.inuse = slab->objects;
-		new.frozen = freelist != NULL;
+		new.frozen = old.freelist != NULL;

 	} while (!__slab_update_freelist(s, slab,
-		freelist, counters,
+		old.freelist, old.counters,
 		NULL, new.counters,
 		"get_freelist"));

-	return freelist;
+	add_inuse_objects(s, slab_nid(slab), old.objects - old.inuse);
+
+	return old.freelist;
 }

 /*
@@ -3341,25 +3363,26 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
 {
 	struct slab new;
-	unsigned long counters;
-	void *freelist;
+	struct slab old;

 	do {
-		freelist = slab->freelist;
-		counters = slab->counters;
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;

-		new.counters = counters;
+		new.counters = old.counters;
 		VM_BUG_ON(new.frozen);

 		new.inuse = slab->objects;
 		new.frozen = 1;

 	} while (!slab_update_freelist(s, slab,
-		freelist, counters,
+		old.freelist, old.counters,
 		NULL, new.counters,
 		"freeze_slab"));

-	return freelist;
+	add_inuse_objects(s, slab_nid(slab), old.objects - old.inuse);
+
+	return old.freelist;
 }

 /*
@@ -3567,6 +3590,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	slab->frozen = 1;

 	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	add_inuse_objects(s, slab_nid(slab), slab->objects);

 	if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
 		/*
@@ -4108,6 +4132,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,

 	stat(s, FREE_SLOWPATH);

+	sub_inuse_objects(s, slab_nid(slab), cnt);
+
 	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
 		free_to_partial_list(s, slab, head, tail, cnt, addr);
 		return;
@@ -4866,6 +4892,7 @@ static void early_kmem_cache_node_alloc(int node)
 	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
 	slab->freelist = get_freepointer(kmem_cache_node, n);
 	slab->inuse = 1;
+	BUG_ON(percpu_counter_init(&n->inuse_objects, 1, GFP_KERNEL));
 	kmem_cache_node->node[node] = n;
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, slab->objects);
@@ -4884,6 +4911,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)

 	for_each_kmem_cache_node(s, node, n) {
 		s->node[node] = NULL;
+		percpu_counter_destroy(&n->inuse_objects);
 		kmem_cache_free(kmem_cache_node, n);
 	}
 }
@@ -4916,6 +4944,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
 			return 0;
 		}

+		if (percpu_counter_init(&n->inuse_objects, 0, GFP_KERNEL)) {
+			free_kmem_cache_nodes(s);
+			return 0;
+		}
+
 		init_kmem_cache_node(n);
 		s->node[node] = n;
 	}
@@ -5541,6 +5574,11 @@ static int slab_mem_going_online_callback(void *arg)
 			ret = -ENOMEM;
 			goto out;
 		}
+		ret = percpu_counter_init(&n->inuse_objects, 0, GFP_KERNEL);
+		if (ret) {
+			kmem_cache_free(kmem_cache_node, n);
+			goto out;
+		}
 		init_kmem_cache_node(n);
 		s->node[nid] = n;
 	}
--
2.40.1
  
Chengming Zhou Feb. 23, 2024, 3:36 a.m. UTC | #7
On 2024/2/23 11:02, Christoph Lameter (Ampere) wrote:
> On Thu, 22 Feb 2024, Chengming Zhou wrote:
> 
>> Anyway, I put the code below for discussion...
> 
> Can we guestimate the free objects based on the number of partial slabs. That number is available.
> 

Yeah, the number of partial slabs is easy to know, but I can't think of a way to
estimate the free objects, since __slab_free() is just a double cmpxchg in most cases.

> How accurate need the accounting be? We also have fuzzy accounting in the VM counters.

Maybe it doesn't need to be very accurate; some delay/fuzziness should be acceptable.

Another direction I'm thinking of is that we no longer distinguish between slabs
on the cpu partial lists and slabs on the node partial list (different from the
current behavior).

Now we have three scopes:
1. SL_ALL: include all slabs
2. SL_PARTIAL: only include partial slabs on the node
3. SL_CPU: only include partial slabs on the cpu and the cpu slab in use

If we change SL_PARTIAL to mean all partial slabs, it may be simpler.

Thanks.
  
Chengming Zhou Feb. 23, 2024, 5 a.m. UTC | #8
On 2024/2/23 11:50, Christoph Lameter (Ampere) wrote:
> On Fri, 23 Feb 2024, Chengming Zhou wrote:
> 
>>> Can we guestimate the free objects based on the number of partial slabs. That number is available.
>>
>> Yeah, the number of partial slabs is easy to know, but I can't think of a way to
>> estimate the free objects, since __slab_free() is just double cmpxchg in most cases.
> 
> Well a starting point may be half the objects possible in a slab page?

Yeah, also a choice.
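
Something like this, perhaps (hypothetical helper, assuming oo_objects(s->oo)
gives the objects per slab as in mm/slub.c; not from any posted patch):

```
/* Rough guess: treat each slab on the node partial list as half free. */
static unsigned long guess_partial_free(struct kmem_cache *s,
					struct kmem_cache_node *n)
{
	return n->nr_partial * oo_objects(s->oo) / 2;
}
```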

> 
> 
>>> How accurate need the accounting be? We also have fuzzy accounting in the VM counters.
>>
>> Maybe not need to be very accurate, some delay/fuzzy should be acceptable.
>>
>> Another direction I think is that we don't distinguish slabs on cpu partial list or
>> slabs on node partial list anymore (different with current behavior).
>>
>> Now we have three scopes:
>> 1. SL_ALL: include all slabs
>> 2. SL_PARTIAL: only include partial slabs on node
>> 3. SL_CPU: only include partail slabs on cpu and the using cpu slab
>>
>> If we change SL_PARTIAL to mean all partial slabs, it maybe simpler.
> 
> Thats not going to work since you would have to scan multiple lists instead of a single list.

We have to use percpu counters if we go this way.

> 
> Another approach may be to come up with some way to scan the partial lists without taking locks. That actually would improve the performance of the allocator. It may work with a single linked lists and RCU.
> 

I think this is a better direction! We can use an RCU list if the slab can be freed by RCU.
  
Jianfeng Wang Feb. 23, 2024, 7:36 a.m. UTC | #9
On 2/22/24 7:02 PM, Christoph Lameter (Ampere) wrote:
> On Thu, 22 Feb 2024, Chengming Zhou wrote:
> 
>> Anyway, I put the code below for discussion...
> 
> Can we guestimate the free objects based on the number of partial slabs. That number is available.
> 

Yes.
I've thought about calculating the average number of free objects in a
partial slab (through sampling) and then estimating the total number of
free objects as (avg * n->nr_partial).

See the following.

---
 mm/slub.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 63d281dfacdb..13385761049c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2963,6 +2963,8 @@ static inline bool free_debug_processing(struct kmem_cache *s,
 #endif /* CONFIG_SLUB_DEBUG */
 
 #if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
+#define MAX_PARTIAL_TO_SCAN 10000
+
 static unsigned long count_partial(struct kmem_cache_node *n,
 					int (*get_count)(struct slab *))
 {
@@ -2971,8 +2973,22 @@ static unsigned long count_partial(struct kmem_cache_node *n,
 	struct slab *slab;
 
 	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(slab, &n->partial, slab_list)
-		x += get_count(slab);
+	if (n->nr_partial > MAX_PARTIAL_TO_SCAN) {
+		/* Estimate total count of objects via sampling */
+		unsigned long sample_rate = n->nr_partial / MAX_PARTIAL_TO_SCAN;
+		unsigned long scanned = 0;
+		unsigned long counted = 0;
+		list_for_each_entry(slab, &n->partial, slab_list) {
+			if (++scanned % sample_rate == 0) {
+				x += get_count(slab);
+				counted++;
+			}
+		}
+		x = mult_frac(x, n->nr_partial, counted);
+	} else {
+		list_for_each_entry(slab, &n->partial, slab_list)
+			x += get_count(slab);
+	}
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	return x;
 }
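
For scale, take the buffer_head case above: with n->nr_partial = 1,103,484 and
MAX_PARTIAL_TO_SCAN = 10000, sample_rate is 110, so the loop still walks every
list entry under the lock but only reads the object counts of every 110th slab
(counted = 10,031); the sampled sum is then scaled by nr_partial / counted,
i.e. by roughly 110, to produce the estimate.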
  
Chengming Zhou Feb. 23, 2024, 9:37 a.m. UTC | #10
On 2024/2/23 17:24, Vlastimil Babka wrote:
> On 2/23/24 06:00, Chengming Zhou wrote:
>> On 2024/2/23 11:50, Christoph Lameter (Ampere) wrote:
>>> On Fri, 23 Feb 2024, Chengming Zhou wrote:
>>>
>>>>> Can we guestimate the free objects based on the number of partial slabs. That number is available.
>>>>
>>>> Yeah, the number of partial slabs is easy to know, but I can't think of a way to
>>>> estimate the free objects, since __slab_free() is just double cmpxchg in most cases.
>>>
>>> Well a starting point may be half the objects possible in a slab page?
>>
>> Yeah, also a choice.
>>
>>>
>>>
>>>>> How accurate need the accounting be? We also have fuzzy accounting in the VM counters.
>>>>
>>>> Maybe not need to be very accurate, some delay/fuzzy should be acceptable.
>>>>
>>>> Another direction I think is that we don't distinguish slabs on cpu partial list or
>>>> slabs on node partial list anymore (different with current behavior).
>>>>
>>>> Now we have three scopes:
>>>> 1. SL_ALL: include all slabs
>>>> 2. SL_PARTIAL: only include partial slabs on node
>>>> 3. SL_CPU: only include partail slabs on cpu and the using cpu slab
>>>>
>>>> If we change SL_PARTIAL to mean all partial slabs, it maybe simpler.
>>>
>>> Thats not going to work since you would have to scan multiple lists instead of a single list.
>>
>> We have to use percpu counters if we go this way.
>>
>>>
>>> Another approach may be to come up with some way to scan the partial lists without taking locks. That actually would improve the performance of the allocator. It may work with a single linked lists and RCU.
> 
> We often remove a slab from the middle of a partial list due to object
> freeing, and this means it has to be double linked, no?

Right, double linked list.

> 
>>>
>>
>> I think this is a better direction! We can use RCU list if slab can be freed by RCU.
> 
> Often we remove slab from the partial list for other purposes than freeing -
> i.e. to become a cpu (partial) slab, and that can't be handled by a rcu
> callback nor can we wait a grace period in such situations.

IMHO, only free_slab() needs to use call_rcu() to delay freeing the slab;
other paths, like taking partial slabs from the node partial list, don't
need to wait for an RCU grace period.

All we want is to iterate over the node partial list locklessly but safely, right?

Thanks.
  
Chengming Zhou Feb. 23, 2024, 9:46 a.m. UTC | #11
On 2024/2/23 17:37, Chengming Zhou wrote:
> On 2024/2/23 17:24, Vlastimil Babka wrote:
>> On 2/23/24 06:00, Chengming Zhou wrote:
>>> On 2024/2/23 11:50, Christoph Lameter (Ampere) wrote:
>>>> On Fri, 23 Feb 2024, Chengming Zhou wrote:
>>>>
>>>>>> Can we guestimate the free objects based on the number of partial slabs. That number is available.
>>>>>
>>>>> Yeah, the number of partial slabs is easy to know, but I can't think of a way to
>>>>> estimate the free objects, since __slab_free() is just double cmpxchg in most cases.
>>>>
>>>> Well a starting point may be half the objects possible in a slab page?
>>>
>>> Yeah, also a choice.
>>>
>>>>
>>>>
>>>>>> How accurate need the accounting be? We also have fuzzy accounting in the VM counters.
>>>>>
>>>>> Maybe not need to be very accurate, some delay/fuzzy should be acceptable.
>>>>>
>>>>> Another direction I think is that we don't distinguish slabs on cpu partial list or
>>>>> slabs on node partial list anymore (different with current behavior).
>>>>>
>>>>> Now we have three scopes:
>>>>> 1. SL_ALL: include all slabs
>>>>> 2. SL_PARTIAL: only include partial slabs on node
>>>>> 3. SL_CPU: only include partail slabs on cpu and the using cpu slab
>>>>>
>>>>> If we change SL_PARTIAL to mean all partial slabs, it maybe simpler.
>>>>
>>>> Thats not going to work since you would have to scan multiple lists instead of a single list.
>>>
>>> We have to use percpu counters if we go this way.
>>>
>>>>
>>>> Another approach may be to come up with some way to scan the partial lists without taking locks. That actually would improve the performance of the allocator. It may work with a single linked lists and RCU.
>>
>> We often remove a slab from the middle of a partial list due to object
>> freeing, and this means it has to be double linked, no?
> 
> Right, double linked list.
> 
>>
>>>>
>>>
>>> I think this is a better direction! We can use RCU list if slab can be freed by RCU.
>>
>> Often we remove slab from the partial list for other purposes than freeing -
>> i.e. to become a cpu (partial) slab, and that can't be handled by a rcu
>> callback nor can we wait a grace period in such situations.
> 
> IMHO, only free_slab() need to use call_rcu() to delay free the slab,
> other paths like taking partial slabs from node partial list don't need
> to wait for RCU grace period.
> 
> All we want is safely lockless iterate over the node partial list, right?
Ah, I'm wrong: these paths also need to wait for an RCU grace period...
  
Chengming Zhou Feb. 27, 2024, 9:30 a.m. UTC | #12
On 2024/2/27 01:38, Christoph Lameter (Ampere) wrote:
> On Fri, 23 Feb 2024, Vlastimil Babka wrote:
> 
>> On 2/23/24 10:37, Chengming Zhou wrote:
>>> On 2024/2/23 17:24, Vlastimil Babka wrote:
>>>>
>>>>>>
>>>>>
>>>>> I think this is a better direction! We can use RCU list if slab can be freed by RCU.
>>>>
>>>> Often we remove slab from the partial list for other purposes than freeing -
>>>> i.e. to become a cpu (partial) slab, and that can't be handled by a rcu
>>>> callback nor can we wait a grace period in such situations.
>>>
>>> IMHO, only free_slab() need to use call_rcu() to delay free the slab,
>>> other paths like taking partial slabs from node partial list don't need
>>> to wait for RCU grace period.
>>>
>>> All we want is safely lockless iterate over the node partial list, right?
>>
>> Yes, and for that there's the "list_head slab_list", which is in union with
>> "struct slab *next" and "int slabs" for the cpu partial list. So if we
>> remove a slab from the partial list and rewrite the list_head for the
>> partial list purposes, it will break the lockless iterators, right? We would
>> have to wait a grace period between unlinking the slab from partial list (so
>> no new iterators can reach it), and reusing the list_head (so we are sure
>> the existing iterators stopped looking at our slab).
> 
> We could mark the state change (list ownership) in the slab metadata and then abort the scan if the state mismatches the list.

It seems feasible, maybe something like the below?

But this way needs all kmem_caches to have SLAB_TYPESAFE_BY_RCU, right?
Not sure if that is acceptable, since it may cause random delays in freeing memory.

```
retry:
	rcu_read_lock();

	h = rcu_dereference(list_next_rcu(&n->partial));

	while (h != &n->partial) {
		slab = list_entry(h, struct slab, slab_list);

		/* Recheck slab with node list lock. */
		spin_lock_irqsave(&n->list_lock, flags);
		if (!slab_test_node_partial(slab)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			rcu_read_unlock();
			goto retry;
		}

		/* Count this slab's inuse. */

		/* Get the next pointer with node list lock. */
		h = rcu_dereference(list_next_rcu(h));
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	rcu_read_unlock();
```

> 
>> Maybe there's more advanced rcu tricks but this is my basic understanding
>> how this works.
> 
> This could get tricky but we already do similar things with RCU slabs objects/metadata where we allow the resuse of the object before the RCU period expires and there is an understanding that the user of those objects need to verify the type of object matching expectations when looking for objects.
>
  
Chengming Zhou Feb. 28, 2024, 9:51 a.m. UTC | #13
On 2024/2/28 06:55, Christoph Lameter (Ampere) wrote:
> On Tue, 27 Feb 2024, Chengming Zhou wrote:
> 
>>> We could mark the state change (list ownership) in the slab metadata and then abort the scan if the state mismatches the list.
>>
>> It seems feasible, maybe something like below?
>>
>> But this way needs all kmem_caches have SLAB_TYPESAFE_BY_RCU, right?
> 
> No.
> 
> If a slab is freed to the page allocator and the fields are reused in a different way then we would have to wait till the end of the RCU period. This could be done with a deferred free. Otherwise we have the type checking to ensure that nothing untoward happens in the RCU period.
> 
> The usually shuffle of the pages between freelists/cpulists/cpuslab and fully used slabs would not require that.

IIUC, your method doesn't need the slab struct (page) to be delay-freed by RCU.

So that page struct may be reused for anything by the buddy allocator, or even freed, right?

Not sure the RCU read lock protection is enough here; do we need to hold another
lock, like the memory hotplug lock?

> 
>> Not sure if this is acceptable? Which may cause random delay of memory free.
>>
>> ```
>> retry:
>>     rcu_read_lock();
>>
>>     h = rcu_dereference(list_next_rcu(&n->partial));
>>
>>     while (h != &n->partial) {
> 
> Hmm... a linked list that forms a circle? Linked lists usually terminate in a NULL pointer.

I think the node partial list should be a doubly linked list, since we need to
add slabs to its head or tail.

> 
> So this would be
> 
> 
> redo:
> 
>     <zap counters>
>     rcu_read_lock();
>     h = <first>;
> 
>     while (h && h->type == <our type>) {
>           <count h somethings>
> 
>           /* Maybe check h->type again */
>           if (h->type != <our_type>)
>             break;

Another problem with this lockless recheck is that we may get a badly wrong value:
say a slab is removed from the node list, then added back to our list in another position,
so it passes our recheck conditions here. That may make our counting very inaccurate.

Thanks!

> 
>           h = <next>;
>     }
> 
>     rcu_read_unlock();
> 
> 
>     if (!h) /* Type of list changed under us */
>         goto redo;
> 
> 
> The check for type == <our_type> is racy. Maybe we can ignore that or we could do something additional.
> 
> Using RCU does not make sense if you add locking in the inner loop. Then it gets too complicated and causes delay. This must be a simple fast lockless loop in order to do what we need.
> 
> Presumably the type and list pointers are in the same cacheline and thus could made to be updated in a coherent way if properly sequenced with fences etc.
  

Patch

diff --git a/mm/slab.h b/mm/slab.h
index 54deeb0428c6..a0e7672ba648 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -11,6 +11,7 @@ 
 #include <linux/memcontrol.h>
 #include <linux/kfence.h>
 #include <linux/kasan.h>
+#include <linux/percpu_counter.h>
 
 /*
  * Internal slab definitions
@@ -277,6 +278,9 @@  struct kmem_cache {
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;		/* Name (only for display!) */
 	struct list_head list;		/* List of slab caches */
+#ifdef CONFIG_SLUB_DEBUG
+	struct percpu_counter free_objects;
+#endif
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;		/* For sysfs */
 #endif
diff --git a/mm/slub.c b/mm/slub.c
index 2ef88bbf56a3..44f8ded96574 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -736,6 +736,12 @@  static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
 static DEFINE_SPINLOCK(object_map_lock);
 
+static inline void
+__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta)
+{
+	percpu_counter_add_batch(&s->free_objects, delta, INT_MAX);
+}
+
 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
 		       struct slab *slab)
 {
@@ -1829,6 +1835,9 @@  slab_flags_t kmem_cache_flags(unsigned int object_size,
 	return flags | slub_debug_local;
 }
 #else /* !CONFIG_SLUB_DEBUG */
+static inline void
+__update_kmem_cache_free_objs(struct kmem_cache *s, s64 delta) {}
+
 static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
 static inline
 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
@@ -2369,6 +2378,7 @@  static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	slab->inuse = 0;
 	slab->frozen = 0;
 
+	__update_kmem_cache_free_objs(s, slab->objects);
 	account_slab(slab, oo_order(oo), s, flags);
 
 	slab->slab_cache = s;
@@ -2445,6 +2455,7 @@  static void free_slab(struct kmem_cache *s, struct slab *slab)
 		call_rcu(&slab->rcu_head, rcu_free_slab);
 	else
 		__free_slab(s, slab);
+	__update_kmem_cache_free_objs(s, -slab->objects);
 }
 
 static void discard_slab(struct kmem_cache *s, struct slab *slab)
@@ -3859,6 +3870,8 @@  static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
 	 */
 	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
 
+	if (object)
+		__update_kmem_cache_free_objs(s, -1);
 	return object;
 }
 
@@ -4235,6 +4248,7 @@  static __always_inline void do_slab_free(struct kmem_cache *s,
 	unsigned long tid;
 	void **freelist;
 
+	__update_kmem_cache_free_objs(s, cnt);
 redo:
 	/*
 	 * Determine the currently cpus per cpu slab.
@@ -4286,6 +4300,7 @@  static void do_slab_free(struct kmem_cache *s,
 				struct slab *slab, void *head, void *tail,
 				int cnt, unsigned long addr)
 {
+	__update_kmem_cache_free_objs(s, cnt);
 	__slab_free(s, slab, head, tail, cnt, addr);
 }
 #endif /* CONFIG_SLUB_TINY */
@@ -4658,6 +4673,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 		memcg_slab_alloc_error_hook(s, size, objcg);
 	}
 
+	__update_kmem_cache_free_objs(s, -i);
 	return i;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -4899,6 +4915,9 @@  void __kmem_cache_release(struct kmem_cache *s)
 	cache_random_seq_destroy(s);
 #ifndef CONFIG_SLUB_TINY
 	free_percpu(s->cpu_slab);
+#endif
+#ifdef CONFIG_SLUB_DEBUG
+	percpu_counter_destroy(&s->free_objects);
 #endif
 	free_kmem_cache_nodes(s);
 }
@@ -5109,6 +5128,14 @@  static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
 	s->random = get_random_long();
 #endif
 
+#ifdef CONFIG_SLUB_DEBUG
+	int ret;
+
+	ret = percpu_counter_init(&s->free_objects, 0, GFP_KERNEL);
+	if (ret)
+		return ret;
+#endif
+
 	if (!calculate_sizes(s))
 		goto error;
 	if (disable_higher_order_debug) {
@@ -7100,15 +7127,15 @@  void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 {
 	unsigned long nr_slabs = 0;
 	unsigned long nr_objs = 0;
-	unsigned long nr_free = 0;
+	unsigned long nr_free;
 	int node;
 	struct kmem_cache_node *n;
 
 	for_each_kmem_cache_node(s, node, n) {
 		nr_slabs += node_nr_slabs(n);
 		nr_objs += node_nr_objs(n);
-		nr_free += count_partial(n, count_free);
 	}
+	nr_free = percpu_counter_sum_positive(&s->free_objects);
 
 	sinfo->active_objs = nr_objs - nr_free;
 	sinfo->num_objs = nr_objs;