[v3] mm/slab: Annotate kmem_cache_node->list_lock as raw

Message ID nycvar.YFH.7.76.2210212117090.29912@cbobk.fhfr.pm
State New
Headers
Series [v3] mm/slab: Annotate kmem_cache_node->list_lock as raw |

Commit Message

Jiri Kosina Oct. 21, 2022, 7:18 p.m. UTC
  From: Jiri Kosina <jkosina@suse.cz>

The list_lock can be taken in hardirq context when do_drain() is being 
called via IPI on all cores, and therefore lockdep complains about it, 
because it can't be preempted on PREEMPT_RT.

That's not a real issue, as SLAB can't be built on PREEMPT_RT anyway, but 
we still want to get rid of the warning on non-PREEMPT_RT builds.

Annotate it therefore as a raw lock in order to get rid of he lockdep 
warning below.

	 =============================
	 [ BUG: Invalid wait context ]
	 6.1.0-rc1-00134-ge35184f32151 #4 Not tainted
	 -----------------------------
	 swapper/3/0 is trying to lock:
	 ffff8bc88086dc18 (&parent->list_lock){..-.}-{3:3}, at: do_drain+0x57/0xb0
	 other info that might help us debug this:
	 context-{2:2}
	 no locks held by swapper/3/0.
	 stack backtrace:
	 CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.1.0-rc1-00134-ge35184f32151 #4
	 Hardware name: LENOVO 20K5S22R00/20K5S22R00, BIOS R0IET38W (1.16 ) 05/31/2017
	 Call Trace:
	  <IRQ>
	  dump_stack_lvl+0x6b/0x9d
	  __lock_acquire+0x1519/0x1730
	  ? build_sched_domains+0x4bd/0x1590
	  ? __lock_acquire+0xad2/0x1730
	  lock_acquire+0x294/0x340
	  ? do_drain+0x57/0xb0
	  ? sched_clock_tick+0x41/0x60
	  _raw_spin_lock+0x2c/0x40
	  ? do_drain+0x57/0xb0
	  do_drain+0x57/0xb0
	  __flush_smp_call_function_queue+0x138/0x220
	  __sysvec_call_function+0x4f/0x210
	  sysvec_call_function+0x4b/0x90
	  </IRQ>
	  <TASK>
	  asm_sysvec_call_function+0x16/0x20
	 RIP: 0010:mwait_idle+0x5e/0x80
	 Code: 31 d2 65 48 8b 04 25 80 ed 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 14 66 90 0f 00 2d 0b 78 46 00 31 c0 48 89 c1 fb 0f 01 c9 <eb> 06 fb 0f 1f 44 00 00 65 48 8b 04 25 80 ed 01 00 f0 80 60 02 df
	 RSP: 0000:ffffa90940217ee0 EFLAGS: 00000246
	 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
	 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9bb9f93a
	 RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000001
	 R10: ffffa90940217ea8 R11: 0000000000000000 R12: ffffffffffffffff
	 R13: 0000000000000000 R14: ffff8bc88127c500 R15: 0000000000000000
	  ? default_idle_call+0x1a/0xa0
	  default_idle_call+0x4b/0xa0
	  do_idle+0x1f1/0x2c0
	  ? _raw_spin_unlock_irqrestore+0x56/0x70
	  cpu_startup_entry+0x19/0x20
	  start_secondary+0x122/0x150
	  secondary_startup_64_no_verify+0xce/0xdb
	  </TASK>

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---

v1->v2: fix !SLAB build failures due to list_lock mismatch
v2->v3: really fix it by sending refreshed version of the patch (facepalm)

 mm/slab.c | 90 +++++++++++++++++++++++++++----------------------------
 mm/slab.h |  4 +++
 2 files changed, 49 insertions(+), 45 deletions(-)
  

Comments

Hyeonggon Yoo Oct. 22, 2022, 5:04 a.m. UTC | #1
On Fri, Oct 21, 2022 at 09:18:12PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina <jkosina@suse.cz>
> 
> The list_lock can be taken in hardirq context when do_drain() is being 
> called via IPI on all cores, and therefore lockdep complains about it, 
> because it can't be preempted on PREEMPT_RT.
>
> That's not a real issue, as SLAB can't be built on PREEMPT_RT anyway, but 
> we still want to get rid of the warning on non-PREEMPT_RT builds.
> 
> Annotate it therefore as a raw lock in order to get rid of he lockdep 
> warning below.
> 
> 	 =============================
> 	 [ BUG: Invalid wait context ]
> 	 6.1.0-rc1-00134-ge35184f32151 #4 Not tainted
> 	 -----------------------------
> 	 swapper/3/0 is trying to lock:
> 	 ffff8bc88086dc18 (&parent->list_lock){..-.}-{3:3}, at: do_drain+0x57/0xb0
> 	 other info that might help us debug this:
> 	 context-{2:2}
> 	 no locks held by swapper/3/0.
> 	 stack backtrace:
> 	 CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.1.0-rc1-00134-ge35184f32151 #4
> 	 Hardware name: LENOVO 20K5S22R00/20K5S22R00, BIOS R0IET38W (1.16 ) 05/31/2017
> 	 Call Trace:
> 	  <IRQ>
> 	  dump_stack_lvl+0x6b/0x9d
> 	  __lock_acquire+0x1519/0x1730
> 	  ? build_sched_domains+0x4bd/0x1590
> 	  ? __lock_acquire+0xad2/0x1730
> 	  lock_acquire+0x294/0x340
> 	  ? do_drain+0x57/0xb0
> 	  ? sched_clock_tick+0x41/0x60
> 	  _raw_spin_lock+0x2c/0x40
> 	  ? do_drain+0x57/0xb0
> 	  do_drain+0x57/0xb0
> 	  __flush_smp_call_function_queue+0x138/0x220
> 	  __sysvec_call_function+0x4f/0x210
> 	  sysvec_call_function+0x4b/0x90
> 	  </IRQ>
> 	  <TASK>
> 	  asm_sysvec_call_function+0x16/0x20
> 	 RIP: 0010:mwait_idle+0x5e/0x80
> 	 Code: 31 d2 65 48 8b 04 25 80 ed 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 14 66 90 0f 00 2d 0b 78 46 00 31 c0 48 89 c1 fb 0f 01 c9 <eb> 06 fb 0f 1f 44 00 00 65 48 8b 04 25 80 ed 01 00 f0 80 60 02 df
> 	 RSP: 0000:ffffa90940217ee0 EFLAGS: 00000246
> 	 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> 	 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9bb9f93a
> 	 RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000001
> 	 R10: ffffa90940217ea8 R11: 0000000000000000 R12: ffffffffffffffff
> 	 R13: 0000000000000000 R14: ffff8bc88127c500 R15: 0000000000000000
> 	  ? default_idle_call+0x1a/0xa0
> 	  default_idle_call+0x4b/0xa0
> 	  do_idle+0x1f1/0x2c0
> 	  ? _raw_spin_unlock_irqrestore+0x56/0x70
> 	  cpu_startup_entry+0x19/0x20
> 	  start_secondary+0x122/0x150
> 	  secondary_startup_64_no_verify+0xce/0xdb
> 	  </TASK>
>

Looks good to me.
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>

> Signed-off-by: Jiri Kosina <jkosina@suse.cz>
> ---
> 
> v1->v2: fix !SLAB build failures due to list_lock mismatch
> v2->v3: really fix it by sending refreshed version of the patch (facepalm)
> 
>  mm/slab.c | 90 +++++++++++++++++++++++++++----------------------------
>  mm/slab.h |  4 +++
>  2 files changed, 49 insertions(+), 45 deletions(-)
> 
> diff --git a/mm/slab.c b/mm/slab.c
> index 59c8e28f7b6a..d8a287900193 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
>  	parent->shared = NULL;
>  	parent->alien = NULL;
>  	parent->colour_next = 0;
> -	spin_lock_init(&parent->list_lock);
> +	raw_spin_lock_init(&parent->list_lock);
>  	parent->free_objects = 0;
>  	parent->free_touched = 0;
>  }
> @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
>  	slab_node = slab_nid(slab);
>  	n = get_node(cachep, slab_node);
>  
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	free_block(cachep, &objp, 1, slab_node, &list);
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  
>  	slabs_destroy(cachep, &list);
>  }
> @@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
>  	struct kmem_cache_node *n = get_node(cachep, node);
>  
>  	if (ac->avail) {
> -		spin_lock(&n->list_lock);
> +		raw_spin_lock(&n->list_lock);
>  		/*
>  		 * Stuff objects into the remote nodes shared array first.
>  		 * That way we could avoid the overhead of putting the objects
> @@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
>  
>  		free_block(cachep, ac->entry, ac->avail, node, list);
>  		ac->avail = 0;
> -		spin_unlock(&n->list_lock);
> +		raw_spin_unlock(&n->list_lock);
>  	}
>  }
>  
> @@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
>  		slabs_destroy(cachep, &list);
>  	} else {
>  		n = get_node(cachep, slab_node);
> -		spin_lock(&n->list_lock);
> +		raw_spin_lock(&n->list_lock);
>  		free_block(cachep, &objp, 1, slab_node, &list);
> -		spin_unlock(&n->list_lock);
> +		raw_spin_unlock(&n->list_lock);
>  		slabs_destroy(cachep, &list);
>  	}
>  	return 1;
> @@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
>  	 */
>  	n = get_node(cachep, node);
>  	if (n) {
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
>  				cachep->num;
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  
>  		return 0;
>  	}
> @@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
>  		goto fail;
>  
>  	n = get_node(cachep, node);
> -	spin_lock_irq(&n->list_lock);
> +	raw_spin_lock_irq(&n->list_lock);
>  	if (n->shared && force_change) {
>  		free_block(cachep, n->shared->entry,
>  				n->shared->avail, node, &list);
> @@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
>  		new_alien = NULL;
>  	}
>  
> -	spin_unlock_irq(&n->list_lock);
> +	raw_spin_unlock_irq(&n->list_lock);
>  	slabs_destroy(cachep, &list);
>  
>  	/*
> @@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu)
>  		if (!n)
>  			continue;
>  
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  
>  		/* Free limit for this kmem_cache_node */
>  		n->free_limit -= cachep->batchcount;
> @@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu)
>  		nc->avail = 0;
>  
>  		if (!cpumask_empty(mask)) {
> -			spin_unlock_irq(&n->list_lock);
> +			raw_spin_unlock_irq(&n->list_lock);
>  			goto free_slab;
>  		}
>  
> @@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu)
>  		alien = n->alien;
>  		n->alien = NULL;
>  
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  
>  		kfree(shared);
>  		if (alien) {
> @@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
>  	/*
>  	 * Do not assume that spinlocks can be initialized via memcpy:
>  	 */
> -	spin_lock_init(&ptr->list_lock);
> +	raw_spin_lock_init(&ptr->list_lock);
>  
>  	MAKE_ALL_LISTS(cachep, ptr, nodeid);
>  	cachep->node[nodeid] = ptr;
> @@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
>  	for_each_kmem_cache_node(cachep, node, n) {
>  		unsigned long total_slabs, free_slabs, free_objs;
>  
> -		spin_lock_irqsave(&n->list_lock, flags);
> +		raw_spin_lock_irqsave(&n->list_lock, flags);
>  		total_slabs = n->total_slabs;
>  		free_slabs = n->free_slabs;
>  		free_objs = n->free_objects;
> -		spin_unlock_irqrestore(&n->list_lock, flags);
> +		raw_spin_unlock_irqrestore(&n->list_lock, flags);
>  
>  		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
>  			node, total_slabs - free_slabs, total_slabs,
> @@ -2096,7 +2096,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
>  {
>  #ifdef CONFIG_SMP
>  	check_irq_off();
> -	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
> +	assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
>  #endif
>  }
>  
> @@ -2104,7 +2104,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
>  {
>  #ifdef CONFIG_SMP
>  	check_irq_off();
> -	assert_spin_locked(&get_node(cachep, node)->list_lock);
> +	assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
>  #endif
>  }
>  
> @@ -2144,9 +2144,9 @@ static void do_drain(void *arg)
>  	check_irq_off();
>  	ac = cpu_cache_get(cachep);
>  	n = get_node(cachep, node);
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	free_block(cachep, ac->entry, ac->avail, node, &list);
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	ac->avail = 0;
>  	slabs_destroy(cachep, &list);
>  }
> @@ -2164,9 +2164,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
>  			drain_alien_cache(cachep, n->alien);
>  
>  	for_each_kmem_cache_node(cachep, node, n) {
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  		drain_array_locked(cachep, n->shared, node, true, &list);
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  
>  		slabs_destroy(cachep, &list);
>  	}
> @@ -2188,10 +2188,10 @@ static int drain_freelist(struct kmem_cache *cache,
>  	nr_freed = 0;
>  	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
>  
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  		p = n->slabs_free.prev;
>  		if (p == &n->slabs_free) {
> -			spin_unlock_irq(&n->list_lock);
> +			raw_spin_unlock_irq(&n->list_lock);
>  			goto out;
>  		}
>  
> @@ -2204,7 +2204,7 @@ static int drain_freelist(struct kmem_cache *cache,
>  		 * to the cache.
>  		 */
>  		n->free_objects -= cache->num;
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  		slab_destroy(cache, slab);
>  		nr_freed++;
>  	}
> @@ -2629,7 +2629,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
>  	INIT_LIST_HEAD(&slab->slab_list);
>  	n = get_node(cachep, slab_nid(slab));
>  
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	n->total_slabs++;
>  	if (!slab->active) {
>  		list_add_tail(&slab->slab_list, &n->slabs_free);
> @@ -2639,7 +2639,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
>  
>  	STATS_INC_GROWN(cachep);
>  	n->free_objects += cachep->num - slab->active;
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  
>  	fixup_objfreelist_debug(cachep, &list);
>  }
> @@ -2805,7 +2805,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
>  {
>  	struct slab *slab;
>  
> -	assert_spin_locked(&n->list_lock);
> +	assert_raw_spin_locked(&n->list_lock);
>  	slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
>  					slab_list);
>  	if (!slab) {
> @@ -2832,10 +2832,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
>  	if (!gfp_pfmemalloc_allowed(flags))
>  		return NULL;
>  
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	slab = get_first_slab(n, true);
>  	if (!slab) {
> -		spin_unlock(&n->list_lock);
> +		raw_spin_unlock(&n->list_lock);
>  		return NULL;
>  	}
>  
> @@ -2844,7 +2844,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
>  
>  	fixup_slab_list(cachep, n, slab, &list);
>  
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	fixup_objfreelist_debug(cachep, &list);
>  
>  	return obj;
> @@ -2903,7 +2903,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
>  	if (!n->free_objects && (!shared || !shared->avail))
>  		goto direct_grow;
>  
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	shared = READ_ONCE(n->shared);
>  
>  	/* See if we can refill from the shared array */
> @@ -2927,7 +2927,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
>  must_grow:
>  	n->free_objects -= ac->avail;
>  alloc_done:
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	fixup_objfreelist_debug(cachep, &list);
>  
>  direct_grow:
> @@ -3147,7 +3147,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
>  	BUG_ON(!n);
>  
>  	check_irq_off();
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	slab = get_first_slab(n, false);
>  	if (!slab)
>  		goto must_grow;
> @@ -3165,12 +3165,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
>  
>  	fixup_slab_list(cachep, n, slab, &list);
>  
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	fixup_objfreelist_debug(cachep, &list);
>  	return obj;
>  
>  must_grow:
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
>  	if (slab) {
>  		/* This slab isn't counted yet so don't update free_objects */
> @@ -3325,7 +3325,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
>  
>  	check_irq_off();
>  	n = get_node(cachep, node);
> -	spin_lock(&n->list_lock);
> +	raw_spin_lock(&n->list_lock);
>  	if (n->shared) {
>  		struct array_cache *shared_array = n->shared;
>  		int max = shared_array->limit - shared_array->avail;
> @@ -3354,7 +3354,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
>  		STATS_SET_FREEABLE(cachep, i);
>  	}
>  #endif
> -	spin_unlock(&n->list_lock);
> +	raw_spin_unlock(&n->list_lock);
>  	ac->avail -= batchcount;
>  	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
>  	slabs_destroy(cachep, &list);
> @@ -3721,9 +3721,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
>  
>  		node = cpu_to_mem(cpu);
>  		n = get_node(cachep, node);
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  		free_block(cachep, ac->entry, ac->avail, node, &list);
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  		slabs_destroy(cachep, &list);
>  	}
>  	free_percpu(prev);
> @@ -3815,9 +3815,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
>  		return;
>  	}
>  
> -	spin_lock_irq(&n->list_lock);
> +	raw_spin_lock_irq(&n->list_lock);
>  	drain_array_locked(cachep, ac, node, false, &list);
> -	spin_unlock_irq(&n->list_lock);
> +	raw_spin_unlock_irq(&n->list_lock);
>  
>  	slabs_destroy(cachep, &list);
>  }
> @@ -3901,7 +3901,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
>  
>  	for_each_kmem_cache_node(cachep, node, n) {
>  		check_irq_on();
> -		spin_lock_irq(&n->list_lock);
> +		raw_spin_lock_irq(&n->list_lock);
>  
>  		total_slabs += n->total_slabs;
>  		free_slabs += n->free_slabs;
> @@ -3910,7 +3910,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
>  		if (n->shared)
>  			shared_avail += n->shared->avail;
>  
> -		spin_unlock_irq(&n->list_lock);
> +		raw_spin_unlock_irq(&n->list_lock);
>  	}
>  	num_objs = total_slabs * cachep->num;
>  	active_slabs = total_slabs - free_slabs;
> diff --git a/mm/slab.h b/mm/slab.h
> index 0202a8c2f0d2..7a705e4228c8 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -750,7 +750,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
>   * The slab lists for all objects.
>   */
>  struct kmem_cache_node {
> +#ifdef CONFIG_SLAB
> +	raw_spinlock_t list_lock;
> +#else
>  	spinlock_t list_lock;
> +#endif
>  
>  #ifdef CONFIG_SLAB
>  	struct list_head slabs_partial;	/* partial list first, better asm code */
> -- 
> 2.35.3
>
  
Vlastimil Babka Oct. 24, 2022, 1:20 p.m. UTC | #2
On 10/21/22 21:18, Jiri Kosina wrote:
> From: Jiri Kosina <jkosina@suse.cz>
> 
> The list_lock can be taken in hardirq context when do_drain() is being 
> called via IPI on all cores, and therefore lockdep complains about it, 
> because it can't be preempted on PREEMPT_RT.
> 
> That's not a real issue, as SLAB can't be built on PREEMPT_RT anyway, but 
> we still want to get rid of the warning on non-PREEMPT_RT builds.
> 
> Annotate it therefore as a raw lock in order to get rid of he lockdep 
> warning below.
> 
> 	 =============================
> 	 [ BUG: Invalid wait context ]
> 	 6.1.0-rc1-00134-ge35184f32151 #4 Not tainted
> 	 -----------------------------
> 	 swapper/3/0 is trying to lock:
> 	 ffff8bc88086dc18 (&parent->list_lock){..-.}-{3:3}, at: do_drain+0x57/0xb0
> 	 other info that might help us debug this:
> 	 context-{2:2}
> 	 no locks held by swapper/3/0.
> 	 stack backtrace:
> 	 CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.1.0-rc1-00134-ge35184f32151 #4
> 	 Hardware name: LENOVO 20K5S22R00/20K5S22R00, BIOS R0IET38W (1.16 ) 05/31/2017
> 	 Call Trace:
> 	  <IRQ>
> 	  dump_stack_lvl+0x6b/0x9d
> 	  __lock_acquire+0x1519/0x1730
> 	  ? build_sched_domains+0x4bd/0x1590
> 	  ? __lock_acquire+0xad2/0x1730
> 	  lock_acquire+0x294/0x340
> 	  ? do_drain+0x57/0xb0
> 	  ? sched_clock_tick+0x41/0x60
> 	  _raw_spin_lock+0x2c/0x40
> 	  ? do_drain+0x57/0xb0
> 	  do_drain+0x57/0xb0
> 	  __flush_smp_call_function_queue+0x138/0x220
> 	  __sysvec_call_function+0x4f/0x210
> 	  sysvec_call_function+0x4b/0x90
> 	  </IRQ>
> 	  <TASK>
> 	  asm_sysvec_call_function+0x16/0x20
> 	 RIP: 0010:mwait_idle+0x5e/0x80
> 	 Code: 31 d2 65 48 8b 04 25 80 ed 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 14 66 90 0f 00 2d 0b 78 46 00 31 c0 48 89 c1 fb 0f 01 c9 <eb> 06 fb 0f 1f 44 00 00 65 48 8b 04 25 80 ed 01 00 f0 80 60 02 df
> 	 RSP: 0000:ffffa90940217ee0 EFLAGS: 00000246
> 	 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> 	 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9bb9f93a
> 	 RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000001
> 	 R10: ffffa90940217ea8 R11: 0000000000000000 R12: ffffffffffffffff
> 	 R13: 0000000000000000 R14: ffff8bc88127c500 R15: 0000000000000000
> 	  ? default_idle_call+0x1a/0xa0
> 	  default_idle_call+0x4b/0xa0
> 	  do_idle+0x1f1/0x2c0
> 	  ? _raw_spin_unlock_irqrestore+0x56/0x70
> 	  cpu_startup_entry+0x19/0x20
> 	  start_secondary+0x122/0x150
> 	  secondary_startup_64_no_verify+0xce/0xdb
> 	  </TASK>
> 
> Signed-off-by: Jiri Kosina <jkosina@suse.cz>

Thanks, added to slab/for-6.2/locking

...
> diff --git a/mm/slab.h b/mm/slab.h
> index 0202a8c2f0d2..7a705e4228c8 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -750,7 +750,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
>   * The slab lists for all objects.
>   */
>  struct kmem_cache_node {
> +#ifdef CONFIG_SLAB
> +	raw_spinlock_t list_lock;
> +#else
>  	spinlock_t list_lock;
> +#endif
>  
>  #ifdef CONFIG_SLAB
>  	struct list_head slabs_partial;	/* partial list first, better asm code */

Simplified a bit. Since we already have CONFIG_SLAB/CONFIG_SLUB #ifdef
sections, moved the list_lock there.
  

Patch

diff --git a/mm/slab.c b/mm/slab.c
index 59c8e28f7b6a..d8a287900193 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -234,7 +234,7 @@  static void kmem_cache_node_init(struct kmem_cache_node *parent)
 	parent->shared = NULL;
 	parent->alien = NULL;
 	parent->colour_next = 0;
-	spin_lock_init(&parent->list_lock);
+	raw_spin_lock_init(&parent->list_lock);
 	parent->free_objects = 0;
 	parent->free_touched = 0;
 }
@@ -559,9 +559,9 @@  static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
 	slab_node = slab_nid(slab);
 	n = get_node(cachep, slab_node);
 
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	free_block(cachep, &objp, 1, slab_node, &list);
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 
 	slabs_destroy(cachep, &list);
 }
@@ -684,7 +684,7 @@  static void __drain_alien_cache(struct kmem_cache *cachep,
 	struct kmem_cache_node *n = get_node(cachep, node);
 
 	if (ac->avail) {
-		spin_lock(&n->list_lock);
+		raw_spin_lock(&n->list_lock);
 		/*
 		 * Stuff objects into the remote nodes shared array first.
 		 * That way we could avoid the overhead of putting the objects
@@ -695,7 +695,7 @@  static void __drain_alien_cache(struct kmem_cache *cachep,
 
 		free_block(cachep, ac->entry, ac->avail, node, list);
 		ac->avail = 0;
-		spin_unlock(&n->list_lock);
+		raw_spin_unlock(&n->list_lock);
 	}
 }
 
@@ -768,9 +768,9 @@  static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
 		slabs_destroy(cachep, &list);
 	} else {
 		n = get_node(cachep, slab_node);
-		spin_lock(&n->list_lock);
+		raw_spin_lock(&n->list_lock);
 		free_block(cachep, &objp, 1, slab_node, &list);
-		spin_unlock(&n->list_lock);
+		raw_spin_unlock(&n->list_lock);
 		slabs_destroy(cachep, &list);
 	}
 	return 1;
@@ -811,10 +811,10 @@  static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
 	 */
 	n = get_node(cachep, node);
 	if (n) {
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
 				cachep->num;
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 
 		return 0;
 	}
@@ -893,7 +893,7 @@  static int setup_kmem_cache_node(struct kmem_cache *cachep,
 		goto fail;
 
 	n = get_node(cachep, node);
-	spin_lock_irq(&n->list_lock);
+	raw_spin_lock_irq(&n->list_lock);
 	if (n->shared && force_change) {
 		free_block(cachep, n->shared->entry,
 				n->shared->avail, node, &list);
@@ -911,7 +911,7 @@  static int setup_kmem_cache_node(struct kmem_cache *cachep,
 		new_alien = NULL;
 	}
 
-	spin_unlock_irq(&n->list_lock);
+	raw_spin_unlock_irq(&n->list_lock);
 	slabs_destroy(cachep, &list);
 
 	/*
@@ -950,7 +950,7 @@  static void cpuup_canceled(long cpu)
 		if (!n)
 			continue;
 
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 
 		/* Free limit for this kmem_cache_node */
 		n->free_limit -= cachep->batchcount;
@@ -961,7 +961,7 @@  static void cpuup_canceled(long cpu)
 		nc->avail = 0;
 
 		if (!cpumask_empty(mask)) {
-			spin_unlock_irq(&n->list_lock);
+			raw_spin_unlock_irq(&n->list_lock);
 			goto free_slab;
 		}
 
@@ -975,7 +975,7 @@  static void cpuup_canceled(long cpu)
 		alien = n->alien;
 		n->alien = NULL;
 
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 
 		kfree(shared);
 		if (alien) {
@@ -1159,7 +1159,7 @@  static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
 	 */
-	spin_lock_init(&ptr->list_lock);
+	raw_spin_lock_init(&ptr->list_lock);
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->node[nodeid] = ptr;
@@ -1330,11 +1330,11 @@  slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 	for_each_kmem_cache_node(cachep, node, n) {
 		unsigned long total_slabs, free_slabs, free_objs;
 
-		spin_lock_irqsave(&n->list_lock, flags);
+		raw_spin_lock_irqsave(&n->list_lock, flags);
 		total_slabs = n->total_slabs;
 		free_slabs = n->free_slabs;
 		free_objs = n->free_objects;
-		spin_unlock_irqrestore(&n->list_lock, flags);
+		raw_spin_unlock_irqrestore(&n->list_lock, flags);
 
 		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
 			node, total_slabs - free_slabs, total_slabs,
@@ -2096,7 +2096,7 @@  static void check_spinlock_acquired(struct kmem_cache *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+	assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
 #endif
 }
 
@@ -2104,7 +2104,7 @@  static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&get_node(cachep, node)->list_lock);
+	assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
 #endif
 }
 
@@ -2144,9 +2144,9 @@  static void do_drain(void *arg)
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
 	n = get_node(cachep, node);
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	free_block(cachep, ac->entry, ac->avail, node, &list);
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	ac->avail = 0;
 	slabs_destroy(cachep, &list);
 }
@@ -2164,9 +2164,9 @@  static void drain_cpu_caches(struct kmem_cache *cachep)
 			drain_alien_cache(cachep, n->alien);
 
 	for_each_kmem_cache_node(cachep, node, n) {
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 		drain_array_locked(cachep, n->shared, node, true, &list);
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 
 		slabs_destroy(cachep, &list);
 	}
@@ -2188,10 +2188,10 @@  static int drain_freelist(struct kmem_cache *cache,
 	nr_freed = 0;
 	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
 
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 		p = n->slabs_free.prev;
 		if (p == &n->slabs_free) {
-			spin_unlock_irq(&n->list_lock);
+			raw_spin_unlock_irq(&n->list_lock);
 			goto out;
 		}
 
@@ -2204,7 +2204,7 @@  static int drain_freelist(struct kmem_cache *cache,
 		 * to the cache.
 		 */
 		n->free_objects -= cache->num;
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 		slab_destroy(cache, slab);
 		nr_freed++;
 	}
@@ -2629,7 +2629,7 @@  static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
 	INIT_LIST_HEAD(&slab->slab_list);
 	n = get_node(cachep, slab_nid(slab));
 
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	n->total_slabs++;
 	if (!slab->active) {
 		list_add_tail(&slab->slab_list, &n->slabs_free);
@@ -2639,7 +2639,7 @@  static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
 
 	STATS_INC_GROWN(cachep);
 	n->free_objects += cachep->num - slab->active;
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 
 	fixup_objfreelist_debug(cachep, &list);
 }
@@ -2805,7 +2805,7 @@  static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
 {
 	struct slab *slab;
 
-	assert_spin_locked(&n->list_lock);
+	assert_raw_spin_locked(&n->list_lock);
 	slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
 					slab_list);
 	if (!slab) {
@@ -2832,10 +2832,10 @@  static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
 	if (!gfp_pfmemalloc_allowed(flags))
 		return NULL;
 
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	slab = get_first_slab(n, true);
 	if (!slab) {
-		spin_unlock(&n->list_lock);
+		raw_spin_unlock(&n->list_lock);
 		return NULL;
 	}
 
@@ -2844,7 +2844,7 @@  static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
 
 	fixup_slab_list(cachep, n, slab, &list);
 
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	fixup_objfreelist_debug(cachep, &list);
 
 	return obj;
@@ -2903,7 +2903,7 @@  static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 	if (!n->free_objects && (!shared || !shared->avail))
 		goto direct_grow;
 
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	shared = READ_ONCE(n->shared);
 
 	/* See if we can refill from the shared array */
@@ -2927,7 +2927,7 @@  static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 must_grow:
 	n->free_objects -= ac->avail;
 alloc_done:
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	fixup_objfreelist_debug(cachep, &list);
 
 direct_grow:
@@ -3147,7 +3147,7 @@  static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 	BUG_ON(!n);
 
 	check_irq_off();
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	slab = get_first_slab(n, false);
 	if (!slab)
 		goto must_grow;
@@ -3165,12 +3165,12 @@  static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 
 	fixup_slab_list(cachep, n, slab, &list);
 
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	fixup_objfreelist_debug(cachep, &list);
 	return obj;
 
 must_grow:
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
 	if (slab) {
 		/* This slab isn't counted yet so don't update free_objects */
@@ -3325,7 +3325,7 @@  static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 
 	check_irq_off();
 	n = get_node(cachep, node);
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	if (n->shared) {
 		struct array_cache *shared_array = n->shared;
 		int max = shared_array->limit - shared_array->avail;
@@ -3354,7 +3354,7 @@  static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 		STATS_SET_FREEABLE(cachep, i);
 	}
 #endif
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	ac->avail -= batchcount;
 	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
 	slabs_destroy(cachep, &list);
@@ -3721,9 +3721,9 @@  static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 
 		node = cpu_to_mem(cpu);
 		n = get_node(cachep, node);
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 		free_block(cachep, ac->entry, ac->avail, node, &list);
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 		slabs_destroy(cachep, &list);
 	}
 	free_percpu(prev);
@@ -3815,9 +3815,9 @@  static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
 		return;
 	}
 
-	spin_lock_irq(&n->list_lock);
+	raw_spin_lock_irq(&n->list_lock);
 	drain_array_locked(cachep, ac, node, false, &list);
-	spin_unlock_irq(&n->list_lock);
+	raw_spin_unlock_irq(&n->list_lock);
 
 	slabs_destroy(cachep, &list);
 }
@@ -3901,7 +3901,7 @@  void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 
 	for_each_kmem_cache_node(cachep, node, n) {
 		check_irq_on();
-		spin_lock_irq(&n->list_lock);
+		raw_spin_lock_irq(&n->list_lock);
 
 		total_slabs += n->total_slabs;
 		free_slabs += n->free_slabs;
@@ -3910,7 +3910,7 @@  void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 		if (n->shared)
 			shared_avail += n->shared->avail;
 
-		spin_unlock_irq(&n->list_lock);
+		raw_spin_unlock_irq(&n->list_lock);
 	}
 	num_objs = total_slabs * cachep->num;
 	active_slabs = total_slabs - free_slabs;
diff --git a/mm/slab.h b/mm/slab.h
index 0202a8c2f0d2..7a705e4228c8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -750,7 +750,11 @@  static inline void slab_post_alloc_hook(struct kmem_cache *s,
  * The slab lists for all objects.
  */
 struct kmem_cache_node {
+#ifdef CONFIG_SLAB
+	raw_spinlock_t list_lock;
+#else
 	spinlock_t list_lock;
+#endif
 
 #ifdef CONFIG_SLAB
 	struct list_head slabs_partial;	/* partial list first, better asm code */