[RFC,1/5] slub: Introduce on_partial()

Message ID 20231017154439.3036608-2-chengming.zhou@linux.dev
State New
Headers
Series slub: Delay freezing of CPU partial slabs |

Commit Message

Chengming Zhou Oct. 17, 2023, 3:44 p.m. UTC
  From: Chengming Zhou <zhouchengming@bytedance.com>

We change slab->__unused to slab->flags to use it as SLUB_FLAGS, which
currently only includes the SF_NODE_PARTIAL flag. It indicates whether or
not the slab is on the node partial list.

The following patches will stop freezing the slab when moving it
from the node partial list to the cpu partial list. So we can't rely on the
frozen bit to see if we should manipulate the slab->slab_list.

Instead we will rely on this SF_NODE_PARTIAL flag, which is protected
by node list_lock.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 mm/slab.h |  2 +-
 mm/slub.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
  

Comments

Matthew Wilcox Oct. 17, 2023, 3:54 p.m. UTC | #1
On Tue, Oct 17, 2023 at 03:44:35PM +0000, chengming.zhou@linux.dev wrote:
> We change slab->__unused to slab->flags to use it as SLUB_FLAGS, which
> now only include SF_NODE_PARTIAL flag. It indicates whether or not the
> slab is on node partial list.

This is an unnecessarily complex solution.  As with the pfmemalloc bit,
we can reuse the folio flags for a few flags.  I would recommend the
PG_workingset bit for this purpose.
  
Chengming Zhou Oct. 18, 2023, 7:37 a.m. UTC | #2
On 2023/10/17 23:54, Matthew Wilcox wrote:
> On Tue, Oct 17, 2023 at 03:44:35PM +0000, chengming.zhou@linux.dev wrote:
>> We change slab->__unused to slab->flags to use it as SLUB_FLAGS, which
>> now only include SF_NODE_PARTIAL flag. It indicates whether or not the
>> slab is on node partial list.
> 
> This is an unnecessarily complex solution.  As with the pfmemalloc bit,
> we can reuse the folio flags for a few flags.  I would recommend the
> PG_workingset bit for this purpose.
> 

Yeah, this is better indeed. Thanks for your suggestion!
  
kernel test robot Oct. 27, 2023, 5:26 a.m. UTC | #3
Hello,

kernel test robot noticed "WARNING:at_mm/slub.c:#___add_partial" on:

commit: 0805463ab860a2dde667bd4423a30efbf650b34b ("[RFC PATCH 1/5] slub: Introduce on_partial()")
url: https://github.com/intel-lab-lkp/linux/commits/chengming-zhou-linux-dev/slub-Introduce-on_partial/20231017-234739
base: git://git.kernel.org/cgit/linux/kernel/git/vbabka/slab.git for-next
patch link: https://lore.kernel.org/all/20231017154439.3036608-2-chengming.zhou@linux.dev/
patch subject: [RFC PATCH 1/5] slub: Introduce on_partial()

in testcase: boot

compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)


+--------------------------------------------+------------+------------+
|                                            | e050a704f3 | 0805463ab8 |
+--------------------------------------------+------------+------------+
| WARNING:at_mm/slub.c:#___add_partial       | 0          | 16         |
| RIP:___add_partial                         | 0          | 16         |
+--------------------------------------------+------------+------------+


If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202310271308.9076b4c0-oliver.sang@intel.com


[    2.344426][    T0] ------------[ cut here ]------------
[ 2.345095][ T0] WARNING: CPU: 0 PID: 0 at mm/slub.c:2132 ___add_partial (mm/slub.c:2132) 
[    2.346072][    T0] Modules linked in:
[    2.346555][    T0] CPU: 0 PID: 0 Comm: swapper Not tainted 6.6.0-rc5-00008-g0805463ab860 #1 e88a4d31ac7553ddd9cc4ecfa6b6cbc9ab8c98ab
[    2.348039][    T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 2.349271][ T0] RIP: 0010:___add_partial (mm/slub.c:2132) 
[ 2.349920][ T0] Code: 05 52 3f fb 05 53 48 89 f3 85 c0 75 0a 83 4b 30 01 5b e9 28 3c 06 03 48 83 c7 18 be ff ff ff ff e8 6a ec 02 03 85 c0 75 e4 90 <0f> 0b 90 83 4b 30 01 5b e9 08 3c 06 03 0f 1f 84 00 00 00 00 00 f6
All code
========
   0:	05 52 3f fb 05       	add    $0x5fb3f52,%eax
   5:	53                   	push   %rbx
   6:	48 89 f3             	mov    %rsi,%rbx
   9:	85 c0                	test   %eax,%eax
   b:	75 0a                	jne    0x17
   d:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
  11:	5b                   	pop    %rbx
  12:	e9 28 3c 06 03       	jmp    0x3063c3f
  17:	48 83 c7 18          	add    $0x18,%rdi
  1b:	be ff ff ff ff       	mov    $0xffffffff,%esi
  20:	e8 6a ec 02 03       	call   0x302ec8f
  25:	85 c0                	test   %eax,%eax
  27:	75 e4                	jne    0xd
  29:	90                   	nop
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	90                   	nop
  2d:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
  31:	5b                   	pop    %rbx
  32:	e9 08 3c 06 03       	jmp    0x3063c3f
  37:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
  3e:	00 
  3f:	f6                   	.byte 0xf6

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	90                   	nop
   3:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
   7:	5b                   	pop    %rbx
   8:	e9 08 3c 06 03       	jmp    0x3063c15
   d:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
  14:	00 
  15:	f6                   	.byte 0xf6
[    2.352308][    T0] RSP: 0000:ffffffff86407dd8 EFLAGS: 00010046
[    2.353078][    T0] RAX: 0000000000000000 RBX: ffffea0004001000 RCX: 0000000000000001
[    2.354058][    T0] RDX: 0000000000000000 RSI: ffffffff84e8e940 RDI: ffffffff855b1ca0
[    2.355041][    T0] RBP: ffff888100040000 R08: 0000000000000002 R09: 0000000000000000
[    2.355978][    T0] R10: ffffffff86f35083 R11: ffffffff819fd2f1 R12: 0000000000000000
[    2.356822][    T0] R13: ffff888100040048 R14: 0000000000000015 R15: ffffffff886073e0
[    2.357702][    T0] FS:  0000000000000000(0000) GS:ffff8883aec00000(0000) knlGS:0000000000000000
[    2.358674][    T0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.359469][    T0] CR2: ffff88843ffff000 CR3: 00000000064dc000 CR4: 00000000000000b0
[    2.360402][    T0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    2.361328][    T0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    2.362305][    T0] Call Trace:
[    2.362722][    T0]  <TASK>
[ 2.363087][ T0] ? show_regs (arch/x86/kernel/dumpstack.c:479) 
[ 2.365499][ T0] ? __warn (kernel/panic.c:673) 
[ 2.366034][ T0] ? ___add_partial (mm/slub.c:2132) 
[ 2.366627][ T0] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
[ 2.367200][ T0] ? handle_bug (arch/x86/kernel/traps.c:237) 
[ 2.367743][ T0] ? exc_invalid_op (arch/x86/kernel/traps.c:258 (discriminator 1)) 
[ 2.368309][ T0] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) 
[ 2.368930][ T0] ? kasan_set_track (mm/kasan/common.c:52) 
[ 2.369529][ T0] ? ___add_partial (mm/slub.c:2132) 
[ 2.370121][ T0] ? ___add_partial (mm/slub.c:2132 (discriminator 1)) 
[ 2.370706][ T0] early_kmem_cache_node_alloc (include/linux/list.h:169 mm/slub.c:2156 mm/slub.c:4308) 
[ 2.371471][ T0] kmem_cache_open (mm/slub.c:4340 mm/slub.c:4578) 
[ 2.372060][ T0] __kmem_cache_create (mm/slub.c:5140) 
[ 2.372688][ T0] create_boot_cache (mm/slab_common.c:654) 
[ 2.373317][ T0] kmem_cache_init (mm/slub.c:5075) 
[ 2.373936][ T0] mm_core_init (mm/mm_init.c:2786) 
[ 2.374519][ T0] start_kernel (init/main.c:929) 
[ 2.375103][ T0] x86_64_start_reservations (arch/x86/kernel/head64.c:544) 
[ 2.375763][ T0] x86_64_start_kernel (arch/x86/kernel/head64.c:486 (discriminator 17)) 
[ 2.376353][ T0] secondary_startup_64_no_verify (arch/x86/kernel/head_64.S:433) 
[    2.377096][    T0]  </TASK>
[    2.377447][    T0] irq event stamp: 0
[ 2.377916][ T0] hardirqs last enabled at (0): 0x0 
[ 2.378794][ T0] hardirqs last disabled at (0): 0x0 
[ 2.379684][ T0] softirqs last enabled at (0): 0x0 
[ 2.380551][ T0] softirqs last disabled at (0): 0x0 
[    2.381441][    T0] ---[ end trace 0000000000000000 ]---
[    2.384117][    T0] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20231027/202310271308.9076b4c0-oliver.sang@intel.com
  
Chengming Zhou Oct. 27, 2023, 9:43 a.m. UTC | #4
On 2023/10/27 13:26, kernel test robot wrote:
> 
> 
> Hello,
> 
> kernel test robot noticed "WARNING:at_mm/slub.c:#___add_partial" on:
> 
> commit: 0805463ab860a2dde667bd4423a30efbf650b34b ("[RFC PATCH 1/5] slub: Introduce on_partial()")
> url: https://github.com/intel-lab-lkp/linux/commits/chengming-zhou-linux-dev/slub-Introduce-on_partial/20231017-234739
> base: git://git.kernel.org/cgit/linux/kernel/git/vbabka/slab.git for-next
> patch link: https://lore.kernel.org/all/20231017154439.3036608-2-chengming.zhou@linux.dev/
> patch subject: [RFC PATCH 1/5] slub: Introduce on_partial()
> 
> in testcase: boot
> 
> compiler: gcc-12
> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
> 
> (please refer to attached dmesg/kmsg for entire log/backtrace)
> 
> 
> +--------------------------------------------+------------+------------+
> |                                            | e050a704f3 | 0805463ab8 |
> +--------------------------------------------+------------+------------+
> | WARNING:at_mm/slub.c:#___add_partial       | 0          | 16         |
> | RIP:___add_partial                         | 0          | 16         |
> +--------------------------------------------+------------+------------+
> 
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <oliver.sang@intel.com>
> | Closes: https://lore.kernel.org/oe-lkp/202310271308.9076b4c0-oliver.sang@intel.com
> 
> 
> [    2.344426][    T0] ------------[ cut here ]------------
> [ 2.345095][ T0] WARNING: CPU: 0 PID: 0 at mm/slub.c:2132 ___add_partial (mm/slub.c:2132) 

The latest version "RFC v3" should not have this problem, since it changes to
use the page flag "workingset" bit instead of the mapcount, which has to be
initialized from -1 to 0 in allocate_slab().

Here, the problem is that the boot cache is not from allocate_slab().

RFC v3: https://lore.kernel.org/all/20231024093345.3676493-1-chengming.zhou@linux.dev/

Thanks!

> [    2.346072][    T0] Modules linked in:
> [    2.346555][    T0] CPU: 0 PID: 0 Comm: swapper Not tainted 6.6.0-rc5-00008-g0805463ab860 #1 e88a4d31ac7553ddd9cc4ecfa6b6cbc9ab8c98ab
> [    2.348039][    T0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> [ 2.349271][ T0] RIP: 0010:___add_partial (mm/slub.c:2132) 
> [ 2.349920][ T0] Code: 05 52 3f fb 05 53 48 89 f3 85 c0 75 0a 83 4b 30 01 5b e9 28 3c 06 03 48 83 c7 18 be ff ff ff ff e8 6a ec 02 03 85 c0 75 e4 90 <0f> 0b 90 83 4b 30 01 5b e9 08 3c 06 03 0f 1f 84 00 00 00 00 00 f6
> All code
> ========
>    0:	05 52 3f fb 05       	add    $0x5fb3f52,%eax
>    5:	53                   	push   %rbx
>    6:	48 89 f3             	mov    %rsi,%rbx
>    9:	85 c0                	test   %eax,%eax
>    b:	75 0a                	jne    0x17
>    d:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
>   11:	5b                   	pop    %rbx
>   12:	e9 28 3c 06 03       	jmp    0x3063c3f
>   17:	48 83 c7 18          	add    $0x18,%rdi
>   1b:	be ff ff ff ff       	mov    $0xffffffff,%esi
>   20:	e8 6a ec 02 03       	call   0x302ec8f
>   25:	85 c0                	test   %eax,%eax
>   27:	75 e4                	jne    0xd
>   29:	90                   	nop
>   2a:*	0f 0b                	ud2		<-- trapping instruction
>   2c:	90                   	nop
>   2d:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
>   31:	5b                   	pop    %rbx
>   32:	e9 08 3c 06 03       	jmp    0x3063c3f
>   37:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
>   3e:	00 
>   3f:	f6                   	.byte 0xf6
> 
> Code starting with the faulting instruction
> ===========================================
>    0:	0f 0b                	ud2
>    2:	90                   	nop
>    3:	83 4b 30 01          	orl    $0x1,0x30(%rbx)
>    7:	5b                   	pop    %rbx
>    8:	e9 08 3c 06 03       	jmp    0x3063c15
>    d:	0f 1f 84 00 00 00 00 	nopl   0x0(%rax,%rax,1)
>   14:	00 
>   15:	f6                   	.byte 0xf6
> [    2.352308][    T0] RSP: 0000:ffffffff86407dd8 EFLAGS: 00010046
> [    2.353078][    T0] RAX: 0000000000000000 RBX: ffffea0004001000 RCX: 0000000000000001
> [    2.354058][    T0] RDX: 0000000000000000 RSI: ffffffff84e8e940 RDI: ffffffff855b1ca0
> [    2.355041][    T0] RBP: ffff888100040000 R08: 0000000000000002 R09: 0000000000000000
> [    2.355978][    T0] R10: ffffffff86f35083 R11: ffffffff819fd2f1 R12: 0000000000000000
> [    2.356822][    T0] R13: ffff888100040048 R14: 0000000000000015 R15: ffffffff886073e0
> [    2.357702][    T0] FS:  0000000000000000(0000) GS:ffff8883aec00000(0000) knlGS:0000000000000000
> [    2.358674][    T0] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    2.359469][    T0] CR2: ffff88843ffff000 CR3: 00000000064dc000 CR4: 00000000000000b0
> [    2.360402][    T0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    2.361328][    T0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [    2.362305][    T0] Call Trace:
> [    2.362722][    T0]  <TASK>
> [ 2.363087][ T0] ? show_regs (arch/x86/kernel/dumpstack.c:479) 
> [ 2.365499][ T0] ? __warn (kernel/panic.c:673) 
> [ 2.366034][ T0] ? ___add_partial (mm/slub.c:2132) 
> [ 2.366627][ T0] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
> [ 2.367200][ T0] ? handle_bug (arch/x86/kernel/traps.c:237) 
> [ 2.367743][ T0] ? exc_invalid_op (arch/x86/kernel/traps.c:258 (discriminator 1)) 
> [ 2.368309][ T0] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) 
> [ 2.368930][ T0] ? kasan_set_track (mm/kasan/common.c:52) 
> [ 2.369529][ T0] ? ___add_partial (mm/slub.c:2132) 
> [ 2.370121][ T0] ? ___add_partial (mm/slub.c:2132 (discriminator 1)) 
> [ 2.370706][ T0] early_kmem_cache_node_alloc (include/linux/list.h:169 mm/slub.c:2156 mm/slub.c:4308) 
> [ 2.371471][ T0] kmem_cache_open (mm/slub.c:4340 mm/slub.c:4578) 
> [ 2.372060][ T0] __kmem_cache_create (mm/slub.c:5140) 
> [ 2.372688][ T0] create_boot_cache (mm/slab_common.c:654) 
> [ 2.373317][ T0] kmem_cache_init (mm/slub.c:5075) 
> [ 2.373936][ T0] mm_core_init (mm/mm_init.c:2786) 
> [ 2.374519][ T0] start_kernel (init/main.c:929) 
> [ 2.375103][ T0] x86_64_start_reservations (arch/x86/kernel/head64.c:544) 
> [ 2.375763][ T0] x86_64_start_kernel (arch/x86/kernel/head64.c:486 (discriminator 17)) 
> [ 2.376353][ T0] secondary_startup_64_no_verify (arch/x86/kernel/head_64.S:433) 
> [    2.377096][    T0]  </TASK>
> [    2.377447][    T0] irq event stamp: 0
> [ 2.377916][ T0] hardirqs last enabled at (0): 0x0 
> [ 2.378794][ T0] hardirqs last disabled at (0): 0x0 
> [ 2.379684][ T0] softirqs last enabled at (0): 0x0 
> [ 2.380551][ T0] softirqs last disabled at (0): 0x0 
> [    2.381441][    T0] ---[ end trace 0000000000000000 ]---
> [    2.384117][    T0] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
> 
> 
> 
> The kernel config and materials to reproduce are available at:
> https://download.01.org/0day-ci/archive/20231027/202310271308.9076b4c0-oliver.sang@intel.com
> 
> 
>
  

Patch

diff --git a/mm/slab.h b/mm/slab.h
index 8cd3294fedf5..11e9c9a0f648 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -89,7 +89,7 @@  struct slab {
 		};
 		struct rcu_head rcu_head;
 	};
-	unsigned int __unused;
+	unsigned int flags;
 
 #else
 #error "Unexpected slab allocator configured"
diff --git a/mm/slub.c b/mm/slub.c
index 63d281dfacdb..e5356ad14951 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1993,6 +1993,12 @@  static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
 }
 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
 
+enum SLUB_FLAGS {
+	SF_INIT_VALUE = 0,
+	SF_EXIT_VALUE = -1,
+	SF_NODE_PARTIAL = 1 << 0,
+};
+
 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct slab *slab;
@@ -2031,6 +2037,7 @@  static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	slab->objects = oo_objects(oo);
 	slab->inuse = 0;
 	slab->frozen = 0;
+	slab->flags = SF_INIT_VALUE;
 
 	account_slab(slab, oo_order(oo), s, flags);
 
@@ -2077,6 +2084,7 @@  static void __free_slab(struct kmem_cache *s, struct slab *slab)
 	int order = folio_order(folio);
 	int pages = 1 << order;
 
+	slab->flags = SF_EXIT_VALUE;
 	__slab_clear_pfmemalloc(slab);
 	folio->mapping = NULL;
 	/* Make the mapping reset visible before clearing the flag */
@@ -2119,9 +2127,28 @@  static void discard_slab(struct kmem_cache *s, struct slab *slab)
 /*
  * Management of partially allocated slabs.
  */
+static void ___add_partial(struct kmem_cache_node *n, struct slab *slab)
+{
+	lockdep_assert_held(&n->list_lock);
+	slab->flags |= SF_NODE_PARTIAL;
+}
+
+static void ___remove_partial(struct kmem_cache_node *n, struct slab *slab)
+{
+	lockdep_assert_held(&n->list_lock);
+	slab->flags &= ~SF_NODE_PARTIAL;
+}
+
+static inline bool on_partial(struct kmem_cache_node *n, struct slab *slab)
+{
+	lockdep_assert_held(&n->list_lock);
+	return slab->flags & SF_NODE_PARTIAL;
+}
+
 static inline void
 __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
 {
+	___add_partial(n, slab);
 	n->nr_partial++;
 	if (tail == DEACTIVATE_TO_TAIL)
 		list_add_tail(&slab->slab_list, &n->partial);
@@ -2142,6 +2169,7 @@  static inline void remove_partial(struct kmem_cache_node *n,
 	lockdep_assert_held(&n->list_lock);
 	list_del(&slab->slab_list);
 	n->nr_partial--;
+	___remove_partial(n, slab);
 }
 
 /*