[v3,08/11] slub: Replace cmpxchg_double()
Commit Message
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
---
include/linux/slub_def.h | 12 ++-
mm/slab.h | 49 ++++++++++++++--
mm/slub.c | 143 ++++++++++++++++++++++++++++-------------------
3 files changed, 138 insertions(+), 66 deletions(-)
Comments
On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
> @@ -3008,6 +3029,22 @@ static inline bool pfmemalloc_match(stru
> }
>
> #ifndef CONFIG_SLUB_TINY
> +static inline bool
> +__update_cpu_freelist_fast(struct kmem_cache *s,
> + void *freelist_old, void *freelist_new,
> + unsigned long tid)
> +{
> +#ifdef system_has_freelist_aba
> + freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
> + freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
> +
> + return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
> + old.full, new.full) == old.full;
> +#else
> + return false;
> +#endif
> +}
> +
> /*
> * Check the slab->freelist and either transfer the freelist to the
> * per cpu freelist or deactivate the slab.
> @@ -3359,11 +3396,7 @@ static __always_inline void *__slab_allo
> * against code executing on this cpu *not* from access by
> * other cpus.
> */
> - if (unlikely(!this_cpu_cmpxchg_double(
> - s->cpu_slab->freelist, s->cpu_slab->tid,
> - object, tid,
> - next_object, next_tid(tid)))) {
> -
> + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
> note_cmpxchg_failure("slab_alloc", s, tid);
> goto redo;
> }
> @@ -3736,11 +3769,7 @@ static __always_inline void do_slab_free
>
> set_freepointer(s, tail_obj, freelist);
>
> - if (unlikely(!this_cpu_cmpxchg_double(
> - s->cpu_slab->freelist, s->cpu_slab->tid,
> - freelist, tid,
> - head, next_tid(tid)))) {
> -
> + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
> note_cmpxchg_failure("slab_free", s, tid);
> goto redo;
> }
This isn't right; the this_cpu_cmpxchg_double() was unconditional and
relied on the local_irq_save() fallback when no native cmpxchg128 is
present.
The below delta makes things boot again when system_has_cmpxchg128 is
not defined.
I'm going to zap these patches from tip/locking/core for a few days and
fold the below back into the series and let it run through the robots
again.
---
mm/slab.h | 20 +++++++++++---------
mm/slub.c | 6 +-----
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/mm/slab.h b/mm/slab.h
index 5880c70de3d6..b191bf68e6e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -6,36 +6,36 @@
*/
void __init kmem_cache_init(void);
-#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#ifdef CONFIG_64BIT
# ifdef system_has_cmpxchg128
# define system_has_freelist_aba() system_has_cmpxchg128()
# define try_cmpxchg_freelist try_cmpxchg128
-# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg128
-typedef u128 freelist_full_t;
# endif
+#define this_cpu_cmpxchg_freelist this_cpu_cmpxchg128
+typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba() system_has_cmpxchg64()
# define try_cmpxchg_freelist try_cmpxchg64
-# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg64
-typedef u64 freelist_full_t;
# endif
+#define this_cpu_cmpxchg_freelist this_cpu_cmpxchg64
+typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */
-#endif /* CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+
+#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#undef system_has_freelist_aba
+#endif
/*
* Freelist pointer and counter to cmpxchg together, avoids the typical ABA
* problems with cmpxchg of just a pointer.
*/
typedef union {
-#ifdef system_has_freelist_aba
struct {
void *freelist;
unsigned long counter;
};
freelist_full_t full;
-#endif
} freelist_aba_t;
/* Reuses the bits in struct page */
@@ -82,7 +82,9 @@ struct slab {
};
};
};
+#ifdef system_has_freelist_aba
freelist_aba_t freelist_counter;
+#endif
};
};
struct rcu_head rcu_head;
@@ -110,7 +112,7 @@ SLAB_MATCH(memcg_data, memcg_data);
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
-static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
#endif
/**
diff --git a/mm/slub.c b/mm/slub.c
index 161b091746b7..af92c770606d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3034,15 +3034,11 @@ __update_cpu_freelist_fast(struct kmem_cache *s,
void *freelist_old, void *freelist_new,
unsigned long tid)
{
-#ifdef system_has_freelist_aba
freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
old.full, new.full) == old.full;
-#else
- return false;
-#endif
}
/*
On 5/24/23 11:32, Peter Zijlstra wrote:
> On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
>
>> @@ -3008,6 +3029,22 @@ static inline bool pfmemalloc_match(stru
>> }
>>
>> #ifndef CONFIG_SLUB_TINY
>> +static inline bool
>> +__update_cpu_freelist_fast(struct kmem_cache *s,
>> + void *freelist_old, void *freelist_new,
>> + unsigned long tid)
>> +{
>> +#ifdef system_has_freelist_aba
>> + freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
>> + freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
>> +
>> + return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
>> + old.full, new.full) == old.full;
>> +#else
>> + return false;
>> +#endif
>> +}
>> +
>> /*
>> * Check the slab->freelist and either transfer the freelist to the
>> * per cpu freelist or deactivate the slab.
>> @@ -3359,11 +3396,7 @@ static __always_inline void *__slab_allo
>> * against code executing on this cpu *not* from access by
>> * other cpus.
>> */
>> - if (unlikely(!this_cpu_cmpxchg_double(
>> - s->cpu_slab->freelist, s->cpu_slab->tid,
>> - object, tid,
>> - next_object, next_tid(tid)))) {
>> -
>> + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
>> note_cmpxchg_failure("slab_alloc", s, tid);
>> goto redo;
>> }
>> @@ -3736,11 +3769,7 @@ static __always_inline void do_slab_free
>>
>> set_freepointer(s, tail_obj, freelist);
>>
>> - if (unlikely(!this_cpu_cmpxchg_double(
>> - s->cpu_slab->freelist, s->cpu_slab->tid,
>> - freelist, tid,
>> - head, next_tid(tid)))) {
>> -
>> + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
>> note_cmpxchg_failure("slab_free", s, tid);
>> goto redo;
>> }
>
> This isn't right; the this_cpu_cmpxchg_double() was unconditional and
> relied on the local_irq_save() fallback when no native cmpxchg128 is
> present.
>
> The below delta makes things boot again when system_has_cmpxchg128 is
> not defined.
Right, that should do.
> I'm going to zap these patches from tip/locking/core for a few days and
> fold the below back into the series and let it run through the robots
> again.
I noticed some comments in mm/slub.c still mention "cmpxchg_double", dunno
how much you want to clean it right now or can be postponed. Also some sysfs
stats files for CONFIG_SLUB_STATS (not widely used) which we probably might
try renaming without breaking anyone, but it's not guaranteed.
On Wed, May 24, 2023 at 11:32:47AM +0200, Peter Zijlstra wrote:
> On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
>
> > @@ -3008,6 +3029,22 @@ static inline bool pfmemalloc_match(stru
> > }
> >
> > #ifndef CONFIG_SLUB_TINY
> > +static inline bool
> > +__update_cpu_freelist_fast(struct kmem_cache *s,
> > + void *freelist_old, void *freelist_new,
> > + unsigned long tid)
> > +{
> > +#ifdef system_has_freelist_aba
> > + freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
> > + freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
> > +
> > + return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
> > + old.full, new.full) == old.full;
> > +#else
> > + return false;
> > +#endif
> > +}
> > +
> > /*
> > * Check the slab->freelist and either transfer the freelist to the
> > * per cpu freelist or deactivate the slab.
> > @@ -3359,11 +3396,7 @@ static __always_inline void *__slab_allo
> > * against code executing on this cpu *not* from access by
> > * other cpus.
> > */
> > - if (unlikely(!this_cpu_cmpxchg_double(
> > - s->cpu_slab->freelist, s->cpu_slab->tid,
> > - object, tid,
> > - next_object, next_tid(tid)))) {
> > -
> > + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
> > note_cmpxchg_failure("slab_alloc", s, tid);
> > goto redo;
> > }
> > @@ -3736,11 +3769,7 @@ static __always_inline void do_slab_free
> >
> > set_freepointer(s, tail_obj, freelist);
> >
> > - if (unlikely(!this_cpu_cmpxchg_double(
> > - s->cpu_slab->freelist, s->cpu_slab->tid,
> > - freelist, tid,
> > - head, next_tid(tid)))) {
> > -
> > + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
> > note_cmpxchg_failure("slab_free", s, tid);
> > goto redo;
> > }
>
> This isn't right; the this_cpu_cmpxchg_double() was unconditional and
> relied on the local_irq_save() fallback when no native cmpxchg128 is
> present.
This then also means I need to look at this_cpu_cmpxchg128 and
this_cpu_cmpxchg64 behaviour when we don't have the CPUID feature.
Because current versions seem to assume the instruction is present.
On Thu, May 25, 2023, at 12:29, Peter Zijlstra wrote:
> On Wed, May 24, 2023 at 11:32:47AM +0200, Peter Zijlstra wrote:
>> On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
>
> This then also means I need to look at this_cpu_cmpxchg128 and
> this_cpu_cmoxchg64 behaviour when we dont have the CPUID feature.
>
> Because current verions seem to assume the instruction is present.
As far as I could tell when reviewing your series, this_cpu_cmpxchg64()
is always available on all architectures. Depending on compile-time
feature detection this would be either a native instruction that
is guaranteed to work, or the irq-disabled version. On x86, this
is handled at runtime with alternative_io().
this_cpu_cmpxchg128() clearly needed the system_has_cmpxchg128()
check, same as system_has_cmpxchg_double() today.
Arnd
On Thu, May 25, 2023 at 12:52:06PM +0200, Arnd Bergmann wrote:
> On Thu, May 25, 2023, at 12:29, Peter Zijlstra wrote:
> > On Wed, May 24, 2023 at 11:32:47AM +0200, Peter Zijlstra wrote:
> >> On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
> >
> > This then also means I need to look at this_cpu_cmpxchg128 and
> > this_cpu_cmoxchg64 behaviour when we dont have the CPUID feature.
> >
> > Because current verions seem to assume the instruction is present.
>
> As far as I could tell when reviewing your series, this_cpu_cmpxchg64()
> is always available on all architectures. Depending on compile-time
> feature detection this would be either a native instruction that
> is guaranteed to work, or the irq-disabled version. On x86, this
> is handled at runtime with alternative_io().
>
> this_cpu_cmpxchg128() clearly needed the system_has_cmpxchg128()
> check, same as system_has_cmpxchg_double() today.
So, having just dug through all that, on x86:
this_cpu_cmpxchg64() is:
X86_CMPXCHG64=n -> fallback, irrespective of CX8
X86_CMPXCHG64=y -> cmpxchg8b
X86_64 -> cmpxchg
I've changed it to be similar between 32bit and 64bit such that both:
cmpxchg#b when CX#, otherwise this_cpu_cmpxchg#b_emu
On Wed, May 24, 2023 at 11:32:47AM +0200, Peter Zijlstra wrote:
> On Mon, May 15, 2023 at 09:57:07AM +0200, Peter Zijlstra wrote:
>
> > @@ -3008,6 +3029,22 @@ static inline bool pfmemalloc_match(stru
> > }
> >
> > #ifndef CONFIG_SLUB_TINY
> > +static inline bool
> > +__update_cpu_freelist_fast(struct kmem_cache *s,
> > + void *freelist_old, void *freelist_new,
> > + unsigned long tid)
> > +{
> > +#ifdef system_has_freelist_aba
> > + freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
> > + freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
> > +
> > + return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
> > + old.full, new.full) == old.full;
> > +#else
> > + return false;
> > +#endif
> > +}
> > +
> > /*
> > * Check the slab->freelist and either transfer the freelist to the
> > * per cpu freelist or deactivate the slab.
> > @@ -3359,11 +3396,7 @@ static __always_inline void *__slab_allo
> > * against code executing on this cpu *not* from access by
> > * other cpus.
> > */
> > - if (unlikely(!this_cpu_cmpxchg_double(
> > - s->cpu_slab->freelist, s->cpu_slab->tid,
> > - object, tid,
> > - next_object, next_tid(tid)))) {
> > -
> > + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
> > note_cmpxchg_failure("slab_alloc", s, tid);
> > goto redo;
> > }
> > @@ -3736,11 +3769,7 @@ static __always_inline void do_slab_free
> >
> > set_freepointer(s, tail_obj, freelist);
> >
> > - if (unlikely(!this_cpu_cmpxchg_double(
> > - s->cpu_slab->freelist, s->cpu_slab->tid,
> > - freelist, tid,
> > - head, next_tid(tid)))) {
> > -
> > + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
> > note_cmpxchg_failure("slab_free", s, tid);
> > goto redo;
> > }
>
> This isn't right; the this_cpu_cmpxchg_double() was unconditional and
> relied on the local_irq_save() fallback when no native cmpxchg128 is
> present.
This means this_cpu_cmpxchg128 is expected to be present on all 64bit
archs, except Mark just found out that HPPA doesn't support __int128
until gcc-11.
(I've been building using gcc-12.2)
And because the cmpxchg128 fallback relies on '==' we can't trivially
fudge that with a struct type either :/ Now, afaict it all magically
works if I use:
#ifdef __SIZEOF_INT128__
typedef __s128 s128;
typedef __u128 u128;
#else
#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT)
typedef long double u128;
#endif
#endif
but that is *super* gross.
The alternative is raising the minimum GCC for PARISC to gcc-11..
Yet another alternative is using a struct type and an equality function,
just for this.
Anybody?
On Tue, May 30, 2023 at 04:22:32PM +0200, Peter Zijlstra wrote:
> Yet another alternative is using a struct type and an equality function,
> just for this.
The best I could come up with in that regard is the below. It builds on
HPPA64 and x86_64, but I've not run it yet.
(also, the introduction of this_cpu_try_cmpxchg() should probably be
split out into its own patch)
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -99,6 +99,15 @@ do { \
__ret; \
})
+#define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \
+({ \
+ typeof(pcp) __ret, __old = *(ovalp); \
+ __ret = raw_cpu_cmpxchg(pcp, __old, nval); \
+ if (!likely(__ret == __old)) \
+ *(ovalp) = __ret; \
+ likely(__ret == __old); \
+})
+
#define __this_cpu_generic_read_nopreempt(pcp) \
({ \
typeof(pcp) ___ret; \
@@ -167,6 +176,15 @@ do { \
__ret; \
})
+#define this_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \
+({ \
+ typeof(pcp) __ret, __old = *(ovalp); \
+ __ret = this_cpu_cmpxchg(pcp, __old, nval); \
+ if (!likely(__ret == __old)) \
+ *(ovalp) = __ret; \
+ likely(__ret == __old); \
+})
+
#ifndef raw_cpu_read_1
#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp)
#endif
@@ -258,6 +276,36 @@ do { \
#define raw_cpu_xchg_8(pcp, nval) raw_cpu_generic_xchg(pcp, nval)
#endif
+#ifndef __SIZEOF_INT128__
+#define raw_cpu_generic_try_cmpxchg_memcmp(pcp, ovalp, nval) \
+({ \
+ typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \
+ typeof(pcp) __ret, __old = *(ovalp); \
+ bool __s; \
+ __ret = *__p; \
+ if (!__builtin_memcmp(&__ret, &__old, sizeof(pcp))) { \
+ *__p = nval; \
+ __s = true; \
+ } else { \
+ *(ovalp) = __ret; \
+ __s = false; \
+ } \
+ __s; \
+})
+
+#define raw_cpu_generic_cmpxchg_memcmp(pcp, oval, nval) \
+({ \
+ typeof(pcp) __old = (oval); \
+ raw_cpu_generic_try_cmpxchg_memcmp(pcp, &__old, nval); \
+ __old; \
+})
+
+#define raw_cpu_cmpxchg128(pcp, oval, nval) \
+ raw_cpu_generic_cmpxchg_memcmp(pcp, oval, nval)
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg_memcmp(pcp, ovalp, nval)
+#endif
+
#ifndef raw_cpu_cmpxchg_1
#define raw_cpu_cmpxchg_1(pcp, oval, nval) \
raw_cpu_generic_cmpxchg(pcp, oval, nval)
@@ -283,6 +331,31 @@ do { \
raw_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
+#ifndef raw_cpu_try_cmpxchg_1
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef raw_cpu_try_cmpxchg_2
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef raw_cpu_try_cmpxchg_4
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef raw_cpu_try_cmpxchg_8
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef raw_cpu_try_cmpxchg64
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef raw_cpu_try_cmpxchg128
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) \
+ raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+
#ifndef this_cpu_read_1
#define this_cpu_read_1(pcp) this_cpu_generic_read(pcp)
#endif
@@ -374,6 +447,33 @@ do { \
#define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
#endif
+#ifndef __SIZEOF_INT128__
+#define this_cpu_generic_try_cmpxchg_memcmp(pcp, ovalp, nval) \
+({ \
+ bool __ret; \
+ unsigned long __flags; \
+ raw_local_irq_save(__flags); \
+ __ret = raw_cpu_generic_try_cmpxchg_memcmp(pcp, ovalp, nval); \
+ raw_local_irq_restore(__flags); \
+ __ret; \
+})
+
+#define this_cpu_generic_cmpxchg_memcmp(pcp, oval, nval) \
+({ \
+ typeof(pcp) __ret; \
+ unsigned long __flags; \
+ raw_local_irq_save(__flags); \
+ __ret = raw_cpu_generic_cmpxchg_memcmp(pcp, oval, nval); \
+ raw_local_irq_restore(__flags); \
+ __ret; \
+})
+
+#define this_cpu_cmpxchg128(pcp, oval, nval) \
+ this_cpu_generic_cmpxchg_memcmp(pcp, oval, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg_memcmp(pcp, ovalp, nval)
+#endif
+
#ifndef this_cpu_cmpxchg_1
#define this_cpu_cmpxchg_1(pcp, oval, nval) \
this_cpu_generic_cmpxchg(pcp, oval, nval)
@@ -399,4 +499,29 @@ do { \
this_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
+#ifndef this_cpu_try_cmpxchg_1
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef this_cpu_try_cmpxchg_2
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef this_cpu_try_cmpxchg_4
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef this_cpu_try_cmpxchg_8
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef this_cpu_try_cmpxchg64
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+#ifndef this_cpu_try_cmpxchg128
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) \
+ this_cpu_generic_try_cmpxchg(pcp, ovalp, nval)
+#endif
+
#endif /* _ASM_GENERIC_PERCPU_H_ */
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -13,6 +13,13 @@
#ifdef __SIZEOF_INT128__
typedef __s128 s128;
typedef __u128 u128;
+#else
+#ifdef CONFIG_64BIT
+/* hack for this_cpu_cmpxchg128 */
+typedef struct {
+ u64 a, b;
+} u128 __attribute__((aligned(16)));
+#endif
#endif
typedef u32 __kernel_dev_t;
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -11,14 +11,14 @@ void __init kmem_cache_init(void);
# define system_has_freelist_aba() system_has_cmpxchg128()
# define try_cmpxchg_freelist try_cmpxchg128
# endif
-#define this_cpu_cmpxchg_freelist this_cpu_cmpxchg128
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba() system_has_cmpxchg64()
# define try_cmpxchg_freelist try_cmpxchg64
# endif
-#define this_cpu_cmpxchg_freelist this_cpu_cmpxchg64
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3037,8 +3037,8 @@ __update_cpu_freelist_fast(struct kmem_c
freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
- return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
- old.full, new.full) == old.full;
+ return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ &old.full, new.full);
}
/*
@@ -39,7 +39,8 @@ enum stat_item {
CPU_PARTIAL_FREE, /* Refill cpu partial on free */
CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
- NR_SLUB_STAT_ITEMS };
+ NR_SLUB_STAT_ITEMS
+};
#ifndef CONFIG_SLUB_TINY
/*
@@ -47,8 +48,13 @@ enum stat_item {
* with this_cpu_cmpxchg_double() alignment requirements.
*/
struct kmem_cache_cpu {
- void **freelist; /* Pointer to next available object */
- unsigned long tid; /* Globally unique transaction id */
+ union {
+ struct {
+ void **freelist; /* Pointer to next available object */
+ unsigned long tid; /* Globally unique transaction id */
+ };
+ freelist_aba_t freelist_tid;
+ };
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated frozen slabs */
@@ -6,6 +6,38 @@
*/
void __init kmem_cache_init(void);
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# define try_cmpxchg_freelist try_cmpxchg128
+# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg128
+typedef u128 freelist_full_t;
+# endif
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# define try_cmpxchg_freelist try_cmpxchg64
+# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg64
+typedef u64 freelist_full_t;
+# endif
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+
+/*
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
+ */
+typedef union {
+#ifdef system_has_freelist_aba
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+#endif
+} freelist_aba_t;
+
/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;
@@ -38,14 +70,19 @@ struct slab {
#endif
};
/* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+ freelist_aba_t freelist_counter;
};
};
struct rcu_head rcu_head;
@@ -72,7 +109,7 @@ SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
+#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
#endif
@@ -292,7 +292,12 @@ static inline bool kmem_cache_has_cpu_pa
/* Poison object */
#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
+
+#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#endif
/*
* Tracking user of a slab.
@@ -512,6 +517,40 @@ static __always_inline void slab_unlock(
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline bool
+__update_freelist_fast(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+ return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ return false;
+#endif
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+ bool ret = false;
+
+ slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ ret = true;
+ }
+ slab_unlock(slab);
+
+ return ret;
+}
+
/*
* Interrupts must be disabled (for the fallback code to work right), typically
* by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -519,33 +558,25 @@ static __always_inline void slab_unlock(
* allocation/ free operation in hardirq context. Therefore nothing can
* interrupt the operation.
*/
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
+ bool ret;
+
if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -557,36 +588,26 @@ static inline bool __cmpxchg_double_slab
return false;
}
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
unsigned long flags;
local_irq_save(flags);
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
local_irq_restore(flags);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -2228,7 +2249,7 @@ static inline void *acquire_slab(struct
VM_BUG_ON(new.frozen);
new.frozen = 1;
- if (!__cmpxchg_double_slab(s, slab,
+ if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
@@ -2554,7 +2575,7 @@ static void deactivate_slab(struct kmem_
}
- if (!cmpxchg_double_slab(s, slab,
+ if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
@@ -2611,7 +2632,7 @@ static void __unfreeze_partials(struct k
new.frozen = 0;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
@@ -3008,6 +3029,22 @@ static inline bool pfmemalloc_match(stru
}
#ifndef CONFIG_SLUB_TINY
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+ void *freelist_old, void *freelist_new,
+ unsigned long tid)
+{
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+ return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ old.full, new.full) == old.full;
+#else
+ return false;
+#endif
+}
+
/*
* Check the slab->freelist and either transfer the freelist to the
* per cpu freelist or deactivate the slab.
@@ -3034,7 +3071,7 @@ static inline void *get_freelist(struct
new.inuse = slab->objects;
new.frozen = freelist != NULL;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -3359,11 +3396,7 @@ static __always_inline void *__slab_allo
* against code executing on this cpu *not* from access by
* other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- object, tid,
- next_object, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
@@ -3631,7 +3664,7 @@ static void __slab_free(struct kmem_cach
}
}
- } while (!cmpxchg_double_slab(s, slab,
+ } while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3736,11 +3769,7 @@ static __always_inline void do_slab_free
set_freepointer(s, tail_obj, freelist);
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- freelist, tid,
- head, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
@@ -4505,11 +4534,11 @@ static int kmem_cache_open(struct kmem_c
}
}
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+#ifdef system_has_freelist_aba
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
+ }
#endif
/*