[RFC,10/14] x86: Create virtual memory region for SLUB
Commit Message
From: Jann Horn <jannh@google.com>
SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses them for both
struct slab and the actual slab memory. The pointers returned by
kmem_cache_alloc will point to this range of memory.
Signed-off-by: Jann Horn <jannh@google.com>
Co-developed-by: Matteo Rizzo <matteorizzo@google.com>
Signed-off-by: Matteo Rizzo <matteorizzo@google.com>
---
Documentation/arch/x86/x86_64/mm.rst | 4 ++--
arch/x86/include/asm/pgtable_64_types.h | 16 ++++++++++++++++
arch/x86/mm/init_64.c | 19 +++++++++++++++----
arch/x86/mm/kaslr.c | 9 +++++++++
arch/x86/mm/mm_internal.h | 4 ++++
mm/slub.c | 4 ++++
security/Kconfig.hardening | 2 ++
7 files changed, 52 insertions(+), 6 deletions(-)
Comments
On Fri, Sep 15, 2023 at 10:59:29AM +0000, Matteo Rizzo wrote:
> From: Jann Horn <jannh@google.com>
>
> SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses them for both
> struct slab and the actual slab memory. The pointers returned by
> kmem_cache_alloc will point to this range of memory.
I think the 512 GiB limit may be worth mentioning in the Kconfig help
text.
And in the "640K is enough for everything" devil's advocacy, why is 512
GiB enough here? Is there any greater risk of a pathological allocation
pattern breaking a system any more (or less) than is currently possible?
>
> Signed-off-by: Jann Horn <jannh@google.com>
But, yes, I'm still a fan, and I think it interacts well here with the
rest of the KASLR initialization:
Reviewed-by: Kees Cook <keescook@chromium.org>
Have you tried to make this work on arm64? I imagine it should be
roughly as easy?
On 9/15/23 14:13, Kees Cook wrote:
> On Fri, Sep 15, 2023 at 10:59:29AM +0000, Matteo Rizzo wrote:
>> From: Jann Horn <jannh@google.com>
>>
>> SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses them for both
>> struct slab and the actual slab memory. The pointers returned by
>> kmem_cache_alloc will point to this range of memory.
>
> I think the 512 GiB limit may be worth mentioning in the Kconfig help
> text.
Yes, please.
> And in the "640K is enough for everything" devil's advocacy, why is 512
> GiB enough here? Is there any greater risk of a pathological allocation
> pattern breaking a system any more (or less) than is currently possible?
I have the feeling folks just grabbed the first big-ish chunk they saw
free in the memory map and stole that one. Not a horrible approach,
mind you, but I have the feeling it didn't go through the most rigorous
sizing procedure. :)
My laptop memory is ~6% consumed by slab, 90% of which is reclaimable.
If a 64TB system had the same ratio, it would bump into this 512GB
limit. But it _should_ just reclaim things earlier rather than falling over.
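A quick sketch of that back-of-the-envelope math (standalone userspace
code, using the ~6% slab / ~90% reclaimable figures above on a
hypothetical 64 TiB box):

#include <stdio.h>

int main(void)
{
	unsigned long long ram    = 64ULL << 40;    /* 64 TiB of RAM */
	unsigned long long slab   = ram * 6 / 100;  /* ~6% of RAM in slab */
	unsigned long long pinned = slab / 10;      /* ~10% of slab unreclaimable */
	unsigned long long region = 512ULL << 30;   /* SLAB_VIRTUAL area */

	/* Prints: slab ~3932 GiB, unreclaimable ~393 GiB, region 512 GiB */
	printf("slab ~%llu GiB, unreclaimable ~%llu GiB, region %llu GiB\n",
	       slab >> 30, pinned >> 30, region >> 30);
	return 0;
}

So even the unreclaimable slice alone gets close to the 512 GiB region,
and the total only fits if reclaim kicks in well before the region is
exhausted.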
That said, we still have gobs of actual vmalloc() space. It's ~30TiB in
size and I'm not aware of anyone consuming anywhere near that much. If
the 512GB fills up somehow, there are other places to steal the space.
One minor concern is that the virtual area is the same size on 4 and
5-level paging systems. It might be a good idea to pick one of the
holes that actually gets bigger on 5-level systems.
On Fri, 15 Sept 2023 at 23:50, Dave Hansen <dave.hansen@intel.com> wrote:
>
> I have the feeling folks just grabbed the first big-ish chunk they saw
> free in the memory map and stole that one. Not a horrible approach,
> mind you, but I have the feeling it didn't go through the most rigorous
> sizing procedure. :)
>
> My laptop memory is ~6% consumed by slab, 90% of which is reclaimable.
> If a 64TB system had the same ratio, it would bump into this 512GB
> limit. But it _should_ just reclaim things earlier rather than falling over.
>
> That said, we still have gobs of actual vmalloc() space. It's ~30TiB in
> size and I'm not aware of anyone consuming anywhere near that much. If
> the 512GB fills up somehow, there are other places to steal the space.
>
> One minor concern is that the virtual area is the same size on 4 and
> 5-level paging systems. It might be a good idea to pick one of the
> holes that actually gets bigger on 5-level systems.
One of the other ideas that we had was to use the KASAN shadow memory instead of
a dedicated area. As far as I know the KASAN region is not used by anything else
when KASAN is disabled, and I don't think it makes sense to have both KASAN and
SLAB_VIRTUAL enabled at the same time (see the patch which introduces the
Kconfig option for why). The KASAN region is 16 TiB on 4-level systems and 8 PiB
on 5-level, in both cases 1/16th the size of the address space.
Could that work?
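For reference, those two sizes are just 1/16th of a 48-bit and a 57-bit
virtual address space respectively (a standalone check, nothing more):

#include <stdio.h>

int main(void)
{
	unsigned long long va48 = 1ULL << 48;   /* 4-level: 256 TiB */
	unsigned long long va57 = 1ULL << 57;   /* 5-level: 128 PiB */

	/* Prints 16 TiB and 8 PiB, matching the figures quoted above. */
	printf("4-level KASAN region: %llu TiB\n", (va48 / 16) >> 40);
	printf("5-level KASAN region: %llu PiB\n", (va57 / 16) >> 50);
	return 0;
}

Either one would also grow when moving to 5-level paging, unlike the
fixed P4D-sized slot used in this patch.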
--
Matteo
@@ -57,7 +57,7 @@ Complete virtual memory map with 4-level page tables
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
- fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
+ fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | SLUB virtual memory
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
@@ -116,7 +116,7 @@ Complete virtual memory map with 5-level page tables
fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
| | | | vaddr_end for KASLR
fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
- fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
+ fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | SLUB virtual memory
ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
@@ -6,6 +6,7 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
+#include <linux/align.h>
#include <asm/kaslr.h>
/*
@@ -199,6 +200,21 @@ extern unsigned int ptrs_per_p4d;
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#ifdef CONFIG_SLAB_VIRTUAL
+#define SLAB_PGD_ENTRY _AC(-3, UL)
+#define SLAB_BASE_ADDR (SLAB_PGD_ENTRY << P4D_SHIFT)
+#define SLAB_END_ADDR (SLAB_BASE_ADDR + P4D_SIZE)
+
+/*
+ * We need to define this here because we need it to compute SLAB_META_SIZE
+ * and including slab.h causes a dependency cycle.
+ */
+#define STRUCT_SLAB_SIZE (32 * sizeof(void *))
+#define SLAB_VPAGES ((SLAB_END_ADDR - SLAB_BASE_ADDR) / PAGE_SIZE)
+#define SLAB_META_SIZE ALIGN(SLAB_VPAGES * STRUCT_SLAB_SIZE, PAGE_SIZE)
+#define SLAB_DATA_BASE_ADDR (SLAB_BASE_ADDR + SLAB_META_SIZE)
+#endif /* CONFIG_SLAB_VIRTUAL */
+
#define CPU_ENTRY_AREA_PGD _AC(-4, UL)
#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
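For the concrete numbers, here is a standalone userspace sketch of the
macros above with the x86-64 constants spelled out (PAGE_SIZE_4K,
P4D_SHIFT_X86 and P4D_SIZE_X86 are local stand-ins for the kernel's
PAGE_SIZE, P4D_SHIFT and P4D_SIZE):

#include <stdio.h>

/* Restates the SLAB_* macros with PAGE_SIZE = 4 KiB and P4D_SHIFT = 39,
 * just to show the resulting numbers; the kernel takes these constants
 * from pgtable_64_types.h. */
#define PAGE_SIZE_4K        (1UL << 12)
#define P4D_SHIFT_X86       39
#define P4D_SIZE_X86        (1UL << P4D_SHIFT_X86)             /* 512 GiB */

#define SLAB_PGD_ENTRY      (-3UL)
#define SLAB_BASE_ADDR      (SLAB_PGD_ENTRY << P4D_SHIFT_X86)
#define SLAB_END_ADDR       (SLAB_BASE_ADDR + P4D_SIZE_X86)
#define STRUCT_SLAB_SIZE    (32 * sizeof(void *))              /* 256 bytes */
#define SLAB_VPAGES         ((SLAB_END_ADDR - SLAB_BASE_ADDR) / PAGE_SIZE_4K)
#define SLAB_META_SIZE      (SLAB_VPAGES * STRUCT_SLAB_SIZE)   /* already page-aligned */
#define SLAB_DATA_BASE_ADDR (SLAB_BASE_ADDR + SLAB_META_SIZE)

int main(void)
{
	printf("base %#lx\n", SLAB_BASE_ADDR);                  /* 0xfffffe8000000000 */
	printf("end  %#lx\n", SLAB_END_ADDR);                   /* 0xffffff0000000000 */
	printf("meta %lu GiB of struct slab\n", SLAB_META_SIZE >> 30);   /* 32 */
	printf("data %#lx\n", SLAB_DATA_BASE_ADDR);             /* 0xfffffe8800000000 */
	return 0;
}

Since the area is one P4D-sized slot and P4D_SIZE is 512 GiB with both
4- and 5-level paging, the region stays the same size either way, which
is the point Dave raises above.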
@@ -1279,16 +1279,19 @@ static void __init register_page_bootmem_info(void)
}
/*
- * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
+ * Pre-allocates page-table pages for the vmalloc and SLUB areas in the kernel
+ * page-table.
* Only the level which needs to be synchronized between all page-tables is
* allocated because the synchronization can be expensive.
*/
-static void __init preallocate_vmalloc_pages(void)
+static void __init preallocate_top_level_entries_range(unsigned long start,
+ unsigned long end)
{
unsigned long addr;
const char *lvl;
- for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
@@ -1328,6 +1331,14 @@ static void __init preallocate_vmalloc_pages(void)
panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
}
+static void __init preallocate_top_level_entries(void)
+{
+ preallocate_top_level_entries_range(VMALLOC_START, VMEMORY_END);
+#ifdef CONFIG_SLAB_VIRTUAL
+ preallocate_top_level_entries_range(SLAB_BASE_ADDR, SLAB_END_ADDR - 1);
+#endif
+}
+
void __init mem_init(void)
{
pci_iommu_alloc();
@@ -1351,7 +1362,7 @@ void __init mem_init(void)
if (get_gate_vma(&init_mm))
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
- preallocate_vmalloc_pages();
+ preallocate_top_level_entries();
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -136,6 +136,15 @@ void __init kernel_randomize_memory(void)
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
+
+#ifdef CONFIG_SLAB_VIRTUAL
+ /*
+ * slub_addr_base is initialized separately from the
+ * kaslr_memory_regions because it comes after CPU_ENTRY_AREA_BASE.
+ */
+ prandom_bytes_state(&rand_state, &rand, sizeof(rand));
+ slub_addr_base += (rand & ((1UL << 36) - PAGE_SIZE));
+#endif
}
void __meminit init_trampoline_kaslr(void)
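The mask used above keeps the randomization page-aligned and below
1 << 36 bytes; a standalone sketch of what it works out to (PAGE_SIZE
assumed to be 4 KiB):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 1UL << 12;             /* 4 KiB */
	unsigned long mask = (1UL << 36) - page_size;    /* 0xffffff000 */

	/* Page-aligned offsets in [0, 64 GiB), i.e. 24 bits of entropy
	 * for slub_addr_base. */
	printf("mask %#lx -> %lu positions below %lu GiB\n",
	       mask, (mask >> 12) + 1, (1UL << 36) >> 30);
	return 0;
}

The offset is added on top of SLAB_DATA_BASE_ADDR, so the randomized
base still lands well inside the reserved 512 GiB region.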
@@ -25,4 +25,8 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
extern unsigned long tlb_single_page_flush_ceiling;
+#ifdef CONFIG_SLAB_VIRTUAL
+extern unsigned long slub_addr_base;
+#endif
+
#endif /* __X86_MM_INTERNAL_H */
@@ -166,6 +166,10 @@
* the fast path and disables lockless freelists.
*/
+#ifdef CONFIG_SLAB_VIRTUAL
+unsigned long slub_addr_base = SLAB_DATA_BASE_ADDR;
+#endif /* CONFIG_SLAB_VIRTUAL */
+
/*
* We could simply use migrate_disable()/enable() but as long as it's a
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
@@ -357,6 +357,8 @@ config GCC_PLUGIN_RANDSTRUCT
config SLAB_VIRTUAL
bool "Allocate slab objects from virtual memory"
+ # For virtual memory region allocation
+ depends on X86_64
depends on SLUB && !SLUB_TINY
# If KFENCE support is desired, it could be implemented on top of our
# virtual memory allocation facilities