On 07/03/2023 14:04, Ard Biesheuvel wrote:
> Even though we support loading kernels anywhere in 48-bit addressable
> physical memory, we create the ID maps based on the number of levels
> that we happened to configure for the kernel VA and user VA spaces.
>
> The reason for this is that the PGD/PUD/PMD based classification of
> translation levels, along with the associated folding when the number of
> levels is less than 5, does not permit creating a page table hierarchy
> of a set number of levels. This means that, for instance, on 39-bit VA
> kernels we need to configure an additional level above PGD level on the
> fly, and 36-bit VA kernels still only support 47-bit virtual addressing
> with this trick applied.
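As a sanity check on the fixed 48-bit choice (assuming I'm reading
ARM64_HW_PGTABLE_LEVELS() in pgtable-hwdef.h correctly):

  ARM64_HW_PGTABLE_LEVELS(48) = (48 - 4) / (PAGE_SHIFT - 3)
                              = 44 / 9  = 4 levels (4K pages)
                              = 44 / 11 = 4 levels (16K pages)
                              = 44 / 13 = 3 levels (64K pages)

so the extra level configured on the fly does indeed go away for all
page sizes.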
>
> Now that we have a separate helper to populate page table hierarchies
> that does not define the levels in terms of PUDs/PMDs/etc at all, let's
> reuse it to create the permanent ID map with a fixed VA size of 48 bits.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> arch/arm64/include/asm/kernel-pgtable.h | 2 ++
> arch/arm64/kernel/head.S | 5 +++
> arch/arm64/kvm/mmu.c | 15 +++------
> arch/arm64/mm/mmu.c | 32 +++++++++++---------
> arch/arm64/mm/proc.S | 9 ++----
> 5 files changed, 31 insertions(+), 32 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
> index 50b5c145358a5d8e..2a2c80ffe59e5307 100644
> --- a/arch/arm64/include/asm/kernel-pgtable.h
> +++ b/arch/arm64/include/asm/kernel-pgtable.h
> @@ -35,6 +35,8 @@
> #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
> #endif
>
> +#define IDMAP_LEVELS ARM64_HW_PGTABLE_LEVELS(48)
> +#define IDMAP_ROOT_LEVEL (4 - IDMAP_LEVELS)
>
> /*
> * If KASLR is enabled, then an offset K is added to the kernel address
> diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
> index e45fd99e8ab4272a..fc6a4076d826b728 100644
> --- a/arch/arm64/kernel/head.S
> +++ b/arch/arm64/kernel/head.S
> @@ -727,6 +727,11 @@ SYM_FUNC_START_LOCAL(__no_granule_support)
> SYM_FUNC_END(__no_granule_support)
>
> SYM_FUNC_START_LOCAL(__primary_switch)
> + mrs x1, tcr_el1
> + mov x2, #64 - VA_BITS
> + tcr_set_t0sz x1, x2
> + msr tcr_el1, x1
> +
> adrp x1, reserved_pg_dir
> adrp x2, init_idmap_pg_dir
> bl __enable_mmu
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 7113587222ffe8e1..d64be7b5f6692e8b 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1687,16 +1687,9 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
> BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
>
> /*
> - * The ID map may be configured to use an extended virtual address
> - * range. This is only the case if system RAM is out of range for the
> - * currently configured page size and VA_BITS_MIN, in which case we will
> - * also need the extended virtual range for the HYP ID map, or we won't
> - * be able to enable the EL2 MMU.
> - *
> - * However, in some cases the ID map may be configured for fewer than
> - * the number of VA bits used by the regular kernel stage 1. This
> - * happens when VA_BITS=52 and the kernel image is placed in PA space
> - * below 48 bits.
> + * The ID map is always configured for 48 bits of translation, which
> + * may be fewer than the number of VA bits used by the regular kernel
> + * stage 1, when VA_BITS=52.
> *
> * At EL2, there is only one TTBR register, and we can't switch between
> * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
> @@ -1707,7 +1700,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
> * 1 VA bits to assure that the hypervisor can both ID map its code page
> * and map any kernel memory.
> */
> - idmap_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
> + idmap_bits = 48;
> kernel_bits = vabits_actual;
> *hyp_va_bits = max(idmap_bits, kernel_bits);
This effectively means that the hypervisor always uses at least 48 VA bits.
Previously, I think it would have been 39 for (e.g.) Android builds? Does this
have any performance implications for pKVM?
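For reference, my reading of the before/after on a VA_BITS=39 config
(assuming all RAM is 39-bit addressable, so idmap_t0sz held TCR_T0SZ(39)):

  /* before: */
  idmap_bits  = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); /* 39 */
  kernel_bits = vabits_actual;                                          /* 39 */
  *hyp_va_bits = max(idmap_bits, kernel_bits);                          /* 39 */

  /* after: */
  idmap_bits = 48;
  *hyp_va_bits = max(idmap_bits, kernel_bits);                          /* 48 */

i.e. the hyp VA space (and so the depth of the hyp stage 1 walk) grows
even though the kernel itself still runs with 39 bits.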
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 81e1420d2cc13246..a59433ae4f5f8d02 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -762,22 +762,21 @@ static void __init map_kernel(pgd_t *pgdp)
> kasan_copy_shadow(pgdp);
> }
>
> +void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
> + int level, pte_t *tbl, bool may_use_cont, u64 va_offset);
> +
> +static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
> + kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;
I see this new storage introduced, but I don't see you removing the storage for
the old method (I have a vague memory of it being defined in the linker script)?
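If my memory is right, it's the IDMAP_DIR_SIZE reservation in
arch/arm64/kernel/vmlinux.lds.S, something along the lines of:

  . = ALIGN(PAGE_SIZE);
  idmap_pg_dir = .;
  . += IDMAP_DIR_SIZE;

With the non-root levels now coming from the static idmap_ptes[] and
kpti_ptes[] arrays, I'd expect that reservation could shrink to a single
page for the root table.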
> +
> static void __init create_idmap(void)
> {
> u64 start = __pa_symbol(__idmap_text_start);
> - u64 size = __pa_symbol(__idmap_text_end) - start;
> - pgd_t *pgd = idmap_pg_dir;
> - u64 pgd_phys;
> -
> - /* check if we need an additional level of translation */
> - if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
> - pgd_phys = early_pgtable_alloc(PAGE_SHIFT);
> - set_pgd(&idmap_pg_dir[start >> VA_BITS],
> - __pgd(pgd_phys | P4D_TYPE_TABLE));
> - pgd = __va(pgd_phys);
> - }
> - __create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX,
> - early_pgtable_alloc, 0);
> + u64 end = __pa_symbol(__idmap_text_end);
> + u64 ptep = __pa_symbol(idmap_ptes);
> +
> + __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
> + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
> + __phys_to_virt(ptep) - ptep);
>
> if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
> extern u32 __idmap_kpti_flag;
> @@ -787,8 +786,10 @@ static void __init create_idmap(void)
> * The KPTI G-to-nG conversion code needs a read-write mapping
> * of its synchronization flag in the ID map.
> */
> - __create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL,
> - early_pgtable_alloc, 0);
> + ptep = __pa_symbol(kpti_ptes);
> + __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
> + IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
> + __phys_to_virt(ptep) - ptep);
> }
> }
>
> @@ -813,6 +814,7 @@ void __init paging_init(void)
> memblock_allow_resize();
>
> create_idmap();
> + idmap_t0sz = TCR_T0SZ(48);
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 82e88f4521737c0e..c7129b21bfd5191f 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -422,9 +422,9 @@ SYM_FUNC_START(__cpu_setup)
> mair .req x17
> tcr .req x16
> mov_q mair, MAIR_EL1_SET
> - mov_q tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
> - TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
> - TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
> + mov_q tcr, TCR_T0SZ(48) | TCR_T1SZ(VA_BITS) | TCR_CACHE_FLAGS | \
> + TCR_SMP_FLAGS | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
> + TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
You're hardcoding 48 in 4 places in this patch (kernel-pgtable.h,
kvm/mmu.c, mm/mmu.c and proc.S). I wonder if an IDMAP_VA_BITS macro
might help here?
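Something like this, say (IDMAP_VA_BITS being a name I just made up):

  #define IDMAP_VA_BITS		48
  #define IDMAP_LEVELS		ARM64_HW_PGTABLE_LEVELS(IDMAP_VA_BITS)
  #define IDMAP_ROOT_LEVEL	(4 - IDMAP_LEVELS)

with TCR_T0SZ(IDMAP_VA_BITS) used here and in paging_init(), and
idmap_bits = IDMAP_VA_BITS in kvm_mmu_init(), so the sites that have to
agree on the value can't drift apart.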
>
> tcr_clear_errata_bits tcr, x9, x5
>
> @@ -432,10 +432,7 @@ SYM_FUNC_START(__cpu_setup)
> sub x9, xzr, x0
> add x9, x9, #64
> tcr_set_t1sz tcr, x9
> -#else
> - idmap_get_t0sz x9
> #endif
> - tcr_set_t0sz tcr, x9
>
> /*
> * Set the IPS bits in TCR_EL1.