[v4,02/11] x86/startup_64: Replace pointer fixups with RIP-relative references

Message ID 20240213124143.1484862-15-ardb+git@google.com
State New
Headers
Series x86: Confine early 1:1 mapped startup code |

Commit Message

Ard Biesheuvel Feb. 13, 2024, 12:41 p.m. UTC
  From: Ard Biesheuvel <ardb@kernel.org>

The code in __startup64() runs from a 1:1 mapping of memory, and uses
explicit pointer arithmetic to convert symbol references generated by
the compiler into references that work correctly via this 1:1 mapping.

This relies on the compiler generating absolute symbol references, which
will be resolved by the linker using the kernel virtual mapping.
However, the compiler may just as well emit RIP-relative references,
even when not operating in PIC mode, and so this explicit pointer
arithmetic is fragile and should be avoided. The fixup routines also
take a 'physical base' argument which needs to be passed around as
well.

Convert these pointer fixups to RIP-relative references, which are
guaranteed to produce the correct values without any explicit
arithmetic, removing the need to pass around the physical load address.
It also makes the code substantially easier to understand.

Replace bare 510/511 constants with the appropriate symbolic constants
while at it.  Note that pgd_index(__START_KERNEL_map) always produces
the value 511, regardless of the number of paging levels used, so a
symbolic constant is used here as well.

The remaining fixup_int()/fixup_long() calls related to 5-level paging
will be removed in a subsequent patch.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/kernel/head64.c | 57 ++++++++------------
 1 file changed, 21 insertions(+), 36 deletions(-)
  

Comments

Borislav Petkov Feb. 17, 2024, 12:51 p.m. UTC | #1
On Tue, Feb 13, 2024 at 01:41:46PM +0100, Ard Biesheuvel wrote:
> @@ -201,25 +201,19 @@ unsigned long __head __startup_64(unsigned long physaddr,
>  	load_delta += sme_get_me_mask();
>  
>  	/* Fixup the physical addresses in the page table */
> -
> -	pgd = fixup_pointer(early_top_pgt, physaddr);
> -	p = pgd + pgd_index(__START_KERNEL_map);
> -	if (la57)
> -		*p = (unsigned long)level4_kernel_pgt;
> -	else
> -		*p = (unsigned long)level3_kernel_pgt;
> -	*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
> -
>  	if (la57) {
> -		p4d = fixup_pointer(level4_kernel_pgt, physaddr);
> -		p4d[511] += load_delta;
> +		p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
> +		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
>  	}
>  
> -	pud = fixup_pointer(level3_kernel_pgt, physaddr);
> -	pud[510] += load_delta;
> -	pud[511] += load_delta;
> +	pud = &RIP_REL_REF(level3_kernel_pgt)->pud;
> +	pud[PTRS_PER_PUD - 2] += load_delta;
> +	pud[PTRS_PER_PUD - 1] += load_delta;
> +
> +	pgd = &RIP_REL_REF(early_top_pgt)->pgd;

Let's do the pgd assignment above, where it was so that we have that
natural order of p4d -> pgd -> pud ->pmd etc manipulations.

> +	pgd[PTRS_PER_PGD - 1] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;

I see what you mean with pgd_index(__START_KERNEL_map) always being 511
but this:

	pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;

says exactly what gets mapped there in the pagetable while

	PTRS_PER_PGD - 1

makes me wonder what's that last pud supposed to map.

Other than that, my gut feeling right now is, this would need extensive
testing so that we make sure there's no fallout from it.

Thx.
  
Ard Biesheuvel Feb. 17, 2024, 1:58 p.m. UTC | #2
On Sat, 17 Feb 2024 at 13:51, Borislav Petkov <bp@alien8.de> wrote:
>
> On Tue, Feb 13, 2024 at 01:41:46PM +0100, Ard Biesheuvel wrote:
> > @@ -201,25 +201,19 @@ unsigned long __head __startup_64(unsigned long physaddr,
> >       load_delta += sme_get_me_mask();
> >
> >       /* Fixup the physical addresses in the page table */
> > -
> > -     pgd = fixup_pointer(early_top_pgt, physaddr);
> > -     p = pgd + pgd_index(__START_KERNEL_map);
> > -     if (la57)
> > -             *p = (unsigned long)level4_kernel_pgt;
> > -     else
> > -             *p = (unsigned long)level3_kernel_pgt;
> > -     *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
> > -
> >       if (la57) {
> > -             p4d = fixup_pointer(level4_kernel_pgt, physaddr);
> > -             p4d[511] += load_delta;
> > +             p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
> > +             p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
> >       }
> >
> > -     pud = fixup_pointer(level3_kernel_pgt, physaddr);
> > -     pud[510] += load_delta;
> > -     pud[511] += load_delta;
> > +     pud = &RIP_REL_REF(level3_kernel_pgt)->pud;
> > +     pud[PTRS_PER_PUD - 2] += load_delta;
> > +     pud[PTRS_PER_PUD - 1] += load_delta;
> > +
> > +     pgd = &RIP_REL_REF(early_top_pgt)->pgd;
>
> Let's do the pgd assignment above, where it was so that we have that
> natural order of p4d -> pgd -> pud ->pmd etc manipulations.
>

pud and p4d need to be assigned first, unless we want to keep taking
the addresses of level4_kernel_pgt and level3_kernel_pgt twice as
before.

> > +     pgd[PTRS_PER_PGD - 1] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;
>
> I see what you mean with pgd_index(__START_KERNEL_map) always being 511
> but this:
>
>         pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;
>
> says exactly what gets mapped there in the pagetable while
>
>         PTRS_PER_PGD - 1
>
> makes me wonder what's that last pud supposed to map.
>

Fair enough. But the same applies to p4d[] and pud[].

> Other than that, my gut feeling right now is, this would need extensive
> testing so that we make sure there's no fallout from it.
>

More testing is always good, but I am not particularly nervous about
these changes.

I could split this up into 3+ patches so we could bisect any resulting
issues more effectively.
  
Ard Biesheuvel Feb. 17, 2024, 4:10 p.m. UTC | #3
On Sat, 17 Feb 2024 at 14:58, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Sat, 17 Feb 2024 at 13:51, Borislav Petkov <bp@alien8.de> wrote:
> >
> > On Tue, Feb 13, 2024 at 01:41:46PM +0100, Ard Biesheuvel wrote:
> > > @@ -201,25 +201,19 @@ unsigned long __head __startup_64(unsigned long physaddr,
> > >       load_delta += sme_get_me_mask();
> > >
> > >       /* Fixup the physical addresses in the page table */
> > > -
> > > -     pgd = fixup_pointer(early_top_pgt, physaddr);
> > > -     p = pgd + pgd_index(__START_KERNEL_map);
> > > -     if (la57)
> > > -             *p = (unsigned long)level4_kernel_pgt;
> > > -     else
> > > -             *p = (unsigned long)level3_kernel_pgt;
> > > -     *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
> > > -
> > >       if (la57) {
> > > -             p4d = fixup_pointer(level4_kernel_pgt, physaddr);
> > > -             p4d[511] += load_delta;
> > > +             p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
> > > +             p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
> > >       }
> > >
> > > -     pud = fixup_pointer(level3_kernel_pgt, physaddr);
> > > -     pud[510] += load_delta;
> > > -     pud[511] += load_delta;
> > > +     pud = &RIP_REL_REF(level3_kernel_pgt)->pud;
> > > +     pud[PTRS_PER_PUD - 2] += load_delta;
> > > +     pud[PTRS_PER_PUD - 1] += load_delta;
> > > +
> > > +     pgd = &RIP_REL_REF(early_top_pgt)->pgd;
> >
> > Let's do the pgd assignment above, where it was so that we have that
> > natural order of p4d -> pgd -> pud ->pmd etc manipulations.
> >
>
> pud and p4d need to be assigned first, unless we want to keep taking
> the addresses of level4_kernel_pgt and level3_kernel_pgt twice as
> before.
>
> > > +     pgd[PTRS_PER_PGD - 1] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;
> >
> > I see what you mean with pgd_index(__START_KERNEL_map) always being 511
> > but this:
> >
> >         pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;
> >
> > says exactly what gets mapped there in the pagetable while
> >
> >         PTRS_PER_PGD - 1
> >
> > makes me wonder what's that last pud supposed to map.
> >
>
> Fair enough. But the same applies to p4d[] and pud[].
>
> > Other than that, my gut feeling right now is, this would need extensive
> > testing so that we make sure there's no fallout from it.
> >
>
> More testing is always good, but I am not particularly nervous about
> these changes.
>
> I could split this up into 3+ patches so we could bisect any resulting
> issues more effectively.

Maybe this is better?

--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -165,14 +165,14 @@
  * doesn't have to generate PC-relative relocations when accessing globals from
  * that function. Clang actually does not generate them, which leads to
  * boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
+ * be accessed using RIP_REL_REF().
  */
 unsigned long __head __startup_64(unsigned long physaddr,
                                  struct boot_params *bp)
 {
        pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
-       unsigned long load_delta, *p;
        unsigned long pgtable_flags;
+       unsigned long load_delta;
        pgdval_t *pgd;
        p4dval_t *p4d;
        pudval_t *pud;
@@ -202,17 +202,14 @@ unsigned long __head __startup_64(unsigned long physaddr,

        /* Fixup the physical addresses in the page table */

-       pgd = fixup_pointer(early_top_pgt, physaddr);
-       p = pgd + pgd_index(__START_KERNEL_map);
-       if (la57)
-               *p = (unsigned long)level4_kernel_pgt;
-       else
-               *p = (unsigned long)level3_kernel_pgt;
-       *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+       pgd = &RIP_REL_REF(early_top_pgt)->pgd;
+       pgd[pgd_index(__START_KERNEL_map)] += load_delta;

        if (la57) {
-               p4d = fixup_pointer(level4_kernel_pgt, physaddr);
-               p4d[511] += load_delta;
+               p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
+               p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+               pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d |
_PAGE_TABLE_NOENC;
        }

        RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3cac98c61066..fb2a98c29094 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -653,7 +653,8 @@ SYM_CODE_END(vc_no_ghcb)
        .balign 4

 SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
-       .fill   512,8,0
+       .fill   511,8,0
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        .fill   PTI_USER_PGD_FILL,8,0
 SYM_DATA_END(early_top_pgt)
  
Borislav Petkov Feb. 19, 2024, 9:55 a.m. UTC | #4
On Sat, Feb 17, 2024 at 05:10:27PM +0100, Ard Biesheuvel wrote:
> Maybe this is better?

Yap, looks better.

Btw, when you paste those diffs ontop, can pls make sure you paste them
in applicable form so that I can apply them and look at them in detail?

gmail mangles them:

> +
> +               pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d |
> _PAGE_TABLE_NOENC;

For example, linebreak here.

Thx.
  
Borislav Petkov Feb. 19, 2024, 10:01 a.m. UTC | #5
On Sat, Feb 17, 2024 at 02:58:29PM +0100, Ard Biesheuvel wrote:
> More testing is always good, but I am not particularly nervous about
> these changes.

Perhaps but there's a big difference between testing everything as much
as one can and *then* queueing it - vs testing a bit, not being really
nervous about the changes and then someone reporting a snafu when the
patches are already in Linus' tree.

Means dropping everything and getting on that. And then imagine a couple
more breakages happening in parallel and needing urgent attention.

Not something you wanna deal with. Speaking from my experience, at
least.

> I could split this up into 3+ patches so we could bisect any resulting
> issues more effectively.

Yeah, splitting changes into separate bits - ala, one logical change per
patch - is always a good idea.

In this particular case, I don't mind splitting them even more so that
it is perfectly clear what happens and looking at those changes doesn't
make people have to go look at the source to figure out what the change
actually looks like applied, in order to fully grok it.

Thx.
  
Ard Biesheuvel Feb. 19, 2024, 10:45 a.m. UTC | #6
On Mon, 19 Feb 2024 at 10:56, Borislav Petkov <bp@alien8.de> wrote:
>
> On Sat, Feb 17, 2024 at 05:10:27PM +0100, Ard Biesheuvel wrote:
> > Maybe this is better?
>
> Yap, looks better.
>
> Btw, when you paste those diffs ontop, can pls make sure you paste them
> in applicable form so that I can apply them and look at them in detail?
>
> gmail mangles them:
>
> > +
> > +               pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d |
> > _PAGE_TABLE_NOENC;
>
> For example, linebreak here.
>

Yeah, sorry about that.
  
Ard Biesheuvel Feb. 19, 2024, 10:47 a.m. UTC | #7
On Mon, 19 Feb 2024 at 11:01, Borislav Petkov <bp@alien8.de> wrote:
>
> On Sat, Feb 17, 2024 at 02:58:29PM +0100, Ard Biesheuvel wrote:
> > More testing is always good, but I am not particularly nervous about
> > these changes.
>
> Perhaps but there's a big difference between testing everything as much
> as one can and *then* queueing it - vs testing a bit, not being really
> nervous about the changes and then someone reporting a snafu when the
> patches are already in Linus' tree.
>
> Means dropping everything and getting on that. And then imagine a couple
> more breakages happening in parallel and needing urgent attention.
>
> Not something you wanna deal with. Speaking from my experience, at
> least.
>

Not disagreeing with that.

> > I could split this up into 3+ patches so we could bisect any resulting
> > issues more effectively.
>
> Yeah, splitting changes into separate bits - ala, one logical change per
> patch - is always a good idea.
>
> In this particular case, I don't mind splitting them even more so that
> it is perfectly clear what happens and looking at those changes doesn't
> make people have to go look at the source to figure out what the change
> actually looks like applied, in order to fully grok it.
>

I split this into 5 patches for v5. The final patch in this v4 is
broken for CONFIG_X86_5LEVEL=n so I was going to have to respin
anyway. (I'll pick up the latest version of patch #1 you pasted)
  

Patch

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9d7f12829f2d..4b08e321d168 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -77,6 +77,7 @@  static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
 	[GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
 };
 
+#ifdef CONFIG_X86_5LEVEL
 static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 {
 	return ptr - (void *)_text + (void *)physaddr;
@@ -87,7 +88,6 @@  static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
 	return fixup_pointer(ptr, physaddr);
 }
 
-#ifdef CONFIG_X86_5LEVEL
 static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
 {
 	return fixup_pointer(ptr, physaddr);
@@ -164,22 +164,21 @@  static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
 /* Code in __startup_64() can be relocated during execution, but the compiler
  * doesn't have to generate PC-relative relocations when accessing globals from
  * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
+ * boot-time crashes. To work around this problem, every global variable must
+ * be accessed using RIP_REL_REF().
  */
 unsigned long __head __startup_64(unsigned long physaddr,
 				  struct boot_params *bp)
 {
-	unsigned long load_delta, *p;
+	pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
 	unsigned long pgtable_flags;
+	unsigned long load_delta;
 	pgdval_t *pgd;
 	p4dval_t *p4d;
 	pudval_t *pud;
 	pmdval_t *pmd, pmd_entry;
-	pteval_t *mask_ptr;
 	bool la57;
 	int i;
-	unsigned int *next_pgt_ptr;
 
 	la57 = check_la57_support(physaddr);
 
@@ -192,6 +191,7 @@  unsigned long __head __startup_64(unsigned long physaddr,
 	 * and the address I am actually running at.
 	 */
 	load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+	RIP_REL_REF(phys_base) = load_delta;
 
 	/* Is the address not 2M aligned? */
 	if (load_delta & ~PMD_MASK)
@@ -201,25 +201,19 @@  unsigned long __head __startup_64(unsigned long physaddr,
 	load_delta += sme_get_me_mask();
 
 	/* Fixup the physical addresses in the page table */
-
-	pgd = fixup_pointer(early_top_pgt, physaddr);
-	p = pgd + pgd_index(__START_KERNEL_map);
-	if (la57)
-		*p = (unsigned long)level4_kernel_pgt;
-	else
-		*p = (unsigned long)level3_kernel_pgt;
-	*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
-
 	if (la57) {
-		p4d = fixup_pointer(level4_kernel_pgt, physaddr);
-		p4d[511] += load_delta;
+		p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
+		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
 	}
 
-	pud = fixup_pointer(level3_kernel_pgt, physaddr);
-	pud[510] += load_delta;
-	pud[511] += load_delta;
+	pud = &RIP_REL_REF(level3_kernel_pgt)->pud;
+	pud[PTRS_PER_PUD - 2] += load_delta;
+	pud[PTRS_PER_PUD - 1] += load_delta;
+
+	pgd = &RIP_REL_REF(early_top_pgt)->pgd;
+	pgd[PTRS_PER_PGD - 1] = (pgdval_t)(la57 ? p4d : pud) | _PAGE_TABLE_NOENC;
 
-	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+	pmd = &RIP_REL_REF(level2_fixmap_pgt)->pmd;
 	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
 		pmd[i] += load_delta;
 
@@ -230,16 +224,14 @@  unsigned long __head __startup_64(unsigned long physaddr,
 	 * it avoids problems around wraparound.
 	 */
 
-	next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
-	pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
-	pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+	pud = &early_pgts[0]->pmd;
+	pmd = &early_pgts[1]->pmd;
+	p4d = &early_pgts[2]->pmd;
+	RIP_REL_REF(next_early_pgt) = 3;
 
 	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
 
 	if (la57) {
-		p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
-				    physaddr);
-
 		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
 		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
 		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
@@ -259,8 +251,7 @@  unsigned long __head __startup_64(unsigned long physaddr,
 
 	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
 	/* Filter out unsupported __PAGE_KERNEL_* bits: */
-	mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
-	pmd_entry &= *mask_ptr;
+	pmd_entry &= RIP_REL_REF(__supported_pte_mask);
 	pmd_entry += sme_get_me_mask();
 	pmd_entry +=  physaddr;
 
@@ -286,7 +277,7 @@  unsigned long __head __startup_64(unsigned long physaddr,
 	 * error, causing the BIOS to halt the system.
 	 */
 
-	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+	pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
 
 	/* invalidate pages before the kernel image */
 	for (i = 0; i < pmd_index((unsigned long)_text); i++)
@@ -301,12 +292,6 @@  unsigned long __head __startup_64(unsigned long physaddr,
 	for (; i < PTRS_PER_PMD; i++)
 		pmd[i] &= ~_PAGE_PRESENT;
 
-	/*
-	 * Fixup phys_base - remove the memory encryption mask to obtain
-	 * the true physical address.
-	 */
-	*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
-
 	return sme_postprocess_startup(bp, pmd);
 }