[PATCHv4,10/14] x86/tdx: Convert shared memory back to private on kexec

Message ID 20231205004510.27164-11-kirill.shutemov@linux.intel.com
State New
Headers
Series x86/tdx: Add kexec support |

Commit Message

Kirill A. Shutemov Dec. 5, 2023, 12:45 a.m. UTC
  TDX guests allocate shared buffers to perform I/O. It is done by
allocating pages normally from the buddy allocator and converting them
to shared with set_memory_decrypted().

The second kernel has no idea what memory is converted this way. It only
sees E820_TYPE_RAM.

Accessing shared memory via private mapping is fatal. It leads to
unrecoverable TD exit.

On kexec walk direct mapping and convert all shared memory back to
private. It makes all RAM private again and second kernel may use it
normally.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 arch/x86/coco/tdx/kexec.c       |   0
 arch/x86/coco/tdx/tdx.c         | 120 +++++++++++++++++++++++++++++++-
 arch/x86/include/asm/x86_init.h |   1 +
 arch/x86/kernel/crash.c         |   4 ++
 arch/x86/kernel/reboot.c        |  10 +++
 5 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/coco/tdx/kexec.c
  

Comments

Edgecombe, Rick P Dec. 6, 2023, 1:28 a.m. UTC | #1
On Tue, 2023-12-05 at 03:45 +0300, Kirill A. Shutemov wrote: 
> +static void tdx_kexec_unshare_mem(bool crash)
> +{
> +       unsigned long addr, end;
> +       long found = 0, shared;
> +
> +       /* Stop new private<->shared conversions */
> +       conversion_allowed = false;

I wonder if this might need a compiler barrier here to be totally safe.
I'm not sure.

> +
> +       /*
> +        * Crash kernel reaches here with interrupts disabled: can't
> wait for
> +        * conversions to finish.
> +        *
> +        * If race happened, just report and proceed.
> +        */
> +       if (!crash) {
> +               unsigned long timeout;
> +
> +               /*
> +                * Wait for in-flight conversions to complete.
> +                *
> +                * Do not wait more than 30 seconds.
> +                */
> +               timeout = 30 * USEC_PER_SEC;
> +               while (atomic_read(&conversions_in_progress) &&
> timeout--)
> +                       udelay(1);
> +       }
> +
> +       if (atomic_read(&conversions_in_progress))
> +               pr_warn("Failed to finish shared<->private
> conversions\n");

I can't think of any non-ridiculous way to handle this case. Maybe we
need VMM help.

> 
> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
> index 830425e6d38e..c81afffaa954 100644
> --- a/arch/x86/kernel/reboot.c
> +++ b/arch/x86/kernel/reboot.c
> @@ -12,6 +12,7 @@
>  #include <linux/delay.h>
>  #include <linux/objtool.h>
>  #include <linux/pgtable.h>
> +#include <linux/kexec.h>
>  #include <acpi/reboot.h>
>  #include <asm/io.h>
>  #include <asm/apic.h>
> @@ -31,6 +32,7 @@
>  #include <asm/realmode.h>
>  #include <asm/x86_init.h>
>  #include <asm/efi.h>
> +#include <asm/tdx.h>
>  
>  /*
>   * Power off function, if any
> @@ -716,6 +718,14 @@ static void
> native_machine_emergency_restart(void)
>  
>  void native_machine_shutdown(void)
>  {
> +       /*
> +        * Call enc_kexec_unshare_mem() while all CPUs are still
> active and
> +        * interrupts are enabled. This will allow all in-flight
> memory
> +        * conversions to finish cleanly before unsharing all memory.
> +        */
> +       if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
> kexec_in_progress)
> +               x86_platform.guest.enc_kexec_unshare_mem(false);

These questions are coming from an incomplete understanding of the
kexec/reboot operation. Please disregard if it is not helpful.

By doing this while other tasks can still run, it handles the
conversion races in the !crash case. But then it sets shared pages to
NP. What happens if another active task tries to write to one?

I guess we rely on the kernel_restart_prepare()->device_shutdown() to
clean up, which runs before native_machine_shutdown(). So there might
be conversions in progress when tdx_kexec_unshare_mem() is called, from
the allocator work queues. But the actual memory won't be accessed
during that operation.

But the console must be active? Or otherwise who can see these
warnings. It doesn't use a shared page? Or the KVM clock, which looks
to clean up at cpu tear down, which now happens after
tdx_kexec_unshare_mem()? So I wonder if there might be cases.

If so, maybe you could halt the conversions in
native_machine_shutdown(), then do the actual reset to private after
tasks can't schedule. I'd still wonder about if anything might try to
access a shared page triggered by the console output.


> +
>         /* Stop the cpus and apics */
>  #ifdef CONFIG_X86_IO_APIC
>         /*
  
Kirill A. Shutemov Dec. 6, 2023, 3:07 p.m. UTC | #2
On Wed, Dec 06, 2023 at 01:28:08AM +0000, Edgecombe, Rick P wrote:
> On Tue, 2023-12-05 at 03:45 +0300, Kirill A. Shutemov wrote: 
> > +static void tdx_kexec_unshare_mem(bool crash)
> > +{
> > +       unsigned long addr, end;
> > +       long found = 0, shared;
> > +
> > +       /* Stop new private<->shared conversions */
> > +       conversion_allowed = false;
> 
> I wonder if this might need a compiler barrier here to be totally safe.
> I'm not sure.

Yeah, it should be cleaner with a barrier.

> > +
> > +       /*
> > +        * Crash kernel reaches here with interrupts disabled: can't
> > wait for
> > +        * conversions to finish.
> > +        *
> > +        * If race happened, just report and proceed.
> > +        */
> > +       if (!crash) {
> > +               unsigned long timeout;
> > +
> > +               /*
> > +                * Wait for in-flight conversions to complete.
> > +                *
> > +                * Do not wait more than 30 seconds.
> > +                */
> > +               timeout = 30 * USEC_PER_SEC;
> > +               while (atomic_read(&conversions_in_progress) &&
> > timeout--)
> > +                       udelay(1);
> > +       }
> > +
> > +       if (atomic_read(&conversions_in_progress))
> > +               pr_warn("Failed to finish shared<->private
> > conversions\n");
> 
> I can't think of any non-ridiculous way to handle this case. Maybe we
> need VMM help.

Do you see a specific way how VMM can help here?

> > diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
> > index 830425e6d38e..c81afffaa954 100644
> > --- a/arch/x86/kernel/reboot.c
> > +++ b/arch/x86/kernel/reboot.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/delay.h>
> >  #include <linux/objtool.h>
> >  #include <linux/pgtable.h>
> > +#include <linux/kexec.h>
> >  #include <acpi/reboot.h>
> >  #include <asm/io.h>
> >  #include <asm/apic.h>
> > @@ -31,6 +32,7 @@
> >  #include <asm/realmode.h>
> >  #include <asm/x86_init.h>
> >  #include <asm/efi.h>
> > +#include <asm/tdx.h>
> >  
> >  /*
> >   * Power off function, if any
> > @@ -716,6 +718,14 @@ static void
> > native_machine_emergency_restart(void)
> >  
> >  void native_machine_shutdown(void)
> >  {
> > +       /*
> > +        * Call enc_kexec_unshare_mem() while all CPUs are still
> > active and
> > +        * interrupts are enabled. This will allow all in-flight
> > memory
> > +        * conversions to finish cleanly before unsharing all memory.
> > +        */
> > +       if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
> > kexec_in_progress)
> > +               x86_platform.guest.enc_kexec_unshare_mem(false);
> 
> These questions are coming from an incomplete understanding of the
> kexec/reboot operation. Please disregard if it is not helpful.
> 
> By doing this while other tasks can still run, it handles the
> conversion races in the !crash case. But then it sets shared pages to
> NP. What happens if another active task tries to write to one?
> 
> I guess we rely on the kernel_restart_prepare()->device_shutdown() to
> clean up, which runs before native_machine_shutdown(). So there might
> be conversions in progress when tdx_kexec_unshare_mem() is called, from
> the allocator work queues. But the actual memory won't be accessed
> during that operation.

Right, devices has to be shutdown by then.

> But the console must be active? Or otherwise who can see these
> warnings. It doesn't use a shared page? Or the KVM clock, which looks
> to clean up at cpu tear down, which now happens after
> tdx_kexec_unshare_mem()? So I wonder if there might be cases.

Virtio console is not functional by then, but serial is. Serial uses port
I/O and doesn't need shared memory.

> If so, maybe you could halt the conversions in
> native_machine_shutdown(), then do the actual reset to private after
> tasks can't schedule.

It would also mean that we cannot use set_memory_np() there as it requires
sleepable context. I would rather keep conversion in
native_machine_shutdown() path.

> I'd still wonder about if anything might try to
> access a shared page triggered by the console output.

set_memory_np() would make it obvious if it ever happens.
  
Edgecombe, Rick P Dec. 6, 2023, 6:32 p.m. UTC | #3
On Wed, 2023-12-06 at 18:07 +0300, kirill.shutemov@linux.intel.com
wrote:
>  I can't think of any non-ridiculous way to handle this case. Maybe
> we
> > need VMM help.
> 
> Do you see a specific way how VMM can help here?

I didn't have a specific idea. I was just thinking that the problem is
that guest doesn't know the exact private/shared state of the GFNs
because of the potentially interrupted conversion processes. But the
VMM does have this information. 

What about something like: The VMM could expose something like MapGPA
that searches for a shared GPA and return it. So you ask it to convert
the next shared GPA it can find to private and it searches (in the
host) the xarray stuff to find a GPA that is shared. Then in the guest,
it has a shared GPA and check the direct map PTE to reset, and accept.

The guest could call the new MapGPA-like hypercall in a loop until all
GPAs are reset.

> > I'd still wonder about if anything might try to
> > access a shared page triggered by the console output.
> 
> set_memory_np() would make it obvious if it ever happens.

I think this is a worthwhile improvement over the existing complete
lack of support, but it's not race free. With the barrier comments, and
given the lack of good alternatives:

Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
  

Patch

diff --git a/arch/x86/coco/tdx/kexec.c b/arch/x86/coco/tdx/kexec.c
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index fcc159497554..46355ea9f4cf 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -6,14 +6,17 @@ 
 
 #include <linux/cpufeature.h>
 #include <linux/debugfs.h>
+#include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/io.h>
+#include <linux/kexec.h>
 #include <asm/coco.h>
 #include <asm/tdx.h>
 #include <asm/vmx.h>
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
 #include <asm/pgtable.h>
+#include <asm/set_memory.h>
 
 /* MMIO direction */
 #define EPT_READ	0
@@ -40,6 +43,9 @@ 
 
 static atomic_long_t nr_shared;
 
+static atomic_t conversions_in_progress;
+static bool conversion_allowed = true;
+
 static inline bool pte_decrypted(pte_t pte)
 {
 	return cc_mkdec(pte_val(pte)) == pte_val(pte);
@@ -725,6 +731,14 @@  static bool tdx_tlb_flush_required(bool private)
 
 static bool tdx_cache_flush_required(void)
 {
+	/*
+	 * Avoid issuing CLFLUSH on set_memory_decrypted() if conversions
+	 * stopped. Otherwise it can race with unshare_all_memory() and trigger
+	 * implicit conversion to shared.
+	 */
+	if (!conversion_allowed)
+		return false;
+
 	/*
 	 * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
 	 * TDX doesn't have such capability.
@@ -808,12 +822,25 @@  static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
 static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
 					 bool enc)
 {
+	atomic_inc(&conversions_in_progress);
+
+	/*
+	 * Check after bumping conversions_in_progress to serialize
+	 * against tdx_shutdown().
+	 */
+	if (!conversion_allowed) {
+		atomic_dec(&conversions_in_progress);
+		return -EBUSY;
+	}
+
 	/*
 	 * Only handle shared->private conversion here.
 	 * See the comment in tdx_early_init().
 	 */
-	if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+	if (enc && !tdx_enc_status_changed(vaddr, numpages, enc)) {
+		atomic_dec(&conversions_in_progress);
 		return -EIO;
+	}
 
 	return 0;
 }
@@ -825,17 +852,104 @@  static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
 	 * Only handle private->shared conversion here.
 	 * See the comment in tdx_early_init().
 	 */
-	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+	if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc)) {
+		atomic_dec(&conversions_in_progress);
 		return -EIO;
+	}
 
 	if (enc)
 		atomic_long_sub(numpages, &nr_shared);
 	else
 		atomic_long_add(numpages, &nr_shared);
 
+	atomic_dec(&conversions_in_progress);
+
 	return 0;
 }
 
+static void tdx_kexec_unshare_mem(bool crash)
+{
+	unsigned long addr, end;
+	long found = 0, shared;
+
+	/* Stop new private<->shared conversions */
+	conversion_allowed = false;
+
+	/*
+	 * Crash kernel reaches here with interrupts disabled: can't wait for
+	 * conversions to finish.
+	 *
+	 * If race happened, just report and proceed.
+	 */
+	if (!crash) {
+		unsigned long timeout;
+
+		/*
+		 * Wait for in-flight conversions to complete.
+		 *
+		 * Do not wait more than 30 seconds.
+		 */
+		timeout = 30 * USEC_PER_SEC;
+		while (atomic_read(&conversions_in_progress) && timeout--)
+			udelay(1);
+	}
+
+	if (atomic_read(&conversions_in_progress))
+		pr_warn("Failed to finish shared<->private conversions\n");
+
+	/*
+	 * Walk direct mapping and convert all shared memory back to private,
+	 */
+
+	addr = PAGE_OFFSET;
+	end  = PAGE_OFFSET + get_max_mapped();
+
+	while (addr < end) {
+		unsigned long size;
+		unsigned int level;
+		pte_t *pte;
+
+		pte = lookup_address(addr, &level);
+		size = page_level_size(level);
+
+		if (pte && pte_decrypted(*pte)) {
+			int pages = size / PAGE_SIZE;
+
+			/*
+			 * Touching memory with shared bit set triggers implicit
+			 * conversion to shared.
+			 *
+			 * Make sure nobody touches the shared range from
+			 * now on.
+			 *
+			 * Bypass unmapping for crash scenario. Unmapping
+			 * requires sleepable context, but in crash case kernel
+			 * hits the code path with interrupts disabled.
+			 * It shouldn't be a problem as all secondary CPUs are
+			 * down and kernel runs with interrupts disabled, so
+			 * there is no room for race.
+			 */
+			if (!crash)
+				set_memory_np(addr, pages);
+
+			if (!tdx_enc_status_changed(addr, pages, true)) {
+				pr_err("Failed to unshare range %#lx-%#lx\n",
+				       addr, addr + size);
+			}
+
+			found += pages;
+		}
+
+		addr += size;
+	}
+
+	shared = atomic_long_read(&nr_shared);
+	if (shared != found) {
+		pr_err("shared page accounting is off\n");
+		pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+	}
+}
+
 void __init tdx_early_init(void)
 {
 	struct tdx_module_args args = {
@@ -895,6 +1009,8 @@  void __init tdx_early_init(void)
 	x86_platform.guest.enc_cache_flush_required  = tdx_cache_flush_required;
 	x86_platform.guest.enc_tlb_flush_required    = tdx_tlb_flush_required;
 
+	x86_platform.guest.enc_kexec_unshare_mem     = tdx_kexec_unshare_mem;
+
 	/*
 	 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
 	 * bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c9503fe2d13a..917358821a31 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -154,6 +154,7 @@  struct x86_guest {
 	int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
 	bool (*enc_tlb_flush_required)(bool enc);
 	bool (*enc_cache_flush_required)(void);
+	void (*enc_kexec_unshare_mem)(bool crash);
 };
 
 /**
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c92d88680dbf..1618224775f5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -40,6 +40,7 @@ 
 #include <asm/intel_pt.h>
 #include <asm/crash.h>
 #include <asm/cmdline.h>
+#include <asm/tdx.h>
 
 /* Used while preparing memory map entries for second kernel */
 struct crash_memmap_data {
@@ -107,6 +108,9 @@  void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	crash_smp_send_stop();
 
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+		x86_platform.guest.enc_kexec_unshare_mem(true);
+
 	cpu_emergency_disable_virtualization();
 
 	/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 830425e6d38e..c81afffaa954 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@ 
 #include <linux/delay.h>
 #include <linux/objtool.h>
 #include <linux/pgtable.h>
+#include <linux/kexec.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -31,6 +32,7 @@ 
 #include <asm/realmode.h>
 #include <asm/x86_init.h>
 #include <asm/efi.h>
+#include <asm/tdx.h>
 
 /*
  * Power off function, if any
@@ -716,6 +718,14 @@  static void native_machine_emergency_restart(void)
 
 void native_machine_shutdown(void)
 {
+	/*
+	 * Call enc_kexec_unshare_mem() while all CPUs are still active and
+	 * interrupts are enabled. This will allow all in-flight memory
+	 * conversions to finish cleanly before unsharing all memory.
+	 */
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kexec_in_progress)
+		x86_platform.guest.enc_kexec_unshare_mem(false);
+
 	/* Stop the cpus and apics */
 #ifdef CONFIG_X86_IO_APIC
 	/*