new file mode 100644
@@ -6,14 +6,17 @@
#include <linux/cpufeature.h>
#include <linux/debugfs.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
+#include <asm/set_memory.h>
/* MMIO direction */
#define EPT_READ 0
@@ -40,6 +43,9 @@
static atomic_long_t nr_shared;
+static atomic_t conversions_in_progress;
+static bool conversion_allowed = true;
+
static inline bool pte_decrypted(pte_t pte)
{
return cc_mkdec(pte_val(pte)) == pte_val(pte);
@@ -704,6 +710,14 @@ static bool tdx_tlb_flush_required(bool private)
static bool tdx_cache_flush_required(void)
{
+ /*
+ * Avoid issuing CLFLUSH on set_memory_decrypted() once conversions
+ * have been stopped. Otherwise the flush can race with the unsharing
+ * done in tdx_kexec_unshare_mem() and trigger an implicit conversion
+ * to shared.
+ */
+ if (!conversion_allowed)
+ return false;
+
/*
* AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
* TDX doesn't have such capability.
@@ -787,12 +801,25 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
bool enc)
{
+ atomic_inc(&conversions_in_progress);
+
+ /*
+ * Check after bumping conversions_in_progress to serialize
+ * against tdx_kexec_unshare_mem(), which clears conversion_allowed
+ * and then (on the non-crash path) waits for conversions_in_progress
+ * to drop to zero.
+ *
+ * Once conversions have been stopped, drop the count taken above and
+ * refuse the request with -EBUSY.
+ */
+ if (!conversion_allowed) {
+ atomic_dec(&conversions_in_progress);
+ return -EBUSY;
+ }
+
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
+ *
+ * On failure the count taken above is dropped and -EIO is returned.
+ * On success (return 0) the count stays elevated; it is dropped by
+ * tdx_enc_status_change_finish().
+ *
+ * NOTE(review): this assumes every successful prepare is followed by
+ * a call to the finish hook; if a caller can bail out in between,
+ * conversions_in_progress never returns to zero -- confirm against
+ * the caller.
*/
- if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ if (enc && !tdx_enc_status_changed(vaddr, numpages, enc)) {
+ atomic_dec(&conversions_in_progress);
return -EIO;
+ }
return 0;
}
@@ -804,17 +831,104 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
- if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc)) {
+ atomic_dec(&conversions_in_progress);
return -EIO;
+ }
if (enc)
atomic_long_sub(numpages, &nr_shared);
else
atomic_long_add(numpages, &nr_shared);
+ atomic_dec(&conversions_in_progress);
+
return 0;
}
+/*
+ * Convert all shared memory found in the direct mapping back to private.
+ *
+ * @crash: true when called from native_machine_crash_shutdown(). In that
+ *	   case the function neither waits for in-flight conversions nor
+ *	   unmaps the shared ranges before converting them.
+ *
+ * New conversions are refused from here on: once conversion_allowed is
+ * cleared, tdx_enc_status_change_prepare() returns -EBUSY.
+ */
+void tdx_kexec_unshare_mem(bool crash)
+{
+	unsigned long addr, end;
+	long found = 0, shared;
+
+	/* Stop new private<->shared conversions */
+	WRITE_ONCE(conversion_allowed, false);
+
+	/*
+	 * Order the store above before the reads of conversions_in_progress
+	 * below. Without a full barrier the read may be satisfied before the
+	 * store is visible to other CPUs: a racing
+	 * tdx_enc_status_change_prepare() could then still observe
+	 * conversion_allowed == true while this CPU observes a zero count,
+	 * and the conversion would run concurrently with the walk below.
+	 *
+	 * NOTE(review): the other side relies on atomic_inc() ordering the
+	 * increment before its read of conversion_allowed -- confirm.
+	 */
+	smp_mb();
+
+	/*
+	 * Crash kernel reaches here with interrupts disabled: can't wait for
+	 * conversions to finish.
+	 *
+	 * If race happened, just report and proceed.
+	 */
+	if (!crash) {
+		unsigned long timeout;
+
+		/*
+		 * Wait for in-flight conversions to complete.
+		 *
+		 * Do not wait more than 30 seconds.
+		 */
+		timeout = 30 * USEC_PER_SEC;
+		while (atomic_read(&conversions_in_progress) && timeout--)
+			udelay(1);
+	}
+
+	if (atomic_read(&conversions_in_progress))
+		pr_warn("Failed to finish shared<->private conversions\n");
+
+	/*
+	 * Walk direct mapping and convert all shared memory back to private.
+	 */
+	addr = PAGE_OFFSET;
+	end = PAGE_OFFSET + get_max_mapped();
+
+	while (addr < end) {
+		unsigned long size;
+		unsigned int level;
+		pte_t *pte;
+
+		/*
+		 * NOTE(review): the step size is derived from level even when
+		 * no PTE is returned; this assumes lookup_address() always
+		 * reports a meaningful level -- confirm.
+		 */
+		pte = lookup_address(addr, &level);
+		size = page_level_size(level);
+
+		if (pte && pte_decrypted(*pte)) {
+			int pages = size / PAGE_SIZE;
+
+			/*
+			 * Touching memory with shared bit set triggers implicit
+			 * conversion to shared.
+			 *
+			 * Make sure nobody touches the shared range from
+			 * now on.
+			 *
+			 * Bypass unmapping for crash scenario. Unmapping
+			 * requires sleepable context, but in crash case kernel
+			 * hits the code path with interrupts disabled.
+			 * It shouldn't be a problem as all secondary CPUs are
+			 * down and kernel runs with interrupts disabled, so
+			 * there is no room for race.
+			 */
+			if (!crash)
+				set_memory_np(addr, pages);
+
+			if (!tdx_enc_status_changed(addr, pages, true)) {
+				pr_err("Failed to unshare range %#lx-%#lx\n",
+				       addr, addr + size);
+			}
+
+			/* Counted even if the conversion above failed. */
+			found += pages;
+		}
+
+		addr += size;
+	}
+
+	/* Cross-check the walk against the running nr_shared counter. */
+	shared = atomic_long_read(&nr_shared);
+	if (shared != found) {
+		pr_err("shared page accounting is off\n");
+		pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+	}
+}
+
void __init tdx_early_init(void)
{
struct tdx_module_args args = {
@@ -874,6 +988,8 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_kexec_unshare_mem = tdx_kexec_unshare_mem;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
@@ -154,6 +154,7 @@ struct x86_guest {
int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_unshare_mem)(bool crash);
};
/**
@@ -40,6 +40,7 @@
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
+#include <asm/tdx.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -107,6 +108,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ x86_platform.guest.enc_kexec_unshare_mem(true);
+
cpu_emergency_disable_virtualization();
/*
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -31,6 +32,7 @@
#include <asm/realmode.h>
#include <asm/x86_init.h>
#include <asm/efi.h>
+#include <asm/tdx.h>
/*
* Power off function, if any
@@ -716,6 +718,9 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kexec_in_progress)
+ x86_platform.guest.enc_kexec_unshare_mem(false);
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*