[v5,5/6] Drivers: hv: vmbus: Support TDX guests

Message ID 20230422021735.27698-6-decui@microsoft.com
State New
Headers
Series Support TDX guests on Hyper-V |

Commit Message

Dexuan Cui April 22, 2023, 2:17 a.m. UTC
  Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
  No need to use hv_vp_assist_page.
  Don't use the unsafe Hyper-V TSC page.
  Don't try to use HV_REGISTER_CRASH_CTL.
  Don't trust Hyper-V's TLB-flushing hypercalls.
  Don't use lazy EOI.
  Share SynIC Event/Message pages and VMBus Monitor pages with the host.
  Use pgprot_decrypted(PAGE_KERNEL) in hv_ringbuffer_init().

Signed-off-by: Dexuan Cui <decui@microsoft.com>
---
 arch/x86/hyperv/hv_apic.c      |  6 ++--
 arch/x86/hyperv/hv_init.c      | 19 +++++++++---
 arch/x86/kernel/cpu/mshyperv.c | 21 ++++++++++++-
 drivers/hv/hv.c                | 54 ++++++++++++++++++++++++++++++++--
 4 files changed, 90 insertions(+), 10 deletions(-)

Changes in v2:
  Used a new function hv_set_memory_enc_dec_needed() in
    __set_memory_enc_pgtable().
  Added the missing set_memory_encrypted() in hv_synic_free().

Changes in v3:
  Use pgprot_decrypted(PAGE_KERNEL) in hv_ringbuffer_init().
  (Do not use PAGE_KERNEL_NOENC, which doesn't exist for ARM64).

  Used cc_mkdec() in hv_synic_enable_regs().

  ms_hyperv_init_platform():
    Explicitly do not use HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED.
    Explicitly do not use HV_X64_APIC_ACCESS_RECOMMENDED.

  Enabled __send_ipi_mask() and __send_ipi_one() for TDX guests.

Changes in v4:
  A minor rebase to Michael's v7 DDA patchset. I'm very happy that
    I can drop my v3 change to arch/x86/mm/pat/set_memory.c due to
    Michael's work.

Changes in v5:
  Added memset() to clear synic_message_page and synic_event_page
after set_memory_decrypted().
  Rebased the patch since "post_msg_page" has been removed in
hyperv-next.
  Improved the error handling in hv_synic_alloc()/free() [Michael
Kelley]
  

Comments

Michael Kelley (LINUX) May 1, 2023, 5:32 p.m. UTC | #1
From: Dexuan Cui <decui@microsoft.com> Sent: Friday, April 21, 2023 7:18 PM
> 
> Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
>   No need to use hv_vp_assist_page.
>   Don't use the unsafe Hyper-V TSC page.
>   Don't try to use HV_REGISTER_CRASH_CTL.
>   Don't trust Hyper-V's TLB-flushing hypercalls.
>   Don't use lazy EOI.
>   Share SynIC Event/Message pages and VMBus Monitor pages with the host.

This patch no longer does anything with the VMBus monitor pages.

>   Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().

The above line in the commit message is stale and can be dropped.

> 
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  arch/x86/hyperv/hv_apic.c      |  6 ++--
>  arch/x86/hyperv/hv_init.c      | 19 +++++++++---
>  arch/x86/kernel/cpu/mshyperv.c | 21 ++++++++++++-
>  drivers/hv/hv.c                | 54 ++++++++++++++++++++++++++++++++--
>  4 files changed, 90 insertions(+), 10 deletions(-)
> 
> Changes in v2:
>   Used a new function hv_set_memory_enc_dec_needed() in
>     __set_memory_enc_pgtable().
>   Added the missing set_memory_encrypted() in hv_synic_free().
> 
> Changes in v3:
>   Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().
>   (Do not use PAGE_KERNEL_NOENC, which doesn't exist for ARM64).
> 
>   Used cc_mkdec() in hv_synic_enable_regs().
> 
>   ms_hyperv_init_platform():
>     Explicitly do not use HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED.
>     Explicitly do not use HV_X64_APIC_ACCESS_RECOMMENDED.
> 
>   Enabled __send_ipi_mask() and __send_ipi_one() for TDX guests.
> 
> Changes in v4:
>   A minor rebase to Michael's v7 DDA patchset. I'm very happy that
>     I can drop my v3 change to arch/x86/mm/pat/set_memory.c due to
>     Michael's work.
> 
> Changes in v5:
>   Added memset() to clear synic_message_page and synic_event_page()
> after set_memory_decrypted().
>   Rebased the patch since "post_msg_page" has been removed in
> hyperv-next.
>   Improved the error handling in hv_synic_alloc()/free() [Michael
> Kelley]
> 
> diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
> index fb8b2c088681..16919c7b3196 100644
> --- a/arch/x86/hyperv/hv_apic.c
> +++ b/arch/x86/hyperv/hv_apic.c
> @@ -173,7 +173,8 @@ static bool __send_ipi_mask(const struct cpumask *mask, int
> vector,
>  	    (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
>  		return true;
> 
> -	if (!hv_hypercall_pg)
> +	/* A TDX guest doesn't use hv_hypercall_pg. */
> +	if (!hv_isolation_type_tdx() && !hv_hypercall_pg)
>  		return false;
> 
>  	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
> @@ -227,7 +228,8 @@ static bool __send_ipi_one(int cpu, int vector)
> 
>  	trace_hyperv_send_ipi_one(cpu, vector);
> 
> -	if (!hv_hypercall_pg || (vp == VP_INVAL))
> +	/* A TDX guest doesn't use hv_hypercall_pg. */
> +	if ((!hv_isolation_type_tdx() && !hv_hypercall_pg) || (vp == VP_INVAL))
>  		return false;
> 
>  	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index f175e0de821c..f28357ecad7d 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -79,7 +79,7 @@ static int hyperv_init_ghcb(void)
>  static int hv_cpu_init(unsigned int cpu)
>  {
>  	union hv_vp_assist_msr_contents msr = { 0 };
> -	struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
> +	struct hv_vp_assist_page **hvp;
>  	int ret;
> 
>  	ret = hv_common_cpu_init(cpu);
> @@ -89,6 +89,7 @@ static int hv_cpu_init(unsigned int cpu)
>  	if (!hv_vp_assist_page)
>  		return 0;
> 
> +	hvp = &hv_vp_assist_page[cpu];
>  	if (hv_root_partition) {
>  		/*
>  		 * For root partition we get the hypervisor provided VP assist
> @@ -398,11 +399,21 @@ void __init hyperv_init(void)
>  	if (hv_common_init())
>  		return;
> 
> -	hv_vp_assist_page = kcalloc(num_possible_cpus(),
> -				    sizeof(*hv_vp_assist_page), GFP_KERNEL);
> +	/*
> +	 * The VP assist page is useless to a TDX guest: the only use we
> +	 * would have for it is lazy EOI, which can not be used with TDX.
> +	 */
> +	if (hv_isolation_type_tdx())
> +		hv_vp_assist_page = NULL;
> +	else
> +		hv_vp_assist_page = kcalloc(num_possible_cpus(),
> +					    sizeof(*hv_vp_assist_page),
> +					    GFP_KERNEL);
>  	if (!hv_vp_assist_page) {
>  		ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
> -		goto common_free;
> +
> +		if (!hv_isolation_type_tdx())
> +			goto common_free;
>  	}
> 
>  	if (hv_isolation_type_snp()) {
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index a87fb934cd4b..e9106c9d92f8 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -405,8 +405,27 @@ static void __init ms_hyperv_init_platform(void)
> 
>  		if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
>  			static_branch_enable(&isolation_type_snp);
> -		else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX)
> +		else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
>  			static_branch_enable(&isolation_type_tdx);
> +
> +			/*
> +			 * The GPAs of SynIC Event/Message pages and VMBus
> +			 * Monitor pages need to be added by this offset.
> +			 */
> +			ms_hyperv.shared_gpa_boundary = cc_mkdec(0);
> +
> +			/* Don't use the unsafe Hyper-V TSC page */
> +			ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
> +
> +			/* HV_REGISTER_CRASH_CTL is unsupported */
> +			ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
> +
> +			/* Don't trust Hyper-V's TLB-flushing hypercalls */
> +			ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
> +
> +			/* A TDX VM must use x2APIC and doesn't use lazy EOI */
> +			ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
> +		}
>  	}
> 
>  	if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 4e1407d59ba0..fa7dce26ec67 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -18,6 +18,7 @@
>  #include <linux/clockchips.h>
>  #include <linux/delay.h>
>  #include <linux/interrupt.h>
> +#include <linux/set_memory.h>
>  #include <clocksource/hyperv_timer.h>
>  #include <asm/mshyperv.h>
>  #include "hyperv_vmbus.h"
> @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
>  {
>  	int cpu;
>  	struct hv_per_cpu_context *hv_cpu;
> +	int ret = -ENOMEM;
> 
>  	/*
>  	 * First, zero all per-cpu memory areas so hv_synic_free() can
> @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
>  				goto err;
>  			}
>  		}
> +
> +		/* It's better to leak the page if the decryption fails. */
> +		if (hv_isolation_type_tdx()) {
> +			ret = set_memory_decrypted(
> +				(unsigned long)hv_cpu->synic_message_page, 1);
> +			if (ret) {
> +				pr_err("Failed to decrypt SYNIC msg page\n");
> +				hv_cpu->synic_message_page = NULL;
> +				goto err;
> +			}
> +
> +			ret = set_memory_decrypted(
> +				(unsigned long)hv_cpu->synic_event_page, 1);
> +			if (ret) {
> +				pr_err("Failed to decrypt SYNIC event page\n");
> +				hv_cpu->synic_event_page = NULL;
> +				goto err;
> +			}

The error handling still doesn't work quite correctly.   In the TDX case, upon
exiting this function, the synic_message_page and the synic_event_page must
each either be mapped decrypted or be NULL.  This requirement is so
that hv_synic_free() will do the right thing in changing the mapping back to
encrypted.  hv_synic_free() can't handle a non-NULL page being encrypted.

In the above code, if we fail to decrypt the synic_message_page, then setting
it to NULL will leak the page (which we'll live with) and ensures that hv_synic_free()
will handle it correctly.  But at that point we'll exit with synic_event_page
non-NULL and in the encrypted state, which hv_synic_free() can't handle.

Michael

> +
> +			memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
> +			memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
> +		}
>  	}
> 
>  	return 0;
> @@ -167,18 +191,40 @@ int hv_synic_alloc(void)
>  	 * Any memory allocations that succeeded will be freed when
>  	 * the caller cleans up by calling hv_synic_free()
>  	 */
> -	return -ENOMEM;
> +	return ret;
>  }
> 
> 
>  void hv_synic_free(void)
>  {
>  	int cpu;
> +	int ret;
> 
>  	for_each_present_cpu(cpu) {
>  		struct hv_per_cpu_context *hv_cpu
>  			= per_cpu_ptr(hv_context.cpu_context, cpu);
> 
> +		/* It's better to leak the page if the encryption fails. */
> +		if (hv_isolation_type_tdx()) {
> +			if (hv_cpu->synic_message_page) {
> +				ret = set_memory_encrypted((unsigned long)
> +					hv_cpu->synic_message_page, 1);
> +				if (ret) {
> +					pr_err("Failed to encrypt SYNIC msg page\n");
> +					hv_cpu->synic_message_page = NULL;
> +				}
> +			}
> +
> +			if (hv_cpu->synic_event_page) {
> +				ret = set_memory_encrypted((unsigned long)
> +					hv_cpu->synic_event_page, 1);
> +				if (ret) {
> +					pr_err("Failed to encrypt SYNIC event page\n");
> +					hv_cpu->synic_event_page = NULL;
> +				}
> +			}
> +		}
> +
>  		free_page((unsigned long)hv_cpu->synic_event_page);
>  		free_page((unsigned long)hv_cpu->synic_message_page);
>  	}
> @@ -215,7 +261,8 @@ void hv_synic_enable_regs(unsigned int cpu)
>  		if (!hv_cpu->synic_message_page)
>  			pr_err("Fail to map synic message page.\n");
>  	} else {
> -		simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
> +		simp.base_simp_gpa =
> +			cc_mkdec(virt_to_phys(hv_cpu->synic_message_page))
>  			>> HV_HYP_PAGE_SHIFT;
>  	}
> 
> @@ -234,7 +281,8 @@ void hv_synic_enable_regs(unsigned int cpu)
>  		if (!hv_cpu->synic_event_page)
>  			pr_err("Fail to map synic event page.\n");
>  	} else {
> -		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
> +		siefp.base_siefp_gpa =
> +			cc_mkdec(virt_to_phys(hv_cpu->synic_event_page))
>  			>> HV_HYP_PAGE_SHIFT;
>  	}
> 
> --
> 2.25.1
  
Dexuan Cui May 2, 2023, 1:34 a.m. UTC | #2
> From: Michael Kelley (LINUX) <mikelley@microsoft.com>
> Sent: Monday, May 1, 2023 10:33 AM
> ...
> From: Dexuan Cui
> >
> > Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> >   No need to use hv_vp_assist_page.
> >   Don't use the unsafe Hyper-V TSC page.
> >   Don't try to use HV_REGISTER_CRASH_CTL.
> >   Don't trust Hyper-V's TLB-flushing hypercalls.
> >   Don't use lazy EOI.
> >   Share SynIC Event/Message pages and VMBus Monitor pages with the
> >  host.
> 
> This patch no longer does anything with the VMBus monitor pages.
Sorry, I forgot to update the commit log. Will drop this from the log.

> >   Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().
> 
> The above line in the commit message is stale and can be dropped.
Will drop this from the commit log.

> > @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> >  {
> >  	int cpu;
> >  	struct hv_per_cpu_context *hv_cpu;
> > +	int ret = -ENOMEM;
> >
> >  	/*
> >  	 * First, zero all per-cpu memory areas so hv_synic_free() can
> > @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
> >  				goto err;
> >  			}
> >  		}
> > +
> > +		/* It's better to leak the page if the decryption fails. */
> > +		if (hv_isolation_type_tdx()) {
> > +			ret = set_memory_decrypted(
> > +				(unsigned long)hv_cpu->synic_message_page, 1);
> > +			if (ret) {
> > +				pr_err("Failed to decrypt SYNIC msg page\n");
> > +				hv_cpu->synic_message_page = NULL;
> > +				goto err;
> > +			}
> > +
> > +			ret = set_memory_decrypted(
> > +				(unsigned long)hv_cpu->synic_event_page, 1);
> > +			if (ret) {
> > +				pr_err("Failed to decrypt SYNIC event page\n");
> > +				hv_cpu->synic_event_page = NULL;
> > +				goto err;
> > +			}
> 
> The error handling still doesn't work quite correctly.   In the TDX case, upon
> exiting this function, the synic_message_page and the synic_event_page
> must
> each either be mapped decrypted or be NULL.  This requirement is so
> that hv_synic_free() will do the right thing in changing the mapping back to
> encrypted.  hv_synic_free() can't handle a non-NULL page being encrypted.
> 
> In the above code, if we fail to decrypt the synic_message_page, then setting
> it to NULL will leak the page (which we'll live with) and ensures that
> hv_synic_free()
> will handle it correctly.  But at that point we'll exit with synic_event_page
> non-NULL and in the encrypted state, which hv_synic_free() can't handle.
> 
> Michael

Thanks for spotting the issue! 
I think the below extra changes should do the job:

@@ -121,91 +121,102 @@ int hv_synic_alloc(void)

        /*
         * First, zero all per-cpu memory areas so hv_synic_free() can
         * detect what memory has been allocated and cleanup properly
         * after any failures.
         */
        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
                memset(hv_cpu, 0, sizeof(*hv_cpu));
        }

        hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
                                         GFP_KERNEL);
        if (hv_context.hv_numa_map == NULL) {
                pr_err("Unable to allocate NUMA map\n");
                goto err;
        }

        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

                tasklet_init(&hv_cpu->msg_dpc,
                             vmbus_on_msg_dpc, (unsigned long) hv_cpu);

                /*
                 * Synic message and event pages are allocated by paravisor.
                 * Skip these pages allocation here.
                 */
                if (!hv_isolation_type_snp() && !hv_root_partition) {
                        hv_cpu->synic_message_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_message_page == NULL) {
                                pr_err("Unable to allocate SYNIC message page\n");
                                goto err;
                        }

                        hv_cpu->synic_event_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_event_page == NULL) {
                                pr_err("Unable to allocate SYNIC event page\n");
+
+                               free_page((unsigned long)hv_cpu->synic_message_page);
+                               hv_cpu->synic_message_page = NULL;
+
                                goto err;
                        }
                }

                /* It's better to leak the page if the decryption fails. */
                if (hv_isolation_type_tdx()) {
                        ret = set_memory_decrypted(
                                (unsigned long)hv_cpu->synic_message_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC msg page\n");
                                hv_cpu->synic_message_page = NULL;
+
+                               /*
+                                * Free the event page so that a TDX VM won't
+                                * try to encrypt the page in hv_synic_free().
+                                */
+                               free_page((unsigned long)hv_cpu->synic_event_page);
+                               hv_cpu->synic_event_page = NULL;
                                goto err;
                        }

                        ret = set_memory_decrypted(
                                (unsigned long)hv_cpu->synic_event_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC event page\n");
                                hv_cpu->synic_event_page = NULL;
                                goto err;
                        }

                        memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
                        memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
                }
        }

        return 0;
 err:
        /*
         * Any memory allocations that succeeded will be freed when
         * the caller cleans up by calling hv_synic_free()
         */
        return ret;
 }

I'm going to use the below (i.e. v5 + the above extra changes) in v6.
Please let me know if there is still any bug.

@@ -116,6 +117,7 @@ int hv_synic_alloc(void)
 {
        int cpu;
        struct hv_per_cpu_context *hv_cpu;
+       int ret = -ENOMEM;

        /*
         * First, zero all per-cpu memory areas so hv_synic_free() can
@@ -156,9 +158,42 @@ int hv_synic_alloc(void)
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (hv_cpu->synic_event_page == NULL) {
                                pr_err("Unable to allocate SYNIC event page\n");
+
+                               free_page((unsigned long)hv_cpu->synic_message_page);
+                               hv_cpu->synic_message_page = NULL;
+
                                goto err;
                        }
                }
+
+               /* It's better to leak the page if the decryption fails. */
+               if (hv_isolation_type_tdx()) {
+                       ret = set_memory_decrypted(
+                               (unsigned long)hv_cpu->synic_message_page, 1);
+                       if (ret) {
+                               pr_err("Failed to decrypt SYNIC msg page\n");
+                               hv_cpu->synic_message_page = NULL;
+
+                               /*
+                                * Free the event page so that a TDX VM won't
+                                * try to encrypt the page in hv_synic_free().
+                                */
+                               free_page((unsigned long)hv_cpu->synic_event_page);
+                               hv_cpu->synic_event_page = NULL;
+                               goto err;
+                       }
+
+                       ret = set_memory_decrypted(
+                               (unsigned long)hv_cpu->synic_event_page, 1);
+                       if (ret) {
+                               pr_err("Failed to decrypt SYNIC event page\n");
+                               hv_cpu->synic_event_page = NULL;
+                               goto err;
+                       }
+
+                       memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+                       memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+               }
        }

        return 0;
@@ -167,18 +202,40 @@ int hv_synic_alloc(void)
         * Any memory allocations that succeeded will be freed when
         * the caller cleans up by calling hv_synic_free()
         */
-       return -ENOMEM;
+       return ret;
 }


 void hv_synic_free(void)
 {
        int cpu;
+       int ret;

        for_each_present_cpu(cpu) {
                struct hv_per_cpu_context *hv_cpu
                        = per_cpu_ptr(hv_context.cpu_context, cpu);

+               /* It's better to leak the page if the encryption fails. */
+               if (hv_isolation_type_tdx()) {
+                       if (hv_cpu->synic_message_page) {
+                               ret = set_memory_encrypted((unsigned long)
+                                       hv_cpu->synic_message_page, 1);
+                               if (ret) {
+                                       pr_err("Failed to encrypt SYNIC msg page\n");
+                                       hv_cpu->synic_message_page = NULL;
+                               }
+                       }
+
+                       if (hv_cpu->synic_event_page) {
+                               ret = set_memory_encrypted((unsigned long)
+                                       hv_cpu->synic_event_page, 1);
+                               if (ret) {
+                                       pr_err("Failed to encrypt SYNIC event page\n");
+                                       hv_cpu->synic_event_page = NULL;
+                               }
+                       }
+               }
+
                free_page((unsigned long)hv_cpu->synic_event_page);
                free_page((unsigned long)hv_cpu->synic_message_page);
        }


I'll post a separate patch (currently if hv_synic_alloc() --> get_zeroed_page() fails,
hv_context.hv_numa_map is leaked):


--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void)
        }

        ret = hv_synic_alloc();
        if (ret)
                goto err_alloc;

        /*
         * Initialize the per-cpu interrupt state and stimer state.
         * Then connect to the host.
         */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
                                hv_synic_init, hv_synic_cleanup);
        if (ret < 0)
-               goto err_cpuhp;
+               goto err_alloc;
        hyperv_cpuhp_online = ret;

        ret = vmbus_connect();
        if (ret)
                goto err_connect;

        if (hv_is_isolation_supported())
                sysctl_record_panic_msg = 0;

        /*
         * Only register if the crash MSRs are available
         */
        if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
@@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void)
        /*
         * Always register the vmbus unload panic notifier because we
         * need to shut the VMbus channel connection on panic.
         */
        atomic_notifier_chain_register(&panic_notifier_list,
                               &hyperv_panic_vmbus_unload_block);

        vmbus_request_offers();

        return 0;

 err_connect:
        cpuhp_remove_state(hyperv_cpuhp_online);
-err_cpuhp:
-       hv_synic_free();
 err_alloc:
+       hv_synic_free();
        if (vmbus_irq == -1) {
                hv_remove_vmbus_handler();
        } else {
                free_percpu_irq(vmbus_irq, vmbus_evt);
                free_percpu(vmbus_evt);
        }
 err_setup:
        bus_unregister(&hv_bus);
        unregister_sysctl_table(hv_ctl_table_hdr);
        hv_ctl_table_hdr = NULL;
        return ret;
 }
  
Michael Kelley (LINUX) May 2, 2023, 3:26 p.m. UTC | #3
From: Dexuan Cui <decui@microsoft.com> Sent: Monday, May 1, 2023 6:34 PM
> 
> > From: Michael Kelley (LINUX) <mikelley@microsoft.com>
> > Sent: Monday, May 1, 2023 10:33 AM
> > ...
> > From: Dexuan Cui
> > >
> > > Add Hyper-V specific code so that a TDX guest can run on Hyper-V:
> > >   No need to use hv_vp_assist_page.
> > >   Don't use the unsafe Hyper-V TSC page.
> > >   Don't try to use HV_REGISTER_CRASH_CTL.
> > >   Don't trust Hyper-V's TLB-flushing hypercalls.
> > >   Don't use lazy EOI.
> > >   Share SynIC Event/Message pages and VMBus Monitor pages with the
> > >  host.
> >
> > This patch no longer does anything with the VMBus monitor pages.
> Sorry, I forgot to update the commit log. Will drop this from the log.
> 
> > >   Use pgprot_decrypted(PAGE_KERNEL)in hv_ringbuffer_init().
> >
> > The above line in the commit message is stale and can be dropped.
> Will drop this from the commit log.
> 
> > > @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
> > >  {
> > >  	int cpu;
> > >  	struct hv_per_cpu_context *hv_cpu;
> > > +	int ret = -ENOMEM;
> > >
> > >  	/*
> > >  	 * First, zero all per-cpu memory areas so hv_synic_free() can
> > > @@ -159,6 +161,28 @@ int hv_synic_alloc(void)
> > >  				goto err;
> > >  			}
> > >  		}
> > > +
> > > +		/* It's better to leak the page if the decryption fails. */
> > > +		if (hv_isolation_type_tdx()) {
> > > +			ret = set_memory_decrypted(
> > > +				(unsigned long)hv_cpu->synic_message_page, 1);
> > > +			if (ret) {
> > > +				pr_err("Failed to decrypt SYNIC msg page\n");
> > > +				hv_cpu->synic_message_page = NULL;
> > > +				goto err;
> > > +			}
> > > +
> > > +			ret = set_memory_decrypted(
> > > +				(unsigned long)hv_cpu->synic_event_page, 1);
> > > +			if (ret) {
> > > +				pr_err("Failed to decrypt SYNIC event page\n");
> > > +				hv_cpu->synic_event_page = NULL;
> > > +				goto err;
> > > +			}
> >
> > The error handling still doesn't work quite correctly.   In the TDX case, upon
> > exiting this function, the synic_message_page and the synic_event_page
> > must
> > each either be mapped decrypted or be NULL.  This requirement is so
> > that hv_synic_free() will do the right thing in changing the mapping back to
> > encrypted.  hv_synic_free() can't handle a non-NULL page being encrypted.
> >
> > In the above code, if we fail to decrypt the synic_message_page, then setting
> > it to NULL will leak the page (which we'll live with) and ensures that
> > hv_synic_free()
> > will handle it correctly.  But at that point we'll exit with synic_event_page
> > non-NULL and in the encrypted state, which hv_synic_free() can't handle.
> >
> > Michael
> 
> Thanks for spotting the issue!
> I think the below extra changes should do the job:
> 
> @@ -121,91 +121,102 @@ int hv_synic_alloc(void)
> 
>         /*
>          * First, zero all per-cpu memory areas so hv_synic_free() can
>          * detect what memory has been allocated and cleanup properly
>          * after any failures.
>          */
>         for_each_present_cpu(cpu) {
>                 hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
>                 memset(hv_cpu, 0, sizeof(*hv_cpu));
>         }
> 
>         hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
>                                          GFP_KERNEL);
>         if (hv_context.hv_numa_map == NULL) {
>                 pr_err("Unable to allocate NUMA map\n");
>                 goto err;
>         }
> 
>         for_each_present_cpu(cpu) {
>                 hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
> 
>                 tasklet_init(&hv_cpu->msg_dpc,
>                              vmbus_on_msg_dpc, (unsigned long) hv_cpu);
> 
>                 /*
>                  * Synic message and event pages are allocated by paravisor.
>                  * Skip these pages allocation here.
>                  */
>                 if (!hv_isolation_type_snp() && !hv_root_partition) {
>                         hv_cpu->synic_message_page =
>                                 (void *)get_zeroed_page(GFP_ATOMIC);
>                         if (hv_cpu->synic_message_page == NULL) {
>                                 pr_err("Unable to allocate SYNIC message page\n");
>                                 goto err;
>                         }
> 
>                         hv_cpu->synic_event_page =
>                                 (void *)get_zeroed_page(GFP_ATOMIC);
>                         if (hv_cpu->synic_event_page == NULL) {
>                                 pr_err("Unable to allocate SYNIC event page\n");
> +
> +                               free_page((unsigned long)hv_cpu->synic_message_page);
> +                               hv_cpu->synic_message_page = NULL;
> +
>                                 goto err;
>                         }
>                 }
> 
>                 /* It's better to leak the page if the decryption fails. */
>                 if (hv_isolation_type_tdx()) {
>                         ret = set_memory_decrypted(
>                                 (unsigned long)hv_cpu->synic_message_page, 1);
>                         if (ret) {
>                                 pr_err("Failed to decrypt SYNIC msg page\n");
>                                 hv_cpu->synic_message_page = NULL;
> +
> +                               /*
> +                                * Free the event page so that a TDX VM won't
> +                                * try to encrypt the page in hv_synic_free().
> +                                */
> +                               free_page((unsigned long)hv_cpu->synic_event_page);
> +                               hv_cpu->synic_event_page = NULL;
>                                 goto err;
>                         }
> 
>                         ret = set_memory_decrypted(
>                                 (unsigned long)hv_cpu->synic_event_page, 1);
>                         if (ret) {
>                                 pr_err("Failed to decrypt SYNIC event page\n");
>                                 hv_cpu->synic_event_page = NULL;
>                                 goto err;
>                         }
> 
>                         memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
>                         memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
>                 }
>         }
> 
>         return 0;
>  err:
>         /*
>          * Any memory allocations that succeeded will be freed when
>          * the caller cleans up by calling hv_synic_free()
>          */
>         return ret;
>  }
> 
> I'm going to use the below (i.e. v5 + the above extra changes) in v6.
> Please let me know if there is still any bug.
> 
> @@ -116,6 +117,7 @@ int hv_synic_alloc(void)
>  {
>         int cpu;
>         struct hv_per_cpu_context *hv_cpu;
> +       int ret = -ENOMEM;
> 
>         /*
>          * First, zero all per-cpu memory areas so hv_synic_free() can
> @@ -156,9 +158,42 @@ int hv_synic_alloc(void)
>                                 (void *)get_zeroed_page(GFP_ATOMIC);
>                         if (hv_cpu->synic_event_page == NULL) {
>                                 pr_err("Unable to allocate SYNIC event page\n");
> +
> +                               free_page((unsigned long)hv_cpu->synic_message_page);
> +                               hv_cpu->synic_message_page = NULL;
> +
>                                 goto err;
>                         }
>                 }
> +
> +               /* It's better to leak the page if the decryption fails. */
> +               if (hv_isolation_type_tdx()) {
> +                       ret = set_memory_decrypted(
> +                               (unsigned long)hv_cpu->synic_message_page, 1);
> +                       if (ret) {
> +                               pr_err("Failed to decrypt SYNIC msg page\n");
> +                               hv_cpu->synic_message_page = NULL;
> +
> +                               /*
> +                                * Free the event page so that a TDX VM won't
> +                                * try to encrypt the page in hv_synic_free().
> +                                */
> +                               free_page((unsigned long)hv_cpu->synic_event_page);
> +                               hv_cpu->synic_event_page = NULL;
> +                               goto err;
> +                       }
> +
> +                       ret = set_memory_decrypted(
> +                               (unsigned long)hv_cpu->synic_event_page, 1);
> +                       if (ret) {
> +                               pr_err("Failed to decrypt SYNIC event page\n");
> +                               hv_cpu->synic_event_page = NULL;
> +                               goto err;
> +                       }
> +
> +                       memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
> +                       memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
> +               }

Yes, this looks good to me.  A minor point:  In the two calls to set decrypted,
if there is a failure, output the value of "ret" in the error message.  It should
never happen, but if it did, it could be hard to diagnose, and we'll want all
the info we can get about the failure.  And do the same in hv_synic_free()
if setting back to encrypted should fail.

Michael

>         }
> 
>         return 0;
> @@ -167,18 +202,40 @@ int hv_synic_alloc(void)
>          * Any memory allocations that succeeded will be freed when
>          * the caller cleans up by calling hv_synic_free()
>          */
> -       return -ENOMEM;
> +       return ret;
>  }
> 
> 
>  void hv_synic_free(void)
>  {
>         int cpu;
> +       int ret;
> 
>         for_each_present_cpu(cpu) {
>                 struct hv_per_cpu_context *hv_cpu
>                         = per_cpu_ptr(hv_context.cpu_context, cpu);
> 
> +               /* It's better to leak the page if the encryption fails. */
> +               if (hv_isolation_type_tdx()) {
> +                       if (hv_cpu->synic_message_page) {
> +                               ret = set_memory_encrypted((unsigned long)
> +                                       hv_cpu->synic_message_page, 1);
> +                               if (ret) {
> +                                       pr_err("Failed to encrypt SYNIC msg page\n");
> +                                       hv_cpu->synic_message_page = NULL;
> +                               }
> +                       }
> +
> +                       if (hv_cpu->synic_event_page) {
> +                               ret = set_memory_encrypted((unsigned long)
> +                                       hv_cpu->synic_event_page, 1);
> +                               if (ret) {
> +                                       pr_err("Failed to encrypt SYNIC event page\n");
> +                                       hv_cpu->synic_event_page = NULL;
> +                               }
> +                       }
> +               }
> +
>                 free_page((unsigned long)hv_cpu->synic_event_page);
>                 free_page((unsigned long)hv_cpu->synic_message_page);
>         }
> 
> 
> I'll post a separate patch (currently if hv_synic_alloc() --> get_zeroed_page() fails,
> hv_context.hv_numa_map is leaked):
> 
> 
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1515,27 +1515,27 @@ static int vmbus_bus_init(void)
>         }
> 
>         ret = hv_synic_alloc();
>         if (ret)
>                 goto err_alloc;
> 
>         /*
>          * Initialize the per-cpu interrupt state and stimer state.
>          * Then connect to the host.
>          */
>         ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
>                                 hv_synic_init, hv_synic_cleanup);
>         if (ret < 0)
> -               goto err_cpuhp;
> +               goto err_alloc;
>         hyperv_cpuhp_online = ret;
> 
>         ret = vmbus_connect();
>         if (ret)
>                 goto err_connect;
> 
>         if (hv_is_isolation_supported())
>                 sysctl_record_panic_msg = 0;
> 
>         /*
>          * Only register if the crash MSRs are available
>          */
>         if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
> @@ -1567,29 +1567,28 @@ static int vmbus_bus_init(void)
>         /*
>          * Always register the vmbus unload panic notifier because we
>          * need to shut the VMbus channel connection on panic.
>          */
>         atomic_notifier_chain_register(&panic_notifier_list,
>                                &hyperv_panic_vmbus_unload_block);
> 
>         vmbus_request_offers();
> 
>         return 0;
> 
>  err_connect:
>         cpuhp_remove_state(hyperv_cpuhp_online);
> -err_cpuhp:
> -       hv_synic_free();
>  err_alloc:
> +       hv_synic_free();
>         if (vmbus_irq == -1) {
>                 hv_remove_vmbus_handler();
>         } else {
>                 free_percpu_irq(vmbus_irq, vmbus_evt);
>                 free_percpu(vmbus_evt);
>         }
>  err_setup:
>         bus_unregister(&hv_bus);
>         unregister_sysctl_table(hv_ctl_table_hdr);
>         hv_ctl_table_hdr = NULL;
>         return ret;
>  }
  
Dexuan Cui May 2, 2023, 7:21 p.m. UTC | #4
> From: Michael Kelley (LINUX) <mikelley@microsoft.com>
> Sent: Tuesday, May 2, 2023 8:26 AM
> ...
> Yes, this looks good to me.  A minor point:  In the two calls to set
Thanks for the confirmation!

> decrypted,
> if there is a failure, output the value of "ret" in the error message.  It should
> never happen, but if it did, it could be hard to diagnose, and we'll want all
> the info we can get about the failure.  And do the same in hv_synic_free()
> if setting back to encrypted should fail.
> 
> Michael

Will do in v6.
  

Patch

diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..16919c7b3196 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -173,7 +173,8 @@  static bool __send_ipi_mask(const struct cpumask *mask, int vector,
 	    (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
 		return true;
 
-	if (!hv_hypercall_pg)
+	/* A TDX guest doesn't use hv_hypercall_pg. */
+	if (!hv_isolation_type_tdx() && !hv_hypercall_pg)
 		return false;
 
 	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
@@ -227,7 +228,8 @@  static bool __send_ipi_one(int cpu, int vector)
 
 	trace_hyperv_send_ipi_one(cpu, vector);
 
-	if (!hv_hypercall_pg || (vp == VP_INVAL))
+	/* A TDX guest doesn't use hv_hypercall_pg. */
+	if ((!hv_isolation_type_tdx() && !hv_hypercall_pg) || (vp == VP_INVAL))
 		return false;
 
 	if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index f175e0de821c..f28357ecad7d 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -79,7 +79,7 @@  static int hyperv_init_ghcb(void)
 static int hv_cpu_init(unsigned int cpu)
 {
 	union hv_vp_assist_msr_contents msr = { 0 };
-	struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
+	struct hv_vp_assist_page **hvp;
 	int ret;
 
 	ret = hv_common_cpu_init(cpu);
@@ -89,6 +89,7 @@  static int hv_cpu_init(unsigned int cpu)
 	if (!hv_vp_assist_page)
 		return 0;
 
+	hvp = &hv_vp_assist_page[cpu];
 	if (hv_root_partition) {
 		/*
 		 * For root partition we get the hypervisor provided VP assist
@@ -398,11 +399,21 @@  void __init hyperv_init(void)
 	if (hv_common_init())
 		return;
 
-	hv_vp_assist_page = kcalloc(num_possible_cpus(),
-				    sizeof(*hv_vp_assist_page), GFP_KERNEL);
+	/*
+	 * The VP assist page is useless to a TDX guest: the only use we
+	 * would have for it is lazy EOI, which cannot be used with TDX.
+	 */
+	if (hv_isolation_type_tdx())
+		hv_vp_assist_page = NULL;
+	else
+		hv_vp_assist_page = kcalloc(num_possible_cpus(),
+					    sizeof(*hv_vp_assist_page),
+					    GFP_KERNEL);
 	if (!hv_vp_assist_page) {
 		ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
-		goto common_free;
+
+		if (!hv_isolation_type_tdx())
+			goto common_free;
 	}
 
 	if (hv_isolation_type_snp()) {
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a87fb934cd4b..e9106c9d92f8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -405,8 +405,27 @@  static void __init ms_hyperv_init_platform(void)
 
 		if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
 			static_branch_enable(&isolation_type_snp);
-		else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX)
+		else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
 			static_branch_enable(&isolation_type_tdx);
+
+			/*
+			 * The GPAs of SynIC Event/Message pages and VMBus
+			 * Monitor pages need to have this offset added.
+			 */
+			ms_hyperv.shared_gpa_boundary = cc_mkdec(0);
+
+			/* Don't use the unsafe Hyper-V TSC page */
+			ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+			/* HV_REGISTER_CRASH_CTL is unsupported */
+			ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+			/* Don't trust Hyper-V's TLB-flushing hypercalls */
+			ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+			/* A TDX VM must use x2APIC and doesn't use lazy EOI */
+			ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+		}
 	}
 
 	if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 4e1407d59ba0..fa7dce26ec67 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -18,6 +18,7 @@ 
 #include <linux/clockchips.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/set_memory.h>
 #include <clocksource/hyperv_timer.h>
 #include <asm/mshyperv.h>
 #include "hyperv_vmbus.h"
@@ -116,6 +117,7 @@  int hv_synic_alloc(void)
 {
 	int cpu;
 	struct hv_per_cpu_context *hv_cpu;
+	int ret = -ENOMEM;
 
 	/*
 	 * First, zero all per-cpu memory areas so hv_synic_free() can
@@ -159,6 +161,28 @@  int hv_synic_alloc(void)
 				goto err;
 			}
 		}
+
+		/* It's better to leak the page if the decryption fails. */
+		if (hv_isolation_type_tdx()) {
+			ret = set_memory_decrypted(
+				(unsigned long)hv_cpu->synic_message_page, 1);
+			if (ret) {
+				pr_err("Failed to decrypt SYNIC msg page\n");
+				hv_cpu->synic_message_page = NULL;
+				goto err;
+			}
+
+			ret = set_memory_decrypted(
+				(unsigned long)hv_cpu->synic_event_page, 1);
+			if (ret) {
+				pr_err("Failed to decrypt SYNIC event page\n");
+				hv_cpu->synic_event_page = NULL;
+				goto err;
+			}
+
+			memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+			memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+		}
 	}
 
 	return 0;
@@ -167,18 +191,40 @@  int hv_synic_alloc(void)
 	 * Any memory allocations that succeeded will be freed when
 	 * the caller cleans up by calling hv_synic_free()
 	 */
-	return -ENOMEM;
+	return ret;
 }
 
 
 void hv_synic_free(void)
 {
 	int cpu;
+	int ret;
 
 	for_each_present_cpu(cpu) {
 		struct hv_per_cpu_context *hv_cpu
 			= per_cpu_ptr(hv_context.cpu_context, cpu);
 
+		/* It's better to leak the page if the encryption fails. */
+		if (hv_isolation_type_tdx()) {
+			if (hv_cpu->synic_message_page) {
+				ret = set_memory_encrypted((unsigned long)
+					hv_cpu->synic_message_page, 1);
+				if (ret) {
+					pr_err("Failed to encrypt SYNIC msg page\n");
+					hv_cpu->synic_message_page = NULL;
+				}
+			}
+
+			if (hv_cpu->synic_event_page) {
+				ret = set_memory_encrypted((unsigned long)
+					hv_cpu->synic_event_page, 1);
+				if (ret) {
+					pr_err("Failed to encrypt SYNIC event page\n");
+					hv_cpu->synic_event_page = NULL;
+				}
+			}
+		}
+
 		free_page((unsigned long)hv_cpu->synic_event_page);
 		free_page((unsigned long)hv_cpu->synic_message_page);
 	}
@@ -215,7 +261,8 @@  void hv_synic_enable_regs(unsigned int cpu)
 		if (!hv_cpu->synic_message_page)
 			pr_err("Fail to map synic message page.\n");
 	} else {
-		simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
+		simp.base_simp_gpa =
+			cc_mkdec(virt_to_phys(hv_cpu->synic_message_page))
 			>> HV_HYP_PAGE_SHIFT;
 	}
 
@@ -234,7 +281,8 @@  void hv_synic_enable_regs(unsigned int cpu)
 		if (!hv_cpu->synic_event_page)
 			pr_err("Fail to map synic event page.\n");
 	} else {
-		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
+		siefp.base_siefp_gpa =
+			cc_mkdec(virt_to_phys(hv_cpu->synic_event_page))
 			>> HV_HYP_PAGE_SHIFT;
 	}