[v1,1/3] x86/tdx: Check for TDX partitioning during early TDX init

Message ID 20231122170106.270266-1-jpiotrowski@linux.microsoft.com
State New
Series [v1,1/3] x86/tdx: Check for TDX partitioning during early TDX init

Commit Message

Jeremi Piotrowski Nov. 22, 2023, 5:01 p.m. UTC
Check for additional CPUID bits to identify TDX guests running with Trust
Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
inside the Trust Domain, so there is an L1 TD VM(M) and there can be L2 TD VM(s).

In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
to Linux running as an L2 TD VM. This is because a majority of TDX facilities
are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
aware mechanisms for what's left. So currently such guests do not have
X86_FEATURE_TDX_GUEST set.

We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
need to check these additional CPUID bits, but we skip further initialization
in the function as we aren't guaranteed access to TDX module calls.

Cc: <stable@vger.kernel.org> # v6.5+
Signed-off-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
---
 arch/x86/coco/tdx/tdx.c    | 29 ++++++++++++++++++++++++++---
 arch/x86/include/asm/tdx.h |  3 +++
 2 files changed, 29 insertions(+), 3 deletions(-)
  

Comments

Jeremi Piotrowski Nov. 22, 2023, 5:19 p.m. UTC | #1
On 22/11/2023 18:01, Jeremi Piotrowski wrote:
> Check for additional CPUID bits to identify TDX guests running with Trust
> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> 
> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> aware mechanisms for what's left. So currently such guests do not have
> X86_FEATURE_TDX_GUEST set.
> 
> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
> need to check these additional CPUID bits, but we skip further initialization
> in the function as we aren't guaranteed access to TDX module calls.
> 
> Cc: <stable@vger.kernel.org> # v6.5+
> Signed-off-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
> ---
>  arch/x86/coco/tdx/tdx.c    | 29 ++++++++++++++++++++++++++---
>  arch/x86/include/asm/tdx.h |  3 +++
>  2 files changed, 29 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index 1d6b863c42b0..c7bbbaaf654d 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -8,6 +8,7 @@
>  #include <linux/export.h>
>  #include <linux/io.h>
>  #include <asm/coco.h>
> +#include <asm/hyperv-tlfs.h>
>  #include <asm/tdx.h>
>  #include <asm/vmx.h>
>  #include <asm/insn.h>
> @@ -37,6 +38,8 @@
>  
>  #define TDREPORT_SUBTYPE_0	0
>  
> +bool tdx_partitioning_active;
> +
>  /* Called from __tdx_hypercall() for unrecoverable failure */
>  noinstr void __tdx_hypercall_failed(void)
>  {
> @@ -757,19 +760,38 @@ static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
>  	return true;
>  }
>  
> +
> +static bool early_is_hv_tdx_partitioning(void)
> +{
> +	u32 eax, ebx, ecx, edx;
> +	cpuid(HYPERV_CPUID_ISOLATION_CONFIG, &eax, &ebx, &ecx, &edx);
> +	return eax & HV_PARAVISOR_PRESENT &&
> +	       (ebx & HV_ISOLATION_TYPE) == HV_ISOLATION_TYPE_TDX;
> +}
> +
>  void __init tdx_early_init(void)
>  {
>  	u64 cc_mask;
>  	u32 eax, sig[3];
>  
>  	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2],  &sig[1]);
> -
> -	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
> -		return;
> +	if (memcmp(TDX_IDENT, sig, sizeof(sig))) {
> +		tdx_partitioning_active = early_is_hv_tdx_partitioning();
> +		if (!tdx_partitioning_active)
> +			return;
> +	}

Hi Borislav,

Just wanted to run another option by you. Instead of checking the CPUID here we
could accomplish the same result by doing _this_ in the hyperv cc init:

diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 8c6bf07f7d2b..705794642d34 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -595,6 +595,8 @@ void __init hv_vtom_init(void)
 #endif
 
 	case HV_ISOLATION_TYPE_TDX:
+		setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+		tdx_partitioning_active = true;
 		cc_vendor = CC_VENDOR_INTEL;
 		break;
 

Which approach do you prefer?

Thanks,
Jeremi

>  
>  	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
>  
>  	cc_vendor = CC_VENDOR_INTEL;
> +
> +	/*
> +	 * Need to defer cc_mask and page visibility callback initializations
> +	 * to a TD-partitioning aware implementation.
> +	 */
> +	if (tdx_partitioning_active)
> +		goto exit;
> +
>  	tdx_parse_tdinfo(&cc_mask);
>  	cc_set_mask(cc_mask);
>  
> @@ -820,5 +842,6 @@ void __init tdx_early_init(void)
>  	 */
>  	x86_cpuinit.parallel_bringup = false;
>  
> +exit:
>  	pr_info("Guest detected\n");
>  }
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index 603e6d1e9d4a..fe22f8675859 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -52,6 +52,7 @@ bool tdx_early_handle_ve(struct pt_regs *regs);
>  
>  int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
>  
> +extern bool tdx_partitioning_active;
>  #else
>  
>  static inline void tdx_early_init(void) { };
> @@ -71,6 +72,8 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
>  {
>  	return -ENODEV;
>  }
> +
> +#define tdx_partitioning_active false
>  #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
>  #endif /* !__ASSEMBLY__ */
>  #endif /* _ASM_X86_TDX_H */
  
Kirill A. Shutemov Nov. 23, 2023, 1:58 p.m. UTC | #2
On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
> Check for additional CPUID bits to identify TDX guests running with Trust
> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> 
> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> aware mechanisms for what's left. So currently such guests do not have
> X86_FEATURE_TDX_GUEST set.
> 
> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
> need to check these additional CPUID bits, but we skip further initialization
> in the function as we aren't guaranteed access to TDX module calls.

I don't follow. The idea of partitioning is that the L2 OS can be
unenlightened and have no idea if it runs inside of a TD. But this patch
tries to enumerate TDX anyway.

Why?
  
Jeremi Piotrowski Nov. 24, 2023, 10:31 a.m. UTC | #3
On 23/11/2023 14:58, Kirill A. Shutemov wrote:
> On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
>> Check for additional CPUID bits to identify TDX guests running with Trust
>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>
>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>> aware mechanisms for what's left. So currently such guests do not have
>> X86_FEATURE_TDX_GUEST set.
>>
>> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
>> need to check these additional CPUID bits, but we skip further initialization
>> in the function as we aren't guaranteed access to TDX module calls.
> 
> I don't follow. The idea of partitioning is that L2 OS can be
> unenlightened and have no idea if it runs indide of TD. But this patch
> tries to enumerate TDX anyway.
> 
> Why?
> 

That's not the only idea of partitioning. Partitioning provides different privilege
levels within the TD, and an unenlightened L2 OS can be made to work but is inefficient.
In our case Linux always runs enlightened (both with and without TD partitioning), and
uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).

There have been long discussions on LKML about how CoCo features should be supported;
I've followed most of them and I believe we've converged on: the kernel is fully
aware of what kind of guest it is (SNP/TDX) and uses CC_ATTR_XXX to check for specific
SNP/TDX features.

Right now the guest with TD partitioning is missing out on X86_FEATURE_TDX_GUEST.
That's why this patch tries to enumerate TDX.

I have posted an alternate version of this patch for discussion here:
https://lore.kernel.org/lkml/0799b692-4b26-4e00-9cec-fdc4c929ea58@linux.microsoft.com/
  
Kirill A. Shutemov Nov. 24, 2023, 10:43 a.m. UTC | #4
On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
> On 23/11/2023 14:58, Kirill A. Shutemov wrote:
> > On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
> >> Check for additional CPUID bits to identify TDX guests running with Trust
> >> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> >> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> >>
> >> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
> >> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> >> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> >> aware mechanisms for what's left. So currently such guests do not have
> >> X86_FEATURE_TDX_GUEST set.
> >>
> >> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
> >> need to check these additional CPUID bits, but we skip further initialization
> >> in the function as we aren't guaranteed access to TDX module calls.
> > 
> > I don't follow. The idea of partitioning is that L2 OS can be
> > unenlightened and have no idea if it runs indide of TD. But this patch
> > tries to enumerate TDX anyway.
> > 
> > Why?
> > 
> 
> That's not the only idea of partitioning. Partitioning provides different privilege
> levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
> In our case Linux always runs enlightened (both with and without TD partitioning), and
> uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).

What value does L1 add in this case? If L2 has to be enlightened, just run the
enlightened OS directly as L1 and ditch the half-measures. I think you can
gain some performance this way.
  
Jeremi Piotrowski Nov. 24, 2023, 11:04 a.m. UTC | #5
On 24/11/2023 11:43, Kirill A. Shutemov wrote:
> On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
>> On 23/11/2023 14:58, Kirill A. Shutemov wrote:
>>> On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
>>>> Check for additional CPUID bits to identify TDX guests running with Trust
>>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>>>
>>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
>>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>>>> aware mechanisms for what's left. So currently such guests do not have
>>>> X86_FEATURE_TDX_GUEST set.
>>>>
>>>> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
>>>> need to check these additional CPUID bits, but we skip further initialization
>>>> in the function as we aren't guaranteed access to TDX module calls.
>>>
>>> I don't follow. The idea of partitioning is that L2 OS can be
>>> unenlightened and have no idea if it runs indide of TD. But this patch
>>> tries to enumerate TDX anyway.
>>>
>>> Why?
>>>
>>
>> That's not the only idea of partitioning. Partitioning provides different privilege
>> levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
>> In our case Linux always runs enlightened (both with and without TD partitioning), and
>> uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).
> 
> What value L1 adds in this case? If L2 has to be enlightened just run the
> enlightened OS directly as L1 and ditch half-measures. I think you can
> gain some performance this way.
> 

It's primarily about privilege separation; performance is the reason
one doesn't want to run unenlightened. The L1 makes the following possible:
- TPM emulation within the trust domain but isolated from the OS
- infrastructure interfaces for things like VM live migration
- support for Virtual Trust Levels[1], Virtual Secure Mode[2]

These provide a lot of value to users, it's not at all about half-measures.

[1]: https://lore.kernel.org/lkml/1681192532-15460-1-git-send-email-ssengar@linux.microsoft.com/
[2]: https://lore.kernel.org/lkml/20231108111806.92604-1-nsaenz@amazon.com/
  
Kirill A. Shutemov Nov. 24, 2023, 1:33 p.m. UTC | #6
On Fri, Nov 24, 2023 at 12:04:56PM +0100, Jeremi Piotrowski wrote:
> On 24/11/2023 11:43, Kirill A. Shutemov wrote:
> > On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
> >> On 23/11/2023 14:58, Kirill A. Shutemov wrote:
> >>> On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
> >>>> Check for additional CPUID bits to identify TDX guests running with Trust
> >>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> >>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> >>>>
> >>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
> >>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> >>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> >>>> aware mechanisms for what's left. So currently such guests do not have
> >>>> X86_FEATURE_TDX_GUEST set.
> >>>>
> >>>> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
> >>>> need to check these additional CPUID bits, but we skip further initialization
> >>>> in the function as we aren't guaranteed access to TDX module calls.
> >>>
> >>> I don't follow. The idea of partitioning is that L2 OS can be
> >>> unenlightened and have no idea if it runs indide of TD. But this patch
> >>> tries to enumerate TDX anyway.
> >>>
> >>> Why?
> >>>
> >>
> >> That's not the only idea of partitioning. Partitioning provides different privilege
> >> levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
> >> In our case Linux always runs enlightened (both with and without TD partitioning), and
> >> uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).
> > 
> > What value L1 adds in this case? If L2 has to be enlightened just run the
> > enlightened OS directly as L1 and ditch half-measures. I think you can
> > gain some performance this way.
> > 
> 
> It's primarily about the privilege separation, performance is a reason
> one doesn't want to run unenlightened. The L1 makes the following possible:
> - TPM emulation within the trust domain but isolated from the OS
> - infrastructure interfaces for things like VM live migration
> - support for Virtual Trust Levels[1], Virtual Secure Mode[2]
> 
> These provide a lot of value to users, it's not at all about half-measures.

Hm. Okay.

Can we take a step back? What is the bigger picture here? What enlightenments
do you expect from the guest when everything is in place?

So far I see that you are trying to get the kernel to think that it runs as a TDX
guest, but not really. This is not a very convincing model.

Why does L2 need to know if it runs under TDX or SEV? Can't it just think
it runs as a Hyper-V guest, with all differences between TDX and SEV abstracted
by L1?

So far, I have failed to see a coherent design. Maybe I just don't know where to
look.
  
Jeremi Piotrowski Nov. 24, 2023, 4:19 p.m. UTC | #7
On 24/11/2023 14:33, Kirill A. Shutemov wrote:
> On Fri, Nov 24, 2023 at 12:04:56PM +0100, Jeremi Piotrowski wrote:
>> On 24/11/2023 11:43, Kirill A. Shutemov wrote:
>>> On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
>>>> On 23/11/2023 14:58, Kirill A. Shutemov wrote:
>>>>> On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
>>>>>> Check for additional CPUID bits to identify TDX guests running with Trust
>>>>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>>>>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>>>>>
>>>>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
>>>>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>>>>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>>>>>> aware mechanisms for what's left. So currently such guests do not have
>>>>>> X86_FEATURE_TDX_GUEST set.
>>>>>>
>>>>>> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
>>>>>> need to check these additional CPUID bits, but we skip further initialization
>>>>>> in the function as we aren't guaranteed access to TDX module calls.
>>>>>
>>>>> I don't follow. The idea of partitioning is that L2 OS can be
>>>>> unenlightened and have no idea if it runs indide of TD. But this patch
>>>>> tries to enumerate TDX anyway.
>>>>>
>>>>> Why?
>>>>>
>>>>
>>>> That's not the only idea of partitioning. Partitioning provides different privilege
>>>> levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
>>>> In our case Linux always runs enlightened (both with and without TD partitioning), and
>>>> uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).
>>>
>>> What value L1 adds in this case? If L2 has to be enlightened just run the
>>> enlightened OS directly as L1 and ditch half-measures. I think you can
>>> gain some performance this way.
>>>
>>
>> It's primarily about the privilege separation, performance is a reason
>> one doesn't want to run unenlightened. The L1 makes the following possible:
>> - TPM emulation within the trust domain but isolated from the OS
>> - infrastructure interfaces for things like VM live migration
>> - support for Virtual Trust Levels[1], Virtual Secure Mode[2]
>>
>> These provide a lot of value to users, it's not at all about half-measures.
> 
> Hm. Okay.
> 
> Can we take a step back? What is bigger picture here? What enlightenment
> do you expect from the guest when everything is in-place?
> 

All the functional enlightenments are already in place in the kernel and
everything works (correct me if I'm wrong, Dexuan/Michael). The enlightenments
are that TDX VMCALLs are needed for MSR manipulation and vmbus operations, the
encrypted bit needs to be manipulated in the page tables, and page
visibility needs to be propagated to the VMM.
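
To illustrate what the last two mean in practice, here is a minimal sketch - not
the actual vmbus/swiotlb code, and share_buffer_with_host() is a made-up helper.
set_memory_decrypted() is the generic x86 interface that ends up invoking the
enc_status_change callbacks visible in the diff above:

#include <linux/gfp.h>
#include <linux/set_memory.h>

/*
 * Hedged sketch: how an enlightened L2 guest makes a buffer visible to
 * the host. set_memory_decrypted() clears the encrypted bit in the
 * guest page tables and notifies the VMM of the new page visibility
 * via the x86_platform.guest.enc_status_change_*() hooks.
 */
static void *share_buffer_with_host(unsigned int order)
{
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);

	if (!page)
		return NULL;

	if (set_memory_decrypted((unsigned long)page_address(page), 1 << order)) {
		/* Error handling simplified; real code must not reuse such pages. */
		return NULL;
	}

	return page_address(page);	/* now host-visible ("decrypted") */
}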

What's missing is that the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
and as a result dmesg does not currently display:
"Memory Encryption Features active: Intel TDX".

That's what I set out to correct.

> So far I see that you try to get kernel think that it runs as TDX guest,
> but not really. This is not very convincing model.
> 

No, that's not accurate at all. The kernel is running as a TDX guest so I
want the kernel to know that. TDX is not a monolithic thing, it has different
features that can be in use and it has differences in behavior when running
with TD partitioning (example: no #VE/TDX module calls). So those differences
need to be clearly modeled in code.

> Why does L2 need to know if it runs under TDX or SEV? Can't it just think
> it runs as Hyper-V guest and all difference between TDX and SEV abstracted
> by L1?
> 

If you look into the git history you'll find this was attempted with
CC_VENDOR_HYPERV. That proved to be a dead end as some things just can't be
abstracted (GHCI vs GHCB; the encrypted bit works differently). What resulted
was a ton of conditionals and duplication. After long discussions with Borislav
we converged on clearly identifying with the underlying technology (SEV/TDX)
and being explicit about support for optional parts in each scheme (like vTOM).
  
Kai Huang Nov. 29, 2023, 4:36 a.m. UTC | #8
On Fri, 2023-11-24 at 17:19 +0100, Jeremi Piotrowski wrote:
> On 24/11/2023 14:33, Kirill A. Shutemov wrote:
> > On Fri, Nov 24, 2023 at 12:04:56PM +0100, Jeremi Piotrowski wrote:
> > > On 24/11/2023 11:43, Kirill A. Shutemov wrote:
> > > > On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
> > > > > On 23/11/2023 14:58, Kirill A. Shutemov wrote:
> > > > > > On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
> > > > > > > Check for additional CPUID bits to identify TDX guests running with Trust
> > > > > > > Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> > > > > > > inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> > > > > > > 
> > > > > > > In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
> > > > > > > to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> > > > > > > are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> > > > > > > aware mechanisms for what's left. So currently such guests do not have
> > > > > > > X86_FEATURE_TDX_GUEST set.
> > > > > > > 
> > > > > > > We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
> > > > > > > need to check these additional CPUID bits, but we skip further initialization
> > > > > > > in the function as we aren't guaranteed access to TDX module calls.
> > > > > > 
> > > > > > I don't follow. The idea of partitioning is that L2 OS can be
> > > > > > unenlightened and have no idea if it runs indide of TD. But this patch
> > > > > > tries to enumerate TDX anyway.
> > > > > > 
> > > > > > Why?
> > > > > > 
> > > > > 
> > > > > That's not the only idea of partitioning. Partitioning provides different privilege
> > > > > levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
> > > > > In our case Linux always runs enlightened (both with and without TD partitioning), and
> > > > > uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).
> > > > 
> > > > What value L1 adds in this case? If L2 has to be enlightened just run the
> > > > enlightened OS directly as L1 and ditch half-measures. I think you can
> > > > gain some performance this way.
> > > > 
> > > 
> > > It's primarily about the privilege separation, performance is a reason
> > > one doesn't want to run unenlightened. The L1 makes the following possible:
> > > - TPM emulation within the trust domain but isolated from the OS
> > > - infrastructure interfaces for things like VM live migration
> > > - support for Virtual Trust Levels[1], Virtual Secure Mode[2]
> > > 
> > > These provide a lot of value to users, it's not at all about half-measures.

It's not obvious why the listed things above are related to TDX guests.  They
look more like hyperv-specific enlightenments which don't have a dependency on TDX
guests.

For instance, the "Emulating Hyper-V VSM with KVM" design in [2] above says
nothing about TDX (or SEV):

https://lore.kernel.org/lkml/20231108111806.92604-34-nsaenz@amazon.com/

> > 
> > Hm. Okay.
> > 
> > Can we take a step back? What is bigger picture here? What enlightenment
> > do you expect from the guest when everything is in-place?
> > 
> 
> All the functional enlightenment are already in place in the kernel and
> everything works (correct me if I'm wrong Dexuan/Michael). The enlightenments
> are that TDX VMCALLs are needed for MSR manipulation and vmbus operations,
> encrypted bit needs to be manipulated in the page tables and page
> visibility propagated to VMM.

Not quite familiar with hyperv enlightenments, but are these enlightenments TDX
guest specific?  Because if they are not, then they should be able to be
emulated by normal hyperv; thus hyperv as L1 (which is a TDX guest) can
emulate them w/o letting the L2 know the hypervisor it runs on is actually a TDX
guest.

Btw, even if there's a performance concern here, as you mentioned the TDVMCALL is
actually made to the L0, which means L0 must be aware that such a VMCALL is from L2 and
needs to be injected into L1 to handle, which IMHO not only complicates the L0 but
also may not have any performance benefit.

> 
> Whats missing is the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
> and as a result dmesg does not currently display:
> "Memory Encryption Features active: Intel TDX".
> 
> That's what I set out to correct.
> 
> > So far I see that you try to get kernel think that it runs as TDX guest,
> > but not really. This is not very convincing model.
> > 
> 
> No that's not accurate at all. The kernel is running as a TDX guest so I
> want the kernel to know that. 
> 

But it isn't.  It runs on a hypervisor which is a TDX guest, but this doesn't
make it a TDX guest itself.

> TDX is not a monolithic thing, it has different
> features that can be in-use and it has differences in behavior when running
> with TD partitioning (example: no #VE/TDX module calls). So those differences
> need to be clearly modeled in code.

Well IMHO this is a design choice but not a fact.  E.g., if we never set the
TDX_GUEST flag for L2 then it naturally doesn't use TDX-guest-related stuff.
Otherwise we need additional patches like your patch 2/3 in this series to stop
the L2 from using certain TDX functionality.

And I guess we will need more patches to stop L2 from doing TDX guest things. 
E.g., we might want to disable TDX attestation interface support in the L2
guest, because the L2 is indeed not a TDX guest..

So to me, even if there's value in advertising L2 as a TDX guest, the pros/cons need to
be evaluated to see whether it is worth it.

> 
> > Why does L2 need to know if it runs under TDX or SEV? Can't it just think
> > it runs as Hyper-V guest and all difference between TDX and SEV abstracted
> > by L1?
> > 
> 
> If you look into the git history you'll find this was attempted with
> CC_VENDOR_HYPERV. That proved to be a dead end as some things just can't be
> abstracted (GHCI vs GHCB; the encrypted bit works differently). What resulted
> was a ton of conditionals and duplication. After long discussions with Borislav
> we converged on clearly identifying with the underlying technology (SEV/TDX)
> and being explicit about support for optional parts in each scheme (like vTOM).

Can you provide more background?  For instance, why does the L2 need to know
the encrypted bit that is in *L1*?
  
Borislav Petkov Nov. 29, 2023, 4:40 p.m. UTC | #9
On Wed, Nov 22, 2023 at 06:19:20PM +0100, Jeremi Piotrowski wrote:
> Which approach do you prefer?

I'm trying to figure out from the whole thread, what this guest is.

* A HyperV second-level guest

* of type TDX

* Needs to defer cc_mask and page visibility bla...

* needs to disable TDX module calls

* stub out tdx_accept_memory

Anything else?

And my worry is that this is going to become a mess and your patches
already show that it is going in that direction because you need to run
the TDX side but still have *some* things done differently. Which is
needed because this is a different type of guest, even if it is a TDX
one.

Which reminds me, we have amd_cc_platform_vtom() which is a similar type
of thing.

And the TDX side could do something similar and at least *try* to
abstract away all that stuff.

Would it be nice? Of course not!

How can one model a virt zoo of at least a dozen guest types but still
keep code sane... :-\
  
Reshetova, Elena Nov. 30, 2023, 7:08 a.m. UTC | #10
> On Wed, Nov 22, 2023 at 06:19:20PM +0100, Jeremi Piotrowski wrote:
> > Which approach do you prefer?
> 
> I'm trying to figure out from the whole thread, what this guest is.
> 
> * A HyperV second-level guest
> 
> * of type TDX
> 
> * Needs to defer cc_mask and page visibility bla...
> 
> * needs to disable TDX module calls
> 
> * stub out tdx_accept_memory
> 
> Anything else?
> 

Actually I want to challenge the whole notion that a TDX partitioning L2
guest is a TDX guest. By the definition of the TDX partitioning spec, 
an L2 guest can be anything and the L1 VMM can emulate any environment
for its L2 guests, including running a fully unmodified TDX 1.0 guest
(and virtualizing TDG calls for it and whatever else is needed).
So we are really talking about a big spectrum of possible L2 guests:

1. Normal legacy guest without *any* TDX knowledge
2. Normal legacy guest with some awareness of TDX partitioning (note, not TDX 1.0 aware!) 
(being able to do tdvmcalls to L0, being able to use shared memory, 
being able to use tdx partitioning specific paravirt interface to L1 if defined, etc) 
... 
3. Normal TDX 1.0 guest that is unaware that it runs in partitioned environment
4. and so on

I don’t know if the AMD architecture would support this whole spectrum of guests, though.
If it does, then we can discuss this vendor-agnostically, which is much better.

Given the many possible combinations of L2 guests (and, as Kai rightfully
pointed out, each option will be further broken down by exactly what interface
the L1 VMM decides to expose to the guest), I think we cannot hardcode any logic
about what this partitioned guest is in the guest itself.
Instead we should have a flexible way for the L2 guest to discover the virt environment
it runs in (as modelled by the L1 VMM), and the baseline should not be to assume
it is a TDX or SEV guest, but to assume this is some special virt guest (or legacy guest,
whichever approach is cleaner) and expose additional interfaces to it.

Best Regards,
Elena.
  
Borislav Petkov Nov. 30, 2023, 7:55 a.m. UTC | #11
On Thu, Nov 30, 2023 at 07:08:00AM +0000, Reshetova, Elena wrote:
> ...
> 3. Normal TDX 1.0 guest that is unaware that it runs in partitioned
>    environment
> 4. and so on

There's a reason I call it a virt zoo.

> I don’t know if AMD architecture would support all this spectrum of
> the guests through.

I hear threats...

> Instead we should have a flexible way for the L2 guest to discover
> the virt environment it runs in (as modelled by L1 VMM) and the
> baseline should not to assume it is a TDX or SEV guest, but assume
> this is some special virt guest (or legacy guest, whatever approach
> is cleaner) and expose additional interfaces to it.

You can do flexible all you want but all that guest zoo is using the
kernel. The same code base which boots on gazillion incarnations of real
hardware. And we have trouble keeping that code base clean already.

Now, all those weird guests come along, they're more or less
"compatible" but not fully. So they have to do an exception here,
disable some feature there which they don't want/support/cannot/bla. Or
they use a paravisor which does *some* of the work for them so that
needs to be accommodated too.

And so they start sprinkling all those "differences" around the
kernel. And turn it into an unmaintainable mess. We've been here before
- last time it was called "if (XEN)"... and we're already getting there
again only with two L1 encrypted guests technologies. I'm currently
working on trimming down some of the SEV mess we've already added...

So - and I've said this a bunch of times already - whatever guest type
it is, its interaction with the main kernel better be properly designed
and abstracted away so that it doesn't turn into a mess.

Thx.
  
Reshetova, Elena Nov. 30, 2023, 8:31 a.m. UTC | #12
> On Thu, Nov 30, 2023 at 07:08:00AM +0000, Reshetova, Elena wrote:
> > ...
> > 3. Normal TDX 1.0 guest that is unaware that it runs in partitioned
> >    environment
> > 4. and so on
> 
> There's a reason I call it a virt zoo.
> 
> > I don’t know if AMD architecture would support all this spectrum of
> > the guests through.
> 
> I hear threats...

No threats whatsoever, I just truly don’t know the details of the SEV architecture
here and how it is envisioned to operate under this nesting scenario.
I raised this point to see if we can build a common understanding
of it. My personal understanding (please correct me) was that SEV
would also allow different types of L2 guests, so I think we should be
aligning on this.

> 
> > Instead we should have a flexible way for the L2 guest to discover
> > the virt environment it runs in (as modelled by L1 VMM) and the
> > baseline should not to assume it is a TDX or SEV guest, but assume
> > this is some special virt guest (or legacy guest, whatever approach
> > is cleaner) and expose additional interfaces to it.
> 
> You can do flexible all you want but all that guest zoo is using the
> kernel. The same code base which boots on gazillion incarnations of real
> hardware. And we have trouble keeping that code base clean already.

Fully agree, I wasn’t objecting to this. What I was objecting to is making
explicit assumptions about what the L2 guest under TDX partitioning is.

> 
> Now, all those weird guests come along, they're more or less
> "compatible" but not fully. So they have to do an exception here,
> disable some feature there which they don't want/support/cannot/bla. Or
> they use a paravisor which does *some* of the work for them so that
> needs to be accomodated too.
> 
> And so they start sprinkling around all those "differences" around the
> kernel. And turn it into an unmaintainable mess. We've been here before
> - last time it was called "if (XEN)"... and we're already getting there
> again only with two L1 encrypted guests technologies. I'm currently
> working on trimming down some of the SEV mess we've already added...
> 
> So - and I've said this a bunch of times already - whatever guest type
> it is, its interaction with the main kernel better be properly designed
> and abstracted away so that it doesn't turn into a mess.

Yes, agree, so what are our options and overall strategy on this?
We can try to push as much complexity as possible into the L1 VMM in this
scenario to keep the guest kernel almost free of these sprinkled differences.
After all, the L1 VMM can emulate whatever it wants for the guest.
We can also see if there is a true need to add another virtualization
abstraction here, i.e. "nested encrypted guest". But to justify this one
we need to have usecases/scenarios where the L1 VMM actually cannot run
the L2 guest (legacy or TDX enabled) as it is.
@Jeremi Piotrowski do you have such usecases/scenarios you can describe?

Any other options we should be considering as an overall strategy?

Best Regards,
Elena.

> 
> Thx.
> 
> --
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette
  
Borislav Petkov Nov. 30, 2023, 9:21 a.m. UTC | #13
On Thu, Nov 30, 2023 at 08:31:03AM +0000, Reshetova, Elena wrote:
> No threats whatsoever,

I don't mean you - others. :-)

> I just truly don’t know details of SEV architecture on this and how it
> envisioned to operate under this nesting scenario.  I raised this
> point to see if we can build the common understanding on this. My
> personal understanding (please correct me) was that SEV would also
> allow different types of L2 guests, so I think we should be aligning
> on this.

Yes, makes sense. The only L2 thing I've heard of is some Azure setup.

> Yes, agree, so what are our options and overall strategy on this?  We
> can try to push as much as possible complexity into L1 VMM in this
> scenario to keep the guest kernel almost free from these sprinkling
> differences.

I like that angle. :)

> Afterall the L1 VMM can emulate whatever it wants for the guest.
> We can also see if there is a true need to add another virtualization
> abstraction here, i.e. "nested encrypted guest".

Yes, that too. I'm sceptical but there are use cases with that paravisor
thing and being able to run an unmodified guest, i.e., something like
a very old OS which no one develops anymore but customers simply can't
part ways with it for raisins.

> Any other options we should be considering as overall strategy?

The less the kernel knows about guests, the better, I'd say.

The other thing I'm missing most of the time is, people come with those
patches enabling this and that but nowhere does it say: "we would love
to have this because of this great idea we had" or "brings so much more
performance" or "amazing code quality imrpovement" or, or other great
reason.

Rather, it is "yeah, we do this and you should support it". Well, nope.

Thx.
  
Jeremi Piotrowski Dec. 1, 2023, 4:16 p.m. UTC | #14
On 29/11/2023 05:36, Huang, Kai wrote:
> On Fri, 2023-11-24 at 17:19 +0100, Jeremi Piotrowski wrote:
>> On 24/11/2023 14:33, Kirill A. Shutemov wrote:
>>> On Fri, Nov 24, 2023 at 12:04:56PM +0100, Jeremi Piotrowski wrote:
>>>> On 24/11/2023 11:43, Kirill A. Shutemov wrote:
>>>>> On Fri, Nov 24, 2023 at 11:31:44AM +0100, Jeremi Piotrowski wrote:
>>>>>> On 23/11/2023 14:58, Kirill A. Shutemov wrote:
>>>>>>> On Wed, Nov 22, 2023 at 06:01:04PM +0100, Jeremi Piotrowski wrote:
>>>>>>>> Check for additional CPUID bits to identify TDX guests running with Trust
>>>>>>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>>>>>>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>>>>>>>
>>>>>>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is visible
>>>>>>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>>>>>>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>>>>>>>> aware mechanisms for what's left. So currently such guests do not have
>>>>>>>> X86_FEATURE_TDX_GUEST set.
>>>>>>>>
>>>>>>>> We want the kernel to have X86_FEATURE_TDX_GUEST set for all TDX guests so we
>>>>>>>> need to check these additional CPUID bits, but we skip further initialization
>>>>>>>> in the function as we aren't guaranteed access to TDX module calls.
>>>>>>>
>>>>>>> I don't follow. The idea of partitioning is that L2 OS can be
>>>>>>> unenlightened and have no idea if it runs indide of TD. But this patch
>>>>>>> tries to enumerate TDX anyway.
>>>>>>>
>>>>>>> Why?
>>>>>>>
>>>>>>
>>>>>> That's not the only idea of partitioning. Partitioning provides different privilege
>>>>>> levels within the TD, and unenlightened L2 OS can be made to work but are inefficient.
>>>>>> In our case Linux always runs enlightened (both with and without TD partitioning), and
>>>>>> uses TDX functionality where applicable (TDX vmcalls, PTE encryption bit).
>>>>>
>>>>> What value L1 adds in this case? If L2 has to be enlightened just run the
>>>>> enlightened OS directly as L1 and ditch half-measures. I think you can
>>>>> gain some performance this way.
>>>>>
>>>>
>>>> It's primarily about the privilege separation, performance is a reason
>>>> one doesn't want to run unenlightened. The L1 makes the following possible:
>>>> - TPM emulation within the trust domain but isolated from the OS
>>>> - infrastructure interfaces for things like VM live migration
>>>> - support for Virtual Trust Levels[1], Virtual Secure Mode[2]
>>>>
>>>> These provide a lot of value to users, it's not at all about half-measures.
> 
> It's not obvious why the listed things above are related to TDX guest.  They
> look more like hyperv specific enlightenment which don't have dependency of TDX
> guest.
> 
> For instance, the "Emulating Hyper-V VSM with KVM" design in your above [2] says
> nothing about TDX (or SEV):
> 
> https://lore.kernel.org/lkml/20231108111806.92604-34-nsaenz@amazon.com/
> 

These are features that require TD-partitioning to implement in the context of a
TDX guest. I was trying to answer Kirill's question "What value L1 adds in this case?".

In SEV-SNP we implement the same features using VMPLs (privilege levels) and an SVSM
(but we call it a paravisor).

I should have elaborated on what I meant with [2]: it shows how others are trying to 
implement Virtual Trust Levels in KVM for non-confidential guests. You can't do that
for TDX without TD-partitioning.

>>>
>>> Hm. Okay.
>>>
>>> Can we take a step back? What is bigger picture here? What enlightenment
>>> do you expect from the guest when everything is in-place?
>>>
>>
>> All the functional enlightenment are already in place in the kernel and
>> everything works (correct me if I'm wrong Dexuan/Michael). The enlightenments
>> are that TDX VMCALLs are needed for MSR manipulation and vmbus operations,
>> encrypted bit needs to be manipulated in the page tables and page
>> visibility propagated to VMM.
> 
> Not quite family with hyperv enlightenments, but are these enlightenments TDX
> guest specific?  Because if they are not, then they should be able to be
> emulated by the normal hyperv, thus the hyperv as L1 (which is TDX guest) can
> emulate them w/o letting the L2 know the hypervisor it runs on is actually a TDX
> guest.

I would say that these hyperv enlightenments are confidential guest specific
(TDX/SNP) when running with TD-partitioning/VMPL. In both cases there are TDX/SNP
specific ways to exit directly to L0 (when needed) and native privileged instructions
trap to the paravisor.

L1 is not hyperv and no one wants to emulate the I/O path. The L2 guest knows that
it's confidential so that it can explicitly use swiotlb, toggle page visibility
and notify the host (L0) on the I/O path without incurring additional emulation
overhead.

> 
> Btw, even if there's performance concern here, as you mentioned the TDVMCALL is
> actually made to the L0 which means L0 must be aware such VMCALL is from L2 and
> needs to be injected to L1 to handle, which IMHO not only complicates the L0 but
> also may not have any performance benefits.

The TDVMCALLs are related to the I/O path (networking/block io) into the L2 guest, and
so they intentionally go straight to L0 and are never injected to L1. L1 is not
involved in that path at all.

Using something different than TDVMCALLs here would lead to additional traps to L1 and
just add latency/complexity.

> 
>>
>> Whats missing is the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
>> and as a result dmesg does not currently display:
>> "Memory Encryption Features active: Intel TDX".
>>
>> That's what I set out to correct.
>>
>>> So far I see that you try to get kernel think that it runs as TDX guest,
>>> but not really. This is not very convincing model.
>>>
>>
>> No that's not accurate at all. The kernel is running as a TDX guest so I
>> want the kernel to know that. 
>>
> 
> But it isn't.  It runs on a hypervisor which is a TDX guest, but this doesn't
> make itself a TDX guest.> 

That depends on your definition of "TDX guest". The TDX 1.5 TD partitioning spec
talks of TDX-enlightened L1 VMM, (optionally) TDX-enlightened L2 VM and Unmodified
Legacy L2 VM. Here we're dealing with a TDX-enlightened L2 VM.

If a guest runs inside an Intel TDX protected TD, is aware of memory encryption and
issues TDVMCALLs - to me that makes it a TDX guest.

>> TDX is not a monolithic thing, it has different
>> features that can be in-use and it has differences in behavior when running
>> with TD partitioning (example: no #VE/TDX module calls). So those differences
>> need to be clearly modeled in code.
> 
> Well IMHO this is a design choice but not a fact.  E.g., if we never sets
> TDX_GUEST flag for L2 then it naturally doesn't use TDX guest related staff. 
> Otherwise we need additional patches like your patch 2/3 in this series to stop
> the L2 to use certain TDX functionality.
> 
> And I guess we will need more patches to stop L2 from doing TDX guest things. 
> E.g., we might want to disable TDX attestation interface support in the L2
> guest, because the L2 is indeed not a TDX guest..
> 

The TDX attestation interface uses a TDCALL; I covered that in patch 2. The only
extra patch missing is to not rely on tdx_safe_halt (which is a TDVMCALL).

> So to me even there's value to advertise L2 as TDX guest, the pros/cons need to
> be evaluated to see whether it is worth.
> 

This all started from this discussion [1]. X86_FEATURE_TDX_GUEST is needed so
that dmesg contains the correct print ("Memory Encryption Features active: Intel TDX")
and so that userspace sees X86_FEATURE_TDX_GUEST. I think it's fair to want parity
here so that a workload scheduler like kubernetes knows how to place a workload
in a TDX protected VM, regardless of what hypervisor runs underneath.

[1]: https://lore.kernel.org/lkml/1699546489-4606-1-git-send-email-jpiotrowski@linux.microsoft.com/

>>
>>> Why does L2 need to know if it runs under TDX or SEV? Can't it just think
>>> it runs as Hyper-V guest and all difference between TDX and SEV abstracted
>>> by L1?
>>>
>>
>> If you look into the git history you'll find this was attempted with
>> CC_VENDOR_HYPERV. That proved to be a dead end as some things just can't be
>> abstracted (GHCI vs GHCB; the encrypted bit works differently). What resulted
>> was a ton of conditionals and duplication. After long discussions with Borislav
>> we converged on clearly identifying with the underlying technology (SEV/TDX)
>> and being explicit about support for optional parts in each scheme (like vTOM).
> 
> Can you provide more background?  For instance, why does the L2 needs to know
> the encrypted bit that is in *L1*?
> 
> 

Michael or Dexuan know way more about this than me, but my understanding is that when
the L2 guest accesses a bounce buffer that it has shared with the host, it needs
to do so through the correct kind of mapping, one with the "decrypted bit" set.
  
Reshetova, Elena Dec. 4, 2023, 9:17 a.m. UTC | #15
> Check for additional CPUID bits to identify TDX guests running with Trust
> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> 
> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
> visible
> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> aware mechanisms for what's left. So currently such guests do not have
> X86_FEATURE_TDX_GUEST set.

Back to this concrete patch. Why can't the L1 VMM emulate the correct value of
TDX_CPUID_LEAF_ID for the L2 VM? It can do this per the TDX partitioning arch.
How do you currently handle this and other CPUID calls in L1? Per the spec,
all CPUID calls from L2 will cause an L2 --> L1 exit, so what do you do in L1?

Given that you do that simple emulation, you already end up with TDX guest
code being activated. Next you can check what features you won't be able to
provide in L1 and create simple emulation calls for the TDG calls that must be
supported and cannot return an error. The biggest TDG call (TDVMCALL) is already a
direct call into the L0 VMM, so this part doesn’t require L1 VMM support.
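
For concreteness, a hypothetical sketch of the emulation I mean - the L1-side
handler name and structure are made up; only the 0x21 leaf and the register
layout match what the guest's tdx_early_init() checks against TDX_IDENT
("IntelTDX    "):

#include <linux/types.h>

#define TDX_CPUID_LEAF_ID	0x21

/*
 * Hypothetical L1 VMM handler for an L2 CPUID exit (not real paravisor
 * code): report the TDX vendor leaf so that the guest's
 * cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1])
 * memcmp() against TDX_IDENT succeeds.
 */
static void l1_emulate_cpuid(u32 leaf, u32 subleaf,
			     u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
	if (leaf == TDX_CPUID_LEAF_ID && subleaf == 0) {
		*eax = 0;
		*ebx = 0x65746e49;	/* "Inte" */
		*edx = 0x5844546c;	/* "lTDX" */
		*ecx = 0x20202020;	/* "    " */
		return;
	}
	/* ... forward or synthesize other leaves as appropriate ... */
}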

Until we really see what breaks with this approach, I don’t think it is worth
taking in the complexity to support different L1 hypervisors' views on partitioning.

Best Regards,
Elena.
  
Jeremi Piotrowski Dec. 4, 2023, 1:39 p.m. UTC | #16
On 30/11/2023 09:31, Reshetova, Elena wrote:
> 
>> On Thu, Nov 30, 2023 at 07:08:00AM +0000, Reshetova, Elena wrote:
>>> ...
>>> 3. Normal TDX 1.0 guest that is unaware that it runs in partitioned
>>>    environment
>>> 4. and so on
>>
>> There's a reason I call it a virt zoo.
>>
>>> I don’t know if AMD architecture would support all this spectrum of
>>> the guests through.
>>
>> I hear threats...
> 
> No threats whatsoever, I just truly don’t know details of SEV architecture
> on this and how it envisioned to operate under this nesting scenario.
> I raised this point to see if we can build the common understanding 
> on this. My personal understanding (please correct me) was that SEV
> would also allow different types of L2 guests, so I think we should be
> aligning on this.
I don't think SNP allows the level of freedom you describe. But regardless
of the possibilities, I can't think of users of this technology actually
being interested in running all of these options. Would love to hear someone
speak up.

I think of VMPLs provided by SNP and the TD-partitioning L1/L2 scheme as
the equivalent of ARM's trustzone/EL3 concept. It lets you separate high
privilege operations into a hardware isolated context. In this case it's
within the same confidential computing boundary. That's our use case.
 
> 
>>
>>> Instead we should have a flexible way for the L2 guest to discover
>>> the virt environment it runs in (as modelled by L1 VMM) and the
>>> baseline should not to assume it is a TDX or SEV guest, but assume
>>> this is some special virt guest (or legacy guest, whatever approach
>>> is cleaner) and expose additional interfaces to it.
>>
>> You can do flexible all you want but all that guest zoo is using the
>> kernel. The same code base which boots on gazillion incarnations of real
>> hardware. And we have trouble keeping that code base clean already.
> 
> Fully agree, I wasn’t objecting this. What I was objecting is to make
> explicit assumptions on what the L2 guest under TDX partitioning is. 
> 

That's fair, my intention was to have this simple logic (if td-partitioning,
then this and this is given) until a different user of TD-partitioning comes
along and then we figure out which parts generalize.

>>
>> Now, all those weird guests come along, they're more or less
>> "compatible" but not fully. So they have to do an exception here,
>> disable some feature there which they don't want/support/cannot/bla. Or
>> they use a paravisor which does *some* of the work for them so that
>> needs to be accomodated too.
>>
>> And so they start sprinkling around all those "differences" around the
>> kernel. And turn it into an unmaintainable mess. We've been here before
>> - last time it was called "if (XEN)"... and we're already getting there
>> again only with two L1 encrypted guests technologies. I'm currently
>> working on trimming down some of the SEV mess we've already added...
>>
>> So - and I've said this a bunch of times already - whatever guest type
>> it is, its interaction with the main kernel better be properly designed
>> and abstracted away so that it doesn't turn into a mess.
> 
> Yes, agree, so what are our options and overall strategy on this? 
> We can try to push as much as possible complexity into L1 VMM in this
> scenario to keep the guest kernel almost free from these sprinkling differences.
> Afterall the L1 VMM can emulate whatever it wants for the guest.
> We can also see if there is a true need to add another virtualization
> abstraction here, i.e. "nested encrypted guest". But to justify this one
> we need to have usecases/scenarios where L1 VMM actually cannot run
> L2 guest (legacy or TDX enabled) as it is. 
> @Jeremi Piotrowski do you have such usecase/scenarios you can describe?
> > Any other options we should be considering as overall strategy? 

Just taking a step back: we're big SNP and TDX users. The only kind of guest
that meets our users' needs on both SNP and TDX and that we support across
our server fleet is closest to what you listed as 2:
"guest with a CoCo security module (paravisor) and targeted CoCo enlightenments".

We're aligned on the need to push complexity out of the kernel, which is exactly
what has happened (also across vendors): the guest is mostly unconcerned by the
differences between TDX and SNP (except the notification hypercall in the I/O path),
does not need all the changes in the early boot code that TDX/SNP have forced,
switches page visibility with the same hypercall for both, etc.

I'm not aware of use cases for fully legacy guests, and my guess is they would suffer
from excessive overhead.

I am also not aware of use cases for "pretending to be a TDX 1.0 guest". Doing that
removes opportunities to share kernel code with normal guests and SNP guests on hyperv.
I'd also like to point out something that Michael wrote here[1] regarding paravisor
interfaces:
"But it seems like any kind of (forwarding) scheme needs to be a well-defined contract
that would work for both TDX and SEV-SNP."

[1]: https://lore.kernel.org/lkml/SN6PR02MB415717E09C249A31F2A4E229D4BCA@SN6PR02MB4157.namprd02.prod.outlook.com/

Another thing to note: in SNP you *need* to interact with VMPL0 (~L1 VMM) when
running at other VMPLs (e.g. pvalidate and rmpadjust are only possible at VMPL0), so
the kernel needs to cooperate with VMPL0 to operate. On skimming the TD-part
spec, I'm not sure how "supporting fast-path I/O" would be possible while supporting
a "TDX 1.0 guest" with no TD-part awareness (if you need to trap all TDVMCALLs then
that's not OK).

Now back to the topic at hand: I think what's needed is to stop treating
X86_FEATURE_TDX_GUEST as an all-or-nothing thing. Split out the individual
enlightenments into separate CC attributes and allow them to be selected without
requiring you to buy the whole zoo. I don't think we need a "nested encrypted guest"
abstraction.
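
To make that concrete, a rough sketch of what such a split could look like -
CC_ATTR_TDX_MODULE_CALLS is a made-up attribute name, while cc_platform_has()/
intel_cc_platform_has() are the existing mechanism in arch/x86/coco/core.c and
tdx_partitioning_active is from this series:

#include <linux/cc_platform.h>

/*
 * Hedged sketch, not a real patch: gate individual TDX enlightenments
 * on CC attributes instead of keying everything off
 * X86_FEATURE_TDX_GUEST. CC_ATTR_TDX_MODULE_CALLS is hypothetical.
 */
static bool intel_cc_platform_has(enum cc_attr attr)
{
	switch (attr) {
	case CC_ATTR_GUEST_MEM_ENCRYPT:
	case CC_ATTR_MEM_ENCRYPT:
		return true;
	case CC_ATTR_TDX_MODULE_CALLS:		/* hypothetical attribute */
		/* TDCALLs are owned by L1 when TD partitioning is active. */
		return !tdx_partitioning_active;
	default:
		return false;
	}
}

Callers would then check cc_platform_has(CC_ATTR_TDX_MODULE_CALLS) instead of
X86_FEATURE_TDX_GUEST before issuing a TDCALL.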

Jeremi

> 
> Best Regards,
> Elena.
> 
>>
>> Thx.
>>
>> --
>> Regards/Gruss,
>>     Boris.
>>
>> https://people.kernel.org/tglx/notes-about-netiquette
  
Jeremi Piotrowski Dec. 4, 2023, 4:44 p.m. UTC | #17
On 30/11/2023 10:21, Borislav Petkov wrote:
> On Thu, Nov 30, 2023 at 08:31:03AM +0000, Reshetova, Elena wrote:
>> No threats whatsoever,
> 
> I don't mean you - others. :-)
> 
>> I just truly don’t know details of SEV architecture on this and how it
>> envisioned to operate under this nesting scenario.  I raised this
>> point to see if we can build the common understanding on this. My
>> personal understanding (please correct me) was that SEV would also
>> allow different types of L2 guests, so I think we should be aligning
>> on this.
> 
> Yes, makes sense. The only L2 thing I've heard of is some Azure setup.

"L2" is the Intel terminology in TD-partitioning (see figure 11.2 in [1]),
but Azure uses it for the exact same thing as VMPLs in SEV-SNP. On the AMD
side the community is working on Coconut SVSM[2] and there was an AMD SVSM[3]
project before that, both of them have the same idea as the Azure paravisor.
SUSE, Red Hat, IBM and others are active in SVSM development, we've also started
contributing. I think this kind of usage will also appear on TDX soon.

[1]: https://cdrdv2.intel.com/v1/dl/getContent/773039
[2]: https://github.com/coconut-svsm/svsm
[3]: https://github.com/AMDESE/linux-svsm

> 
>> Yes, agree, so what are our options and overall strategy on this?  We
>> can try to push as much as possible complexity into L1 VMM in this
>> scenario to keep the guest kernel almost free from these sprinkling
>> differences.
> 
> I like that angle. :)
> 
>> Afterall the L1 VMM can emulate whatever it wants for the guest.
>> We can also see if there is a true need to add another virtualization
>> abstraction here, i.e. "nested encrypted guest".
> 
> Yes, that too. I'm sceptical but there are use cases with that paravisor
> thing and being able to run an unmodified guest, i.e., something like
> a very old OS which no one develops anymore but customers simply can't
> part ways with it for raisins

I don't think we'll be seeing Windows XP TDX/SNP guests :)

The primary use case for the paravisor (/SVSM) is pretty common across
the industry and projects that I shared: TPM emulation. Then comes the
other stuff: live migration, "trustlets", further device emulation. All this
inside the confidential boundary.

> 
>> Any other options we should be considering as overall strategy?
> 
> The less the kernel knows about guests, the better, I'd say.
> 

The challenge lies in navigating the Venn diagram of: standard guest,
TDX guest and SNP guest. Is a "guest" more "TDX" or more "Hyper-V" (or KVM/VMware)?
Should the TDX (and SNP) code be extended with more knowledge of hypervisor
interfaces, or should the hypervisor interfaces know more about TDX/SNP?

I'd love it if we all had some agreement on this. I posted these patches
based on suggestions that TDX guest-ness takes precedence.

> The other thing I'm missing most of the time is, people come with those
> patches enabling this and that but nowhere does it say: "we would love
> to have this because of this great idea we had" or "brings so much more
> performance" or "amazing code quality imrpovement" or, or other great
> reason.
> 
> Rather, it is "yeah, we do this and you should support it". Well, nope.
> 
I hear you, that's why we've been making an effort to be more open about use cases
and capabilities along with patches. We also care about simplifying the code
to make it easier to follow the flows. I think the whole kernel has come
a long way since the first confidential guest patches were posted.

Jeremi

> Thx.
>
  
Jeremi Piotrowski Dec. 4, 2023, 7:07 p.m. UTC | #18
On 04/12/2023 10:17, Reshetova, Elena wrote:
>> Check for additional CPUID bits to identify TDX guests running with Trust
>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>
>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
>> visible
>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>> aware mechanisms for what's left. So currently such guests do not have
>> X86_FEATURE_TDX_GUEST set.
> 
> Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
> the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
> How do you handle this and other CPUID calls call currently in L1? Per spec,
> all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
The disclaimer here is that I don't have access to the paravisor (L1) code. But
to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
the L2 guest in order to discriminate a guest that is solely responsible for every
TDX mechanism (running at L1) from one running at L2 that has to cooperate with L1.
More below.

> 
> Given that you do that simple emulation, you already end up with TDX guest
> code being activated. Next you can check what features you wont be able to
> provide in L1 and create simple emulation calls for the TDG calls that must be
> supported and cannot return error. The biggest TDG call (TDVMCALL) is already
> direct call into L0 VMM, so this part doesn’t require L1 VMM support. 

I don't see anything in the TD-partitioning spec that gives the TDX guest a way
to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
So in any case this requires an extra cpuid call to establish the environment.
Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.

I'll give some examples of where the idea of emulating a TDX environment
without attempting L1-L2 cooperation breaks down.

hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
housekeeping.
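(To make the routing difference concrete, a rough sketch of the two halt
paths, loosely paraphrasing the halt helpers in arch/x86/coco/tdx/tdx.c --
helper names and signatures are approximate and may differ by kernel
version:)

/* Paravirtualized halt: a TDVMCALL(EXIT_REASON_HLT), which under
 * TD partitioning is delivered straight to L0 and bypasses L1. */
static void halt_via_tdvmcall(void)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_HLT),
        };

        __tdx_hypercall(&args);
}

/* Plain HLT instruction: this exit is routed to L1, which is where
 * the paravisor gets its chance to do housekeeping. */
static void halt_native(void)
{
        native_halt();
}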

map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which is the actual
entity that needs to have a say in performing the conversion. L1 can't act on the request
if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of sync.
The only safe approach is for L2 to use a different mechanism to trap to L1 explicitly.

Having a paravisor is required to support a TPM and having TDVMCALLs go to L0 is
required to make performance viable for real workloads.

> 
> Until we really see what breaks with this approach, I don’t think it is worth to
> take in the complexity to support different L1 hypervisors view on partitioning.
> 

I'm not asking to support different L1 hypervisors view on partitioning, I want to
clean up the code (by fixing assumptions that no longer hold) for the model that I'm
describing that: the kernel already supports, has an implementation that works and
has actual users. This is also a model that Intel intentionally created the TD-partitioning
spec to support.

So lets work together to make X86_FEATURE_TDX_GUEST match reality.

Best regards,
Jeremi

> Best Regards,
> Elena.
> 
>
  
Jeremi Piotrowski Dec. 4, 2023, 7:37 p.m. UTC | #19
On 29/11/2023 17:40, Borislav Petkov wrote:
> On Wed, Nov 22, 2023 at 06:19:20PM +0100, Jeremi Piotrowski wrote:
>> Which approach do you prefer?
> 
> I'm trying to figure out from the whole thread, what this guest is.

Wanted to clarify some things directly here. This type of guest is supported
in the kernel already[1], so this whole series is the kind of attempt to
share more code that you advocated for in another email.

[1]: https://lore.kernel.org/lkml/20230824080712.30327-1-decui@microsoft.com/#t

> 
> * A HyperV second-level guest

From Hyper-V's point of view it's a TDX guest with privilege levels inside, not
second-level...

> 
> * of type TDX

...but Intel TDX calls these privilege levels L1 and L2 instead of VMPL0/VMPL1-3.

> 
> * Needs to defer cc_mask and page visibility bla...
>

The implementations in tdx_early_init() depend on TDX module calls (not available)
and the correct calls are standard Hyper-V hypercalls (same as for vTOM SNP guests).

> * needs to disable TDX module calls
> 
> * stub out tdx_accept_memory

This is actually a fix for something that only works by accident right now,
which I meant to post separately from the rest of the discussion.

If you look at arch/x86/include/asm/unaccepted_memory.h (below), it is used by both
CONFIG_INTEL_TDX_GUEST and CONFIG_AMD_MEM_ENCRYPT, but there is no tdx_accept_memory
implementation when CONFIG_INTEL_TDX_GUEST is not set. This is subtle and confusing;
the stub should be there.
 
static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end)
{
        /* Platform-specific memory-acceptance call goes here */
        if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
                if (!tdx_accept_memory(start, end))
                        panic("TDX: Failed to accept memory\n");
        } else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
                snp_accept_memory(start, end);
        } else {
                panic("Cannot accept memory: unknown platform\n");
        }
}
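(A minimal sketch of the kind of stub meant here, assuming it would sit
next to the other !CONFIG_INTEL_TDX_GUEST inline stubs -- the exact header
it lands in is a detail:)

#else  /* !CONFIG_INTEL_TDX_GUEST */

/* Never reached: arch_accept_memory() only calls tdx_accept_memory()
 * when X86_FEATURE_TDX_GUEST is set, which implies the config option.
 * The stub just keeps the build honest. */
static inline bool tdx_accept_memory(phys_addr_t start, phys_addr_t end)
{
        return false;
}

#endif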

> 
> Anything else?
> 
> And my worry is that this is going to become a mess and your patches
> already show that it is going in that direction because you need to run
> the TDX side but still have *some* things done differently. Which is
> needed because this is a different type of guest, even if it is a TDX
> one.
> 
> Which reminds me, we have amd_cc_platform_vtom() which is a similar type
> of thing.
> 
> And the TDX side could do something similar and at least *try* to
> abstract away all that stuff.
> 
> Would it be nice? Of course not!
> 
> How can one model a virt zoo of at least a dozen guest types but still
> keep code sane... :-\
>
  
Kirill A. Shutemov Dec. 5, 2023, 10:54 a.m. UTC | #20
On Mon, Dec 04, 2023 at 08:07:38PM +0100, Jeremi Piotrowski wrote:
> On 04/12/2023 10:17, Reshetova, Elena wrote:
> >> Check for additional CPUID bits to identify TDX guests running with Trust
> >> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> >> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> >>
> >> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
> >> visible
> >> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> >> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> >> aware mechanisms for what's left. So currently such guests do not have
> >> X86_FEATURE_TDX_GUEST set.
> > 
> > Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
> > the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
> > How do you handle this and other CPUID calls call currently in L1? Per spec,
> > all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
> The disclaimer here is that I don't have access to the paravisor (L1) code. But
> to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
> module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
> the L2 guest in order to discriminate a guest that is solely responsible for every
> TDX mechanism (running at L1) from one running at L2 that has to cooperate with L1.
> More below.
> 
> > 
> > Given that you do that simple emulation, you already end up with TDX guest
> > code being activated. Next you can check what features you wont be able to
> > provide in L1 and create simple emulation calls for the TDG calls that must be
> > supported and cannot return error. The biggest TDG call (TDVMCALL) is already
> > direct call into L0 VMM, so this part doesn’t require L1 VMM support. 
> 
> I don't see anything in the TD-partitioning spec that gives the TDX guest a way
> to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
> So in any case this requires an extra cpuid call to establish the environment.
> Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.
> 
> I'll give some examples of where the idea of emulating a TDX environment
> without attempting L1-L2 cooperation breaks down.
> 
> hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
> it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
> housekeeping.

Why would L2 issue HLT TDVMCALL? It only happens in response to #VE, but
if partitioning enabled #VEs are routed to L1 anyway.

> map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which is the actual
> entity that needs to have a say in performing the conversion. L1 can't act on the request
> if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of sync.
> The only safe approach is for L2 to use a different mechanism to trap to L1 explicitly.

Hm? L1 is always in loop on share<->private conversion. I don't know why
you need MAP_GPA for that.

You can't rely on MAP_GPA anyway. It is optional (unfortunately). Conversion
doesn't require MAP_GPA call.

> Having a paravisor is required to support a TPM and having TDVMCALLs go to L0 is
> required to make performance viable for real workloads.
> 
> > 
> > Until we really see what breaks with this approach, I don’t think it is worth to
> > take in the complexity to support different L1 hypervisors view on partitioning.
> > 
> 
> I'm not asking to support different L1 hypervisors view on partitioning, I want to
> clean up the code (by fixing assumptions that no longer hold) for the model that I'm
> describing that: the kernel already supports, has an implementation that works and
> has actual users. This is also a model that Intel intentionally created the TD-partitioning
> spec to support.
> 
> So lets work together to make X86_FEATURE_TDX_GUEST match reality.

I think the right direction is to make TDX architecture good enough
without that. If we need more hooks in TDX module that give required
control to L1, let's do that. (I don't see it so far)
  
Reshetova, Elena Dec. 5, 2023, 1:24 p.m. UTC | #21
> On 04/12/2023 10:17, Reshetova, Elena wrote:
> >> Check for additional CPUID bits to identify TDX guests running with Trust
> >> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> >> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD
> VM(s).
> >>
> >> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
> >> visible
> >> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> >> are controlled by the L1 VMM and the L2 TDX guest needs to use TD
> partitioning
> >> aware mechanisms for what's left. So currently such guests do not have
> >> X86_FEATURE_TDX_GUEST set.
> >
> > Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
> > the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
> > How do you handle this and other CPUID calls call currently in L1? Per spec,
> > all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
> The disclaimer here is that I don't have access to the paravisor (L1) code. But
> to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
> module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
> the L2 guest in order to discriminate a guest that is solely responsible for every
> TDX mechanism (running at L1) from one running at L2 that has to cooperate
> with L1.
> More below.

OK, so in your case it is a decision of L1 VMM not to set the TDX_CPUID_LEAF_ID
to reflect that it is a tdx guest and it is on purpose because you want to 
drop into a special tdx guest, i.e. partitioned guest. 

> 
> >
> > Given that you do that simple emulation, you already end up with TDX guest
> > code being activated. Next you can check what features you wont be able to
> > provide in L1 and create simple emulation calls for the TDG calls that must be
> > supported and cannot return error. The biggest TDG call (TDVMCALL) is already
> > direct call into L0 VMM, so this part doesn’t require L1 VMM support.
> 
> I don't see anything in the TD-partitioning spec that gives the TDX guest a way
> to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
> So in any case this requires an extra cpuid call to establish the environment.
> Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.

Yes, there is nothing like this in the spec and it is on purpose, because the idea is that
L1 can fully control the environment for L2 and virtualize it in the way it wants. 

> 
> I'll give some examples of where the idea of emulating a TDX environment
> without attempting L1-L2 cooperation breaks down.
> 
> hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
> it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
> housekeeping.
> 
> map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which
> is the actual
> entity that needs to have a say in performing the conversion. L1 can't act on the
> request
> if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of
> sync.
> The only safe approach is for L2 to use a different mechanism to trap to L1
> explicitly.

Interesting, thank you for the examples! What it looks like to me is that if we gave
the L1 VMM the ability to specify which TDVMCALL leaves should go to L0 and
which ones should end up in L1, we would actually address your usecase more
cleanly without the fragmentation of the tdx guest code.
Is that a viable option for you?
I do understand that such an option doesn't exist at the moment, but if there is
a good usecase, we can argue that it is needed.

Best Regards,
Elena.
  
Kai Huang Dec. 5, 2023, 1:26 p.m. UTC | #22
> 
> > > > 
> > > > Hm. Okay.
> > > > 
> > > > Can we take a step back? What is bigger picture here? What enlightenment
> > > > do you expect from the guest when everything is in-place?
> > > > 
> > > 
> > > All the functional enlightenment are already in place in the kernel and
> > > everything works (correct me if I'm wrong Dexuan/Michael). The enlightenments
> > > are that TDX VMCALLs are needed for MSR manipulation and vmbus operations,
> > > encrypted bit needs to be manipulated in the page tables and page
> > > visibility propagated to VMM.
> > 
> > Not quite family with hyperv enlightenments, but are these enlightenments TDX
> > guest specific?  Because if they are not, then they should be able to be
> > emulated by the normal hyperv, thus the hyperv as L1 (which is TDX guest) can
> > emulate them w/o letting the L2 know the hypervisor it runs on is actually a TDX
> > guest.
> 
> I would say that these hyperv enlightenments are confidential guest specific
> (TDX/SNP) when running with TD-partitioning/VMPL. In both cases there are TDX/SNP
> specific ways to exit directly to L0 (when needed) and native privileged instructions
> trap to the paravisor.
> 
> L1 is not hyperv and no one wants to emulate the I/O path. The L2 guest knows that
> it's confidential so that it can explicitly use swiotlb, toggle page visibility
> and notify the host (L0) on the I/O path without incurring additional emulation
> overhead.
> 
> > 
> > Btw, even if there's performance concern here, as you mentioned the TDVMCALL is
> > actually made to the L0 which means L0 must be aware such VMCALL is from L2 and
> > needs to be injected to L1 to handle, which IMHO not only complicates the L0 but
> > also may not have any performance benefits.
> 
> The TDVMCALLs are related to the I/O path (networking/block io) into the L2 guest, and
> so they intentionally go straight to L0 and are never injected to L1. L1 is not
> involved in that path at all.
> 
> Using something different than TDVMCALLs here would lead to additional traps to L1 and
> just add latency/complexity.

Looks by default you assume we should use TDX partitioning as "paravisor L1" +
"L0 device I/O emulation".

I think we are lacking background of this usage model and how it works.  For
instance, typically L2 is created by L1, and L1 is responsible for L2's device
I/O emulation.  I don't quite understand how could L0 emulate L2's device I/O?

Can you provide more information?

> 
> > 
> > > 
> > > Whats missing is the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
> > > and as a result dmesg does not currently display:
> > > "Memory Encryption Features active: Intel TDX".
> > > 
> > > That's what I set out to correct.
> > > 
> > > > So far I see that you try to get kernel think that it runs as TDX guest,
> > > > but not really. This is not very convincing model.
> > > > 
> > > 
> > > No that's not accurate at all. The kernel is running as a TDX guest so I
> > > want the kernel to know that. 
> > > 
> > 
> > But it isn't.  It runs on a hypervisor which is a TDX guest, but this doesn't
> > make itself a TDX guest.> 
> 
> That depends on your definition of "TDX guest". The TDX 1.5 TD partitioning spec
> talks of TDX-enlightened L1 VMM, (optionally) TDX-enlightened L2 VM and Unmodified
> Legacy L2 VM. Here we're dealing with a TDX-enlightened L2 VM.
> 
> If a guest runs inside an Intel TDX protected TD, is aware of memory encryption and
> issues TDVMCALLs - to me that makes it a TDX guest.

The thing I don't quite understand is what enlightenment(s) requires L2 to issue
TDVMCALL and know "encryption bit".

The reason that I can think of is:

If device I/O emulation of L2 is done by L0 then I guess it's reasonable to make
L2 aware of the "encryption bit" because L0 can only write emulated data to
shared buffer.  The shared buffer must be initially converted by the L2 by using
MAP_GPA TDVMCALL to L0 (to zap private pages in S-EPT etc), and L2 needs to know
the "encryption bit" to set up its page table properly.  L1 must be aware of
such private <-> shared conversion too to setup page table properly so L1 must
also be notified.

The concern I am having is whether there's other usage model(s) that we need to
consider.  For instance, running both unmodified L2 and enlightened L2.  Or some
L2 only needs TDVMCALL enlightenment but no "encryption bit".

In other words, that seems pretty much L1 hypervisor/paravisor implementation
specific.  I am wondering whether we can completely hide the enlightenment(s)
logic to hypervisor/paravisor specific code but not generically mark L2 as TDX
guest but still need to disable TDCALL sort of things.

Hope we are getting closer to be on the same page.
  
Jeremi Piotrowski Dec. 6, 2023, 5:49 p.m. UTC | #23
On 05/12/2023 11:54, Kirill A. Shutemov wrote:
> On Mon, Dec 04, 2023 at 08:07:38PM +0100, Jeremi Piotrowski wrote:
>> On 04/12/2023 10:17, Reshetova, Elena wrote:
>>>> Check for additional CPUID bits to identify TDX guests running with Trust
>>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>>>
>>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
>>>> visible
>>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>>>> aware mechanisms for what's left. So currently such guests do not have
>>>> X86_FEATURE_TDX_GUEST set.
>>>
>>> Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
>>> the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
>>> How do you handle this and other CPUID calls call currently in L1? Per spec,
>>> all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
>> The disclaimer here is that I don't have access to the paravisor (L1) code. But
>> to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
>> module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
>> the L2 guest in order to discriminate a guest that is solely responsible for every
>> TDX mechanism (running at L1) from one running at L2 that has to cooperate with L1.
>> More below.
>>
>>>
>>> Given that you do that simple emulation, you already end up with TDX guest
>>> code being activated. Next you can check what features you wont be able to
>>> provide in L1 and create simple emulation calls for the TDG calls that must be
>>> supported and cannot return error. The biggest TDG call (TDVMCALL) is already
>>> direct call into L0 VMM, so this part doesn’t require L1 VMM support. 
>>
>> I don't see anything in the TD-partitioning spec that gives the TDX guest a way
>> to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
>> So in any case this requires an extra cpuid call to establish the environment.
>> Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.
>>
>> I'll give some examples of where the idea of emulating a TDX environment
>> without attempting L1-L2 cooperation breaks down.
>>
>> hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
>> it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
>> housekeeping.
> 
> Why would L2 issue HLT TDVMCALL? It only happens in response to #VE, but
> if partitioning enabled #VEs are routed to L1 anyway.

What about tdx_safe_halt? When X86_FEATURE_TDX_GUEST is defined I see
"using TDX aware idle routing" in dmesg.

> 
>> map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which is the actual
>> entity that needs to have a say in performing the conversion. L1 can't act on the request
>> if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of sync.
>> The only safe approach is for L2 to use a different mechanism to trap to L1 explicitly.
> 
> Hm? L1 is always in loop on share<->private conversion. I don't know why
> you need MAP_GPA for that.
> 
> You can't rely on MAP_GPA anyway. It is optional (unfortunately). Conversion
> doesn't require MAP_GPA call.
> 

I'm sorry, I don't quite follow. I'm reading tdx_enc_status_changed():
- TDVMCALL_MAP_GPA is issued for all transitions
- TDX_ACCEPT_PAGE is issued for shared->private transitions

This doesn't work in partitioning when TDVMCALLs go to L0: TDVMCALL_MAP_GPA bypasses
L1 and TDX_ACCEPT_PAGE is L1 responsibility.

If you want to see how this is currently supported take a look at arch/x86/hyperv/ivm.c.
All memory starts as private and there is a hypercall to notify the paravisor for both
TDX (when partitioning) and SNP (when VMPL). This guarantees that all page conversions
go through L1.
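(To make the contrast concrete, a hedged sketch of the guest-side
"make these pages shared" decision -- plain_tdx_guest() and
hv_notify_paravisor_visibility() are placeholder names, and the
tdx_enc_status_changed() signature is approximated; only the flow mirrors
what the TDX code and arch/x86/hyperv/ivm.c do:)

static bool set_pages_shared(unsigned long vaddr, int numpages)
{
        if (plain_tdx_guest())                  /* placeholder check */
                /* Plain TDX: TDVMCALL_MAP_GPA straight to the one and
                 * only VMM, TDX_ACCEPT_PAGE on the way back to private. */
                return tdx_enc_status_changed(vaddr, numpages, false);

        /* Partitioned TDX / vTOM SNP: a Hyper-V hypercall that always
         * lands in the L1 paravisor, which owns the private<->shared
         * bookkeeping (cf. hv_mark_gpa_visibility() in ivm.c). */
        return hv_notify_paravisor_visibility(vaddr, numpages, false); /* placeholder */
}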

>> Having a paravisor is required to support a TPM and having TDVMCALLs go to L0 is
>> required to make performance viable for real workloads.
>>
>>>
>>> Until we really see what breaks with this approach, I don’t think it is worth to
>>> take in the complexity to support different L1 hypervisors view on partitioning.
>>>
>>
>> I'm not asking to support different L1 hypervisors view on partitioning, I want to
>> clean up the code (by fixing assumptions that no longer hold) for the model that I'm
>> describing that: the kernel already supports, has an implementation that works and
>> has actual users. This is also a model that Intel intentionally created the TD-partitioning
>> spec to support.
>>
>> So lets work together to make X86_FEATURE_TDX_GUEST match reality.
> 
> I think the right direction is to make TDX architecture good enough
> without that. If we need more hooks in TDX module that give required
> control to L1, let's do that. (I don't see it so far)
> 

I'm not the right person to propose changes to the TDX module, I barely know anything about
TDX. The team that develops the paravisor collaborates with Intel on it and was also consulted
in TD-partitioning design.

I'm also not sure what kind of changes you envision. Everything is supported by the
kernel already and the paravisor ABI is meant to stay vendor independent.

What I'm trying to accomplish is better integration with the non-partitioning side of TDX
so that users don't see "Memory Encryption Features active: AMD SEV" when running on Intel
TDX with a paravisor.
  
Jeremi Piotrowski Dec. 6, 2023, 6:47 p.m. UTC | #24
On 05/12/2023 14:26, Huang, Kai wrote:
>>
>>>>>
>>>>> Hm. Okay.
>>>>>
>>>>> Can we take a step back? What is bigger picture here? What enlightenment
>>>>> do you expect from the guest when everything is in-place?
>>>>>
>>>>
>>>> All the functional enlightenment are already in place in the kernel and
>>>> everything works (correct me if I'm wrong Dexuan/Michael). The enlightenments
>>>> are that TDX VMCALLs are needed for MSR manipulation and vmbus operations,
>>>> encrypted bit needs to be manipulated in the page tables and page
>>>> visibility propagated to VMM.
>>>
>>> Not quite family with hyperv enlightenments, but are these enlightenments TDX
>>> guest specific?  Because if they are not, then they should be able to be
>>> emulated by the normal hyperv, thus the hyperv as L1 (which is TDX guest) can
>>> emulate them w/o letting the L2 know the hypervisor it runs on is actually a TDX
>>> guest.
>>
>> I would say that these hyperv enlightenments are confidential guest specific
>> (TDX/SNP) when running with TD-partitioning/VMPL. In both cases there are TDX/SNP
>> specific ways to exit directly to L0 (when needed) and native privileged instructions
>> trap to the paravisor.
>>
>> L1 is not hyperv and no one wants to emulate the I/O path. The L2 guest knows that
>> it's confidential so that it can explicitly use swiotlb, toggle page visibility
>> and notify the host (L0) on the I/O path without incurring additional emulation
>> overhead.
>>
>>>
>>> Btw, even if there's performance concern here, as you mentioned the TDVMCALL is
>>> actually made to the L0 which means L0 must be aware such VMCALL is from L2 and
>>> needs to be injected to L1 to handle, which IMHO not only complicates the L0 but
>>> also may not have any performance benefits.
>>
>> The TDVMCALLs are related to the I/O path (networking/block io) into the L2 guest, and
>> so they intentionally go straight to L0 and are never injected to L1. L1 is not
>> involved in that path at all.
>>
>> Using something different than TDVMCALLs here would lead to additional traps to L1 and
>> just add latency/complexity.
> 
> Looks by default you assume we should use TDX partitioning as "paravisor L1" +
> "L0 device I/O emulation".
> 

I don't actually want to impose this model on anyone, but this is the one that
could use some refactoring. I intend to rework these patches to not use a single
"td_partitioning_active" for decisions.

> I think we are lacking background of this usage model and how it works.  For
> instance, typically L2 is created by L1, and L1 is responsible for L2's device
> I/O emulation.  I don't quite understand how could L0 emulate L2's device I/O?
> 
> Can you provide more information?

Let's differentiate between fast and slow I/O. The whole point of the paravisor in
L1 is to provide device emulation for slow I/O: TPM, RTC, NVRAM, IO-APIC, serial ports.

But fast I/O is designed to bypass it and go straight to L0. Hyper-V uses paravirtual
vmbus devices for fast I/O (net/block). The vmbus protocol has awareness of page visibility
built-in and uses native (GHCI on TDX, GHCB on SNP) mechanisms for notifications. So once
everything is set up (rings/buffers in swiotlb), the I/O for fast devices does not
involve L1. This is only possible when the VM manages C-bit itself.

I think the same thing could work for virtio if someone would "enlighten" vring
notification calls (instead of I/O or MMIO instructions).

> 
>>
>>>
>>>>
>>>> Whats missing is the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
>>>> and as a result dmesg does not currently display:
>>>> "Memory Encryption Features active: Intel TDX".
>>>>
>>>> That's what I set out to correct.
>>>>
>>>>> So far I see that you try to get kernel think that it runs as TDX guest,
>>>>> but not really. This is not very convincing model.
>>>>>
>>>>
>>>> No that's not accurate at all. The kernel is running as a TDX guest so I
>>>> want the kernel to know that. 
>>>>
>>>
>>> But it isn't.  It runs on a hypervisor which is a TDX guest, but this doesn't
>>> make itself a TDX guest.> 
>>
>> That depends on your definition of "TDX guest". The TDX 1.5 TD partitioning spec
>> talks of TDX-enlightened L1 VMM, (optionally) TDX-enlightened L2 VM and Unmodified
>> Legacy L2 VM. Here we're dealing with a TDX-enlightened L2 VM.
>>
>> If a guest runs inside an Intel TDX protected TD, is aware of memory encryption and
>> issues TDVMCALLs - to me that makes it a TDX guest.
> 
> The thing I don't quite understand is what enlightenment(s) requires L2 to issue
> TDVMCALL and know "encryption bit".
> 
> The reason that I can think of is:
> 
> If device I/O emulation of L2 is done by L0 then I guess it's reasonable to make
> L2 aware of the "encryption bit" because L0 can only write emulated data to
> shared buffer.  The shared buffer must be initially converted by the L2 by using
> MAP_GPA TDVMCALL to L0 (to zap private pages in S-EPT etc), and L2 needs to know
> the "encryption bit" to set up its page table properly.  L1 must be aware of
> such private <-> shared conversion too to setup page table properly so L1 must
> also be notified.

Your description is correct, except that L2 uses a hypercall (hv_mark_gpa_visibility())
to notify L1 and L1 issues the MAP_GPA TDVMCALL to L0.

C-bit awareness is necessary to setup the whole swiotlb pool to be host visible for
DMA.
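(Roughly, a sketch of what that means at boot -- the real conversion is
driven by swiotlb_update_mem_attributes() via set_memory_decrypted();
this just illustrates the idea:)

/* Flip the whole swiotlb bounce pool to host-visible once, so that
 * fast I/O can bounce through it without further page conversions. */
static void make_bounce_pool_shared(void *vaddr, unsigned long bytes)
{
        set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
}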

> 
> The concern I am having is whether there's other usage model(s) that we need to
> consider.  For instance, running both unmodified L2 and enlightened L2.  Or some
> L2 only needs TDVMCALL enlightenment but no "encryption bit".
> 

Presumably unmodified L2 and enlightened L2 are already covered by current code but
require excessive trapping to L1.

I can't see a usecase for TDVMCALLs but no "encryption bit". 

> In other words, that seems pretty much L1 hypervisor/paravisor implementation
> specific.  I am wondering whether we can completely hide the enlightenment(s)
> logic to hypervisor/paravisor specific code but not generically mark L2 as TDX
> guest but still need to disable TDCALL sort of things.

That's how it currently works - all the enlightenments are in hypervisor/paravisor
specific code in arch/x86/hyperv and drivers/hv and the vm is not marked with
X86_FEATURE_TDX_GUEST.

But without X86_FEATURE_TDX_GUEST userspace has no unified way to discover that an
environment is protected by TDX and also the VM gets classified as "AMD SEV" in dmesg.
This is due to CC_ATTR_GUEST_MEM_ENCRYPT being set but X86_FEATURE_TDX_GUEST not.

> 
> Hope we are getting closer to be on the same page.
> 

I feel we are getting there.
  
Kirill A. Shutemov Dec. 6, 2023, 10:54 p.m. UTC | #25
On Wed, Dec 06, 2023 at 06:49:11PM +0100, Jeremi Piotrowski wrote:
> On 05/12/2023 11:54, Kirill A. Shutemov wrote:
> > On Mon, Dec 04, 2023 at 08:07:38PM +0100, Jeremi Piotrowski wrote:
> >> On 04/12/2023 10:17, Reshetova, Elena wrote:
> >>>> Check for additional CPUID bits to identify TDX guests running with Trust
> >>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
> >>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
> >>>>
> >>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
> >>>> visible
> >>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
> >>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
> >>>> aware mechanisms for what's left. So currently such guests do not have
> >>>> X86_FEATURE_TDX_GUEST set.
> >>>
> >>> Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
> >>> the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
> >>> How do you handle this and other CPUID calls call currently in L1? Per spec,
> >>> all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
> >> The disclaimer here is that I don't have access to the paravisor (L1) code. But
> >> to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
> >> module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
> >> the L2 guest in order to discriminate a guest that is solely responsible for every
> >> TDX mechanism (running at L1) from one running at L2 that has to cooperate with L1.
> >> More below.
> >>
> >>>
> >>> Given that you do that simple emulation, you already end up with TDX guest
> >>> code being activated. Next you can check what features you wont be able to
> >>> provide in L1 and create simple emulation calls for the TDG calls that must be
> >>> supported and cannot return error. The biggest TDG call (TDVMCALL) is already
> >>> direct call into L0 VMM, so this part doesn’t require L1 VMM support. 
> >>
> >> I don't see anything in the TD-partitioning spec that gives the TDX guest a way
> >> to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
> >> So in any case this requires an extra cpuid call to establish the environment.
> >> Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.
> >>
> >> I'll give some examples of where the idea of emulating a TDX environment
> >> without attempting L1-L2 cooperation breaks down.
> >>
> >> hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
> >> it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
> >> housekeeping.
> > 
> > Why would L2 issue HLT TDVMCALL? It only happens in response to #VE, but
> > if partitioning enabled #VEs are routed to L1 anyway.
> 
> What about tdx_safe_halt? When X86_FEATURE_TDX_GUEST is defined I see
> "using TDX aware idle routing" in dmesg.

Yeah. I forgot about this one. My bad. :/

I think it makes a case for more fine-grained control on where TDVMCALL
routed: to L1 or to L0. I think TDX module can do that.

BTW, what kind of housekeeping do you do in L1 for HLT case?

> >> map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which is the actual
> >> entity that needs to have a say in performing the conversion. L1 can't act on the request
> >> if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of sync.
> >> The only safe approach is for L2 to use a different mechanism to trap to L1 explicitly.
> > 
> > Hm? L1 is always in loop on share<->private conversion. I don't know why
> > you need MAP_GPA for that.
> > 
> > You can't rely on MAP_GPA anyway. It is optional (unfortunately). Conversion
> > doesn't require MAP_GPA call.
> > 
> 
> I'm sorry, I don't quite follow. I'm reading tdx_enc_status_changed():
> - TDVMCALL_MAP_GPA is issued for all transitions
> - TDX_ACCEPT_PAGE is issued for shared->private transitions

I am talking about TDX architecture. It doesn't require MAP_GPA call.
Just setting shared bit and touching the page will do the conversion.
MAP_GPA is "being nice" on the guest behalf.

Linux do MAP_GPA all the time. Or tries to. I had bug where I converted
page by mistake this way. It was pain to debug.

My point is that if you *must* catch all conversions in L1, MAP_GPA is not
reliable way.

> This doesn't work in partitioning when TDVMCALLs go to L0: TDVMCALL_MAP_GPA bypasses
> L1 and TDX_ACCEPT_PAGE is L1 responsibility.
> 
> If you want to see how this is currently supported take a look at arch/x86/hyperv/ivm.c.
> All memory starts as private and there is a hypercall to notify the paravisor for both
> TDX (when partitioning) and SNP (when VMPL). This guarantees that all page conversions
> go through L1.

But L1 guest control anyway during page conversion and it has to manage
aliases with TDG.MEM.PAGE.ATTR.RD/WR. Why do you need MAP_GPA for that?

> >> Having a paravisor is required to support a TPM and having TDVMCALLs go to L0 is
> >> required to make performance viable for real workloads.
> >>
> >>>
> >>> Until we really see what breaks with this approach, I don’t think it is worth to
> >>> take in the complexity to support different L1 hypervisors view on partitioning.
> >>>
> >>
> >> I'm not asking to support different L1 hypervisors view on partitioning, I want to
> >> clean up the code (by fixing assumptions that no longer hold) for the model that I'm
> >> describing that: the kernel already supports, has an implementation that works and
> >> has actual users. This is also a model that Intel intentionally created the TD-partitioning
> >> spec to support.
> >>
> >> So lets work together to make X86_FEATURE_TDX_GUEST match reality.
> > 
> > I think the right direction is to make TDX architecture good enough
> > without that. If we need more hooks in TDX module that give required
> > control to L1, let's do that. (I don't see it so far)
> > 
> 
> I'm not the right person to propose changes to the TDX module, I barely know anything about
> TDX. The team that develops the paravisor collaborates with Intel on it and was also consulted
> in TD-partitioning design.

One possible change I mentioned above: make TDVMCALL exit to L1 for some
TDVMCALL leafs (or something along the line).

I would like to keep it transparent for enlightened TDX Linux guest. It
should not care if it runs as L1 or as L2 in your environment.

> I'm also not sure what kind of changes you envision. Everything is supported by the
> kernel already and the paravisor ABI is meant to stay vendor independent.
> 
> What I'm trying to accomplish is better integration with the non-partitioning side of TDX
> so that users don't see "Memory Encryption Features active: AMD SEV" when running on Intel
> TDX with a paravisor.

This part is cosmetics and doesn't make much difference.
  
Kai Huang Dec. 7, 2023, 12:58 p.m. UTC | #26
> 
> > I think we are lacking background of this usage model and how it works.  For
> > instance, typically L2 is created by L1, and L1 is responsible for L2's device
> > I/O emulation.  I don't quite understand how could L0 emulate L2's device I/O?
> > 
> > Can you provide more information?
> 
> Let's differentiate between fast and slow I/O. The whole point of the paravisor in
> L1 is to provide device emulation for slow I/O: TPM, RTC, NVRAM, IO-APIC, serial ports.
> 
> But fast I/O is designed to bypass it and go straight to L0. Hyper-V uses paravirtual
> vmbus devices for fast I/O (net/block). The vmbus protocol has awareness of page visibility
> built-in and uses native (GHCI on TDX, GHCB on SNP) mechanisms for notifications. So once
> everything is set up (rings/buffers in swiotlb), the I/O for fast devices does not
> involve L1. This is only possible when the VM manages C-bit itself.

Yeah that makes sense.  Thanks for the info.

> 
> I think the same thing could work for virtio if someone would "enlighten" vring
> notification calls (instead of I/O or MMIO instructions).
> 
> > 
> > > 
> > > > 
> > > > > 
> > > > > Whats missing is the tdx_guest flag is not exposed to userspace in /proc/cpuinfo,
> > > > > and as a result dmesg does not currently display:
> > > > > "Memory Encryption Features active: Intel TDX".
> > > > > 
> > > > > That's what I set out to correct.
> > > > > 
> > > > > > So far I see that you try to get kernel think that it runs as TDX guest,
> > > > > > but not really. This is not very convincing model.
> > > > > > 
> > > > > 
> > > > > No that's not accurate at all. The kernel is running as a TDX guest so I
> > > > > want the kernel to know that. 
> > > > > 
> > > > 
> > > > But it isn't.  It runs on a hypervisor which is a TDX guest, but this doesn't
> > > > make itself a TDX guest.> 
> > > 
> > > That depends on your definition of "TDX guest". The TDX 1.5 TD partitioning spec
> > > talks of TDX-enlightened L1 VMM, (optionally) TDX-enlightened L2 VM and Unmodified
> > > Legacy L2 VM. Here we're dealing with a TDX-enlightened L2 VM.
> > > 
> > > If a guest runs inside an Intel TDX protected TD, is aware of memory encryption and
> > > issues TDVMCALLs - to me that makes it a TDX guest.
> > 
> > The thing I don't quite understand is what enlightenment(s) requires L2 to issue
> > TDVMCALL and know "encryption bit".
> > 
> > The reason that I can think of is:
> > 
> > If device I/O emulation of L2 is done by L0 then I guess it's reasonable to make
> > L2 aware of the "encryption bit" because L0 can only write emulated data to
> > shared buffer.  The shared buffer must be initially converted by the L2 by using
> > MAP_GPA TDVMCALL to L0 (to zap private pages in S-EPT etc), and L2 needs to know
> > the "encryption bit" to set up its page table properly.  L1 must be aware of
> > such private <-> shared conversion too to setup page table properly so L1 must
> > also be notified.
> 
> Your description is correct, except that L2 uses a hypercall (hv_mark_gpa_visibility())
> to notify L1 and L1 issues the MAP_GPA TDVMCALL to L0.

In TDX partitioning, IIUC, L1 and L2 use different secure-EPT page tables when
mapping L1's and L2's GPAs.  Therefore, IIUC, the entries of both secure-EPT tables which
map the "to be converted" page need to be zapped.

I am not entirely sure whether using hv_mark_gpa_visibility() suffices.  If
the MAP_GPA comes from L1, then I am not sure it is easy for L0 to zap the secure-EPT entry
for L2.

But anyway these are details probably we don't need to consider.

> 
> C-bit awareness is necessary to setup the whole swiotlb pool to be host visible for
> DMA.

Agreed.

> 
> > 
> > The concern I am having is whether there's other usage model(s) that we need to
> > consider.  For instance, running both unmodified L2 and enlightened L2.  Or some
> > L2 only needs TDVMCALL enlightenment but no "encryption bit".
> > 
> 
> Presumably unmodified L2 and enlightened L2 are already covered by current code but
> require excessive trapping to L1.
> 
> I can't see a usecase for TDVMCALLs but no "encryption bit". 
> 
> > In other words, that seems pretty much L1 hypervisor/paravisor implementation
> > specific.  I am wondering whether we can completely hide the enlightenment(s)
> > logic to hypervisor/paravisor specific code but not generically mark L2 as TDX
> > guest but still need to disable TDCALL sort of things.
> 
> That's how it currently works - all the enlightenments are in hypervisor/paravisor
> specific code in arch/x86/hyperv and drivers/hv and the vm is not marked with
> X86_FEATURE_TDX_GUEST.

And I believe there's a reason that the VM is not marked as TDX guest.

> 
> But without X86_FEATURE_TDX_GUEST userspace has no unified way to discover that an
> environment is protected by TDX and also the VM gets classified as "AMD SEV" in dmesg.
> This is due to CC_ATTR_GUEST_MEM_ENCRYPT being set but X86_FEATURE_TDX_GUEST not.

Can you provide more information about what does _userspace_ do here?

What's the difference if it sees a TDX guest or a normal non-coco guest in
/proc/cpuinfo?

Looks the whole purpose of this series is to make userspace happy by advertising
TDX guest to /proc/cpuinfo.  But if we do that we will have bad side-effect in
the kernel so that we need to do things in your patch 2/3.

That doesn't seem very convincing.  Is there any other way that userspace can
utilize, e.g., any HV hypervisor/paravisor specific attributes that are exposed
to userspace?
  
Jeremi Piotrowski Dec. 7, 2023, 5:06 p.m. UTC | #27
On 06/12/2023 23:54, Kirill A. Shutemov wrote:
> On Wed, Dec 06, 2023 at 06:49:11PM +0100, Jeremi Piotrowski wrote:
>> On 05/12/2023 11:54, Kirill A. Shutemov wrote:
>>> On Mon, Dec 04, 2023 at 08:07:38PM +0100, Jeremi Piotrowski wrote:
>>>> On 04/12/2023 10:17, Reshetova, Elena wrote:
>>>>>> Check for additional CPUID bits to identify TDX guests running with Trust
>>>>>> Domain (TD) partitioning enabled. TD partitioning is like nested virtualization
>>>>>> inside the Trust Domain so there is a L1 TD VM(M) and there can be L2 TD VM(s).
>>>>>>
>>>>>> In this arrangement we are not guaranteed that the TDX_CPUID_LEAF_ID is
>>>>>> visible
>>>>>> to Linux running as an L2 TD VM. This is because a majority of TDX facilities
>>>>>> are controlled by the L1 VMM and the L2 TDX guest needs to use TD partitioning
>>>>>> aware mechanisms for what's left. So currently such guests do not have
>>>>>> X86_FEATURE_TDX_GUEST set.
>>>>>
>>>>> Back to this concrete patch. Why cannot L1 VMM emulate the correct value of
>>>>> the TDX_CPUID_LEAF_ID to L2 VM? It can do this per TDX partitioning arch.
>>>>> How do you handle this and other CPUID calls call currently in L1? Per spec,
>>>>> all CPUIDs calls from L2 will cause L2 --> L1 exit, so what do you do in L1?
>>>> The disclaimer here is that I don't have access to the paravisor (L1) code. But
>>>> to the best of my knowledge the L1 handles CPUID calls by calling into the TDX
>>>> module, or synthesizing a response itself. TDX_CPUID_LEAF_ID is not provided to
>>>> the L2 guest in order to discriminate a guest that is solely responsible for every
>>>> TDX mechanism (running at L1) from one running at L2 that has to cooperate with L1.
>>>> More below.
>>>>
>>>>>
>>>>> Given that you do that simple emulation, you already end up with TDX guest
>>>>> code being activated. Next you can check what features you wont be able to
>>>>> provide in L1 and create simple emulation calls for the TDG calls that must be
>>>>> supported and cannot return error. The biggest TDG call (TDVMCALL) is already
>>>>> direct call into L0 VMM, so this part doesn’t require L1 VMM support. 
>>>>
>>>> I don't see anything in the TD-partitioning spec that gives the TDX guest a way
>>>> to detect if it's running at L2 or L1, or check whether TDVMCALLs go to L0/L1.
>>>> So in any case this requires an extra cpuid call to establish the environment.
>>>> Given that, exposing TDX_CPUID_LEAF_ID to the guest doesn't help.
>>>>
>>>> I'll give some examples of where the idea of emulating a TDX environment
>>>> without attempting L1-L2 cooperation breaks down.
>>>>
>>>> hlt: if the guest issues a hlt TDVMCALL it goes to L0, but if it issues a classic hlt
>>>> it traps to L1. The hlt should definitely go to L1 so that L1 has a chance to do
>>>> housekeeping.
>>>
>>> Why would L2 issue HLT TDVMCALL? It only happens in response to #VE, but
>>> if partitioning enabled #VEs are routed to L1 anyway.
>>
>> What about tdx_safe_halt? When X86_FEATURE_TDX_GUEST is defined I see
>> "using TDX aware idle routing" in dmesg.
> 
> Yeah. I forgot about this one. My bad. :/
> 
> I think it makes a case for more fine-grained control on where TDVMCALL
> routed: to L1 or to L0. I think TDX module can do that.

can -> could. Elena suggested something similar in another mail. That might
make it possible to standardize future paravisor-like usecases, similar to
[1]. Not sure this alone would be enough.

[1]: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58019.pdf

> 
> BTW, what kind of housekeeping do you do in L1 for HLT case?
> 

L1 is responsible for "virtual trust levels" so it might want to service
software running at other trust levels (other L2s).

>>>> map gpa: say the guest uses MAP_GPA TDVMCALL. This goes to L0, not L1 which is the actual
>>>> entity that needs to have a say in performing the conversion. L1 can't act on the request
>>>> if L0 would forward it because of the CoCo threat model. So L1 and L2 get out of sync.
>>>> The only safe approach is for L2 to use a different mechanism to trap to L1 explicitly.
>>>
>>> Hm? L1 is always in loop on share<->private conversion. I don't know why
>>> you need MAP_GPA for that.
>>>
>>> You can't rely on MAP_GPA anyway. It is optional (unfortunately). Conversion
>>> doesn't require MAP_GPA call.
>>>
>>
>> I'm sorry, I don't quite follow. I'm reading tdx_enc_status_changed():
>> - TDVMCALL_MAP_GPA is issued for all transitions
>> - TDX_ACCEPT_PAGE is issued for shared->private transitions
> 
> I am talking about TDX architecture. It doesn't require MAP_GPA call.
> Just setting shared bit and touching the page will do the conversion.

Not when L1 is involved.

> MAP_GPA is "being nice" on the guest behalf.> 
> Linux do MAP_GPA all the time. Or tries to. I had bug where I converted
> page by mistake this way. It was pain to debug.

Reading TDX host support in KVM/Qemu it seems to rely on the MAP_GPA, and
I believe so does Hyper-V when the guest runs non-partitioned TDX.

So the spec may say one thing, but if all implementations do the MAP_GPA
then it becomes expected behavior. But we're digressing.

> 
> My point is that if you *must* catch all conversions in L1, MAP_GPA is not
> reliable way.

Exactly, you're confirming my point. Which is why L2 issues a hypervisor-specific
hypercall on every page visibility change that will always be handled by L1.

> 
>> This doesn't work in partitioning when TDVMCALLs go to L0: TDVMCALL_MAP_GPA bypasses
>> L1 and TDX_ACCEPT_PAGE is L1 responsibility.
>>
>> If you want to see how this is currently supported take a look at arch/x86/hyperv/ivm.c.
>> All memory starts as private and there is a hypercall to notify the paravisor for both
>> TDX (when partitioning) and SNP (when VMPL). This guarantees that all page conversions
>> go through L1.
> 
> But L1 guest control anyway during page conversion and it has to manage
> aliases with TDG.MEM.PAGE.ATTR.RD/WR. Why do you need MAP_GPA for that?
>

When the L2 wants to perform a page conversion it needs to notify L1 of this so that it
can do its part managing the aliases. Without L1 involvement the conversion doesn't
happen. MAP_GPA is not suitable for this purpose as I've described and you've confirmed
above.
 
>>>> Having a paravisor is required to support a TPM and having TDVMCALLs go to L0 is
>>>> required to make performance viable for real workloads.
>>>>
>>>>>
>>>>> Until we really see what breaks with this approach, I don’t think it is worth to
>>>>> take in the complexity to support different L1 hypervisors view on partitioning.
>>>>>
>>>>
>>>> I'm not asking to support different L1 hypervisors view on partitioning, I want to
>>>> clean up the code (by fixing assumptions that no longer hold) for the model that I'm
>>>> describing that: the kernel already supports, has an implementation that works and
>>>> has actual users. This is also a model that Intel intentionally created the TD-partitioning
>>>> spec to support.
>>>>
>>>> So lets work together to make X86_FEATURE_TDX_GUEST match reality.
>>>
>>> I think the right direction is to make TDX architecture good enough
>>> without that. If we need more hooks in TDX module that give required
>>> control to L1, let's do that. (I don't see it so far)
>>>
>>
>> I'm not the right person to propose changes to the TDX module, I barely know anything about
>> TDX. The team that develops the paravisor collaborates with Intel on it and was also consulted
>> in TD-partitioning design.
> 
> One possible change I mentioned above: make TDVMCALL exit to L1 for some
> TDVMCALL leafs (or something along the line).
> 

You can explore changes to TDVMCALL handling in the TDX module but I don't see any reason
this would be adopted, because a shared hypercall to control page visibility for SNP & TDX is
already part of the Hyper-V ABI and works great for this purpose.

> I would like to keep it transparent for enlightened TDX Linux guest. It
> should not care if it runs as L1 or as L2 in your environment.

I understand that is how you would prefer it but, as we've established in these emails,
that doesn't work when the L1 paravisor provides services to the L2 with an L1 specific
protocol and TDVMCALLs are routed to L0 for performance reasons. It can't be done
transparently with TDX 1.5 calls alone and we already have TDX 1.5 deployed to users with
an upstream kernel.

Can you propose a way forward to enable X86_FEATURE_TDX_GUEST for this usecase? We're
talking about consolidating existing kernel TDX code here.

> 
>> I'm also not sure what kind of changes you envision. Everything is supported by the
>> kernel already and the paravisor ABI is meant to stay vendor independent.
>>
>> What I'm trying to accomplish is better integration with the non-partitioning side of TDX
>> so that users don't see "Memory Encryption Features active: AMD SEV" when running on Intel
>> TDX with a paravisor.
> 
> This part is cosmetics and doesn't make much difference.
> 

Not according to Intel engineers and users I've talked to, who are unhappy with this.
And userspace also lacks discoverability of the tdx_guest flag in /proc/cpuinfo.
  
Jeremi Piotrowski Dec. 7, 2023, 5:21 p.m. UTC | #28
On 07/12/2023 13:58, Huang, Kai wrote:
>>
>> That's how it currently works - all the enlightenments are in hypervisor/paravisor
>> specific code in arch/x86/hyperv and drivers/hv and the vm is not marked with
>> X86_FEATURE_TDX_GUEST.
> 
> And I believe there's a reason that the VM is not marked as TDX guest.
Yes, as Elena said:
"""
OK, so in your case it is a decision of L1 VMM not to set the TDX_CPUID_LEAF_ID
to reflect that it is a tdx guest and it is on purpose because you want to 
drop into a special tdx guest, i.e. partitioned guest. 
"""
TDX does not provide a means to let the partitioned guest know that it needs to
cooperate with the paravisor (e.g. because TDVMCALLs are routed to L0) so this is
exposed in a paravisor specific way (cpuids in patch 1).

> 
>>
>> But without X86_FEATURE_TDX_GUEST userspace has no unified way to discover that an
>> environment is protected by TDX and also the VM gets classified as "AMD SEV" in dmesg.
>> This is due to CC_ATTR_GUEST_MEM_ENCRYPT being set but X86_FEATURE_TDX_GUEST not.
> 
> Can you provide more information about what does _userspace_ do here?

I gave one usecase in a different email. A workload scheduler like Kubernetes might want to
place a workload in a confidential environment, and needs a way to determine that a VM is
TDX protected (or SNP protected) to make that placement decision.

> 
> What's the difference if it sees a TDX guest or a normal non-coco guest in
> /proc/cpuinfo?
> 
> Looks the whole purpose of this series is to make userspace happy by advertising
> TDX guest to /proc/cpuinfo.  But if we do that we will have bad side-effect in
> the kernel so that we need to do things in your patch 2/3.
> 

Yes, exactly. It's unifying the two approaches so that userspace doesn't have to
care.

> That doesn't seem very convincing.

Why not? 
The whole point of the kernel is to provide a unified interface to userspace and
abstract away these small differences. Yes it requires some kernel code to do,
thats not a reason to force every userspace to implement its own logic. This is
what the flags in /proc/cpuinfo are for.
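(For illustration, a minimal userspace sketch of the check this enables --
scan the flags line for "tdx_guest" instead of every consumer poking at
vendor-specific CPUID leaves:)

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/cpuinfo", "r");
        char line[4096];

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "flags", 5) && strstr(line, " tdx_guest")) {
                        puts("TDX guest");
                        fclose(f);
                        return 0;
                }
        }
        fclose(f);
        puts("not a TDX guest");
        return 0;
}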

> Is there any other way that userspace can
> utilize, e.g., any HV hypervisor/paravisor specific attributes that are exposed
> to userspace?
> 

There are no HV hyper-/para-visor attributes exposed to userspace directly, but
userspace can poke at the same cpuid bits as in patch 1 to make this determination.
Not great for confidential computing adoption.
  
Reshetova, Elena Dec. 7, 2023, 5:36 p.m. UTC | #29
> >> The TDVMCALLs are related to the I/O path (networking/block io) into the L2
> guest, and
> >> so they intentionally go straight to L0 and are never injected to L1. L1 is not
> >> involved in that path at all.
> >>
> >> Using something different than TDVMCALLs here would lead to additional
> traps to L1 and
> >> just add latency/complexity.
> >
> > Looks by default you assume we should use TDX partitioning as "paravisor L1" +
> > "L0 device I/O emulation".
> >
> 
> I don't actually want to impose this model on anyone, but this is the one that
> could use some refactoring. I intend to rework these patches to not use a single
> "td_partitioning_active" for decisions.
> 
> > I think we are lacking background of this usage model and how it works.  For
> > instance, typically L2 is created by L1, and L1 is responsible for L2's device
> > I/O emulation.  I don't quite understand how could L0 emulate L2's device I/O?
> >
> > Can you provide more information?
> 
> Let's differentiate between fast and slow I/O. The whole point of the paravisor in
> L1 is to provide device emulation for slow I/O: TPM, RTC, NVRAM, IO-APIC, serial
> ports.

Out of curiosity, and not really related to this discussion, but could you please
elaborate on the RTC part here? Do you actually host secure time in L1 to be provided
to the L2? 

Best Regards,
Elena.
  
Jeremi Piotrowski Dec. 7, 2023, 7:35 p.m. UTC | #30
On 07/12/2023 18:21, Jeremi Piotrowski wrote:
> On 07/12/2023 13:58, Huang, Kai wrote:
>>>
>>> That's how it currently works - all the enlightenments are in hypervisor/paravisor
>>> specific code in arch/x86/hyperv and drivers/hv and the vm is not marked with
>>> X86_FEATURE_TDX_GUEST.
>>
>> And I believe there's a reason that the VM is not marked as TDX guest.
> Yes, as Elena said:
> """
> OK, so in your case it is a decision of L1 VMM not to set the TDX_CPUID_LEAF_ID
> to reflect that it is a tdx guest and it is on purpose because you want to 
> drop into a special tdx guest, i.e. partitioned guest. 
> """
> TDX does not provide a means to let the partitioned guest know that it needs to
> cooperate with the paravisor (e.g. because TDVMCALLs are routed to L0) so this is
> exposed in a paravisor specific way (cpuids in patch 1).
> 
>>
>>>
>>> But without X86_FEATURE_TDX_GUEST userspace has no unified way to discover that an
>>> environment is protected by TDX and also the VM gets classified as "AMD SEV" in dmesg.
>>> This is due to CC_ATTR_GUEST_MEM_ENCRYPT being set but X86_FEATURE_TDX_GUEST not.
>>
>> Can you provide more information about what does _userspace_ do here?
> 
> I gave one usecase in a different email. A workload scheduler like Kubernetes might want to
> place a workload in a confidential environment, and needs a way to determine that a VM is
> TDX protected (or SNP protected) to make that placement decision.
> 
>>
>> What's the difference if it sees a TDX guest or a normal non-coco guest in
>> /proc/cpuinfo?
>>
>> Looks the whole purpose of this series is to make userspace happy by advertising
>> TDX guest to /proc/cpuinfo.  But if we do that we will have bad side-effect in
>> the kernel so that we need to do things in your patch 2/3.
>>
> 
> Yes, exactly. It's unifying the two approaches so that userspace doesn't have to
> care.
> 
>> That doesn't seem very convincing.
> 
> Why not? 
> The whole point of the kernel is to provide a unified interface to userspace and
> abstract away these small differences. Yes it requires some kernel code to do,
> thats not a reason to force every userspace to implement its own logic. This is
> what the flags in /proc/cpuinfo are for.
> 

So I feel like we're finally getting to the gist of the disagreements in this thread.

Here's something I think we should all agree on (both a) and b)). X86_FEATURE_TDX_GUEST:
a) is visible to userspace and not just some kernel-only construct
b) means "this is a guest running in an Intel TDX Trust Domain, and said guest is aware
   of TDX"

a) is obvious but I think it needs restating. b) is what userspace expects, and it excludes
legacy (/unmodified) guests running in a TD. That's a reasonable definition.

For kernel-only checks we can rely on platform-specific CC_ATTRs checked through
intel_cc_platform_has().
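
As a sketch of what such a kernel-only check could look like (CC_ATTR_TDX_MODULE_CALLS
is only a candidate name, discussed further below, and does not exist in
<linux/cc_platform.h> today):

/* Sketch only: a kernel-only check behind cc_platform_has(). The
 * CC_ATTR_TDX_MODULE_CALLS attribute is hypothetical here; it is not an
 * existing enum cc_attr value. */
#include <linux/cc_platform.h>

static bool tdx_can_use_module_calls(void)
{
	/*
	 * X86_FEATURE_TDX_GUEST stays set for both partitioned and
	 * non-partitioned guests; only this vendor-specific attribute
	 * tells the kernel whether TDCALLs may be issued directly.
	 */
	return cc_platform_has(CC_ATTR_TDX_MODULE_CALLS);
}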

@Borislav: does that sound reasonable to you?
@Kai, @Kirill, @Elena: can I get you to agree with this compromise, for userspace's sake?

Jeremi
  
Kirill A. Shutemov Dec. 7, 2023, 8:56 p.m. UTC | #31
On Thu, Dec 07, 2023 at 06:06:38PM +0100, Jeremi Piotrowski wrote:
> > 
> >> This doesn't work in partitioning when TDVMCALLs go to L0: TDVMCALL_MAP_GPA bypasses
> >> L1 and TDX_ACCEPT_PAGE is L1 responsibility.
> >>
> >> If you want to see how this is currently supported take a look at arch/x86/hyperv/ivm.c.
> >> All memory starts as private and there is a hypercall to notify the paravisor for both
> >> TDX (when partitioning) and SNP (when VMPL). This guarantees that all page conversions
> >> go through L1.
> > 
> > But L1 guest control anyway during page conversion and it has to manage
> > aliases with TDG.MEM.PAGE.ATTR.RD/WR. Why do you need MAP_GPA for that?
> >
> 
> When the L2 wants to perform a page conversion it needs to notify L1 of this so that it
> can do its part managing the aliases. Without L1 involvement the conversion doesn't
> happen. MAP_GPA is not suitable for this purpose as I've described and you've confirmed
> above.

Memory conversion causes an exit to L1, as there will be no aliases in L2
otherwise. There's no need to intercept MAP_GPA for that. See section
21.8 of the TD partitioning spec.

>  
> > One possible change I mentioned above: make TDVMCALL exit to L1 for some
> > TDVMCALL leafs (or something along the line).
> > 
> 
> You can explore changes to TDVMCALL handling in the TDX module but I don't see any reason
> this would be adopted, because a shared hypercall to control page visibility for SNP & TDX is
> already part of Hyper-V ABI and works great for this purpose.
> 
> > I would like to keep it transparent for enlightened TDX Linux guest. It
> > should not care if it runs as L1 or as L2 in your environment.
> 
> I understand that is how you would prefer it but, as we've established in these emails,
> that doesn't work when the L1 paravisor provides services to the L2 with an L1 specific
> protocol and TDVMCALLs are routed to L0 for performance reasons. It can't be done
> transparently with TDX 1.5 calls alone and we already have TDX 1.5 deployed to users with
> an upstream kernel.

TDX 1.5 is not set in stone (yet). The spec is still a draft. We can add
capabilities if we make a case for them.

Let's try to shift the discussion to how to make TDX better rather than
adding workarounds to the kernel.
  
Kai Huang Dec. 8, 2023, 10:51 a.m. UTC | #32
On Thu, 2023-12-07 at 20:35 +0100, Jeremi Piotrowski wrote:
> On 07/12/2023 18:21, Jeremi Piotrowski wrote:
> > On 07/12/2023 13:58, Huang, Kai wrote:
> > > > 
> > > > That's how it currently works - all the enlightenments are in hypervisor/paravisor
> > > > specific code in arch/x86/hyperv and drivers/hv and the vm is not marked with
> > > > X86_FEATURE_TDX_GUEST.
> > > 
> > > And I believe there's a reason that the VM is not marked as TDX guest.
> > Yes, as Elena said:
> > """
> > OK, so in your case it is a decision of L1 VMM not to set the TDX_CPUID_LEAF_ID
> > to reflect that it is a tdx guest and it is on purpose because you want to 
> > drop into a special tdx guest, i.e. partitioned guest. 
> > """
> > TDX does not provide a means to let the partitioned guest know that it needs to
> > cooperate with the paravisor (e.g. because TDVMCALLs are routed to L0) so this is
> > exposed in a paravisor specific way (cpuids in patch 1).
> > 
> > > 
> > > > 
> > > > But without X86_FEATURE_TDX_GUEST userspace has no unified way to discover that an
> > > > environment is protected by TDX and also the VM gets classified as "AMD SEV" in dmesg.
> > > > This is due to CC_ATTR_GUEST_MEM_ENCRYPT being set but X86_FEATURE_TDX_GUEST not.
> > > 
> > > Can you provide more information about what does _userspace_ do here?
> > 
> > I gave one usecase in a different email. A workload scheduler like Kubernetes might want to
> > place a workload in a confidential environment, and needs a way to determine that a VM is
> > TDX protected (or SNP protected) to make that placement decision.
> > 
> > > 
> > > What's the difference if it sees a TDX guest or a normal non-coco guest in
> > > /proc/cpuinfo?
> > > 
> > > Looks the whole purpose of this series is to make userspace happy by advertising
> > > TDX guest to /proc/cpuinfo.  But if we do that we will have bad side-effect in
> > > the kernel so that we need to do things in your patch 2/3.
> > > 
> > 
> > Yes, exactly. It's unifying the two approaches so that userspace doesn't have to
> > care.
> > 
> > > That doesn't seem very convincing.
> > 
> > Why not? 
> > The whole point of the kernel is to provide a unified interface to userspace and
> > abstract away these small differences. 
> > 

Agreed, it's better.

> > Yes it requires some kernel code to do,
> > thats not a reason to force every userspace to implement its own logic. This is
> > what the flags in /proc/cpuinfo are for.
> > 

Agreed, /proc/cpuinfo _can_ be one choice, but I am not sure whether it is the
best choice (for your case).

> 
> So I feel like we're finally getting to the gist of the disagreements in this thread.
> 
> Here's something I think we should all agree on (both a) and b)). X86_FEATURE_TDX_GUEST:
> a) is visible to userspace and not just some kernel-only construct
> b) means "this is a guest running in an Intel TDX Trust Domain, and said guest is aware
>    of TDX"
> 
> a) is obvious but I think needs restating. b) is what userspace expects, and excludes legacy
> (/unmodified) guests running in a TD. That's a reasonable definition.

I don't understand b).  Firstly, a guest with X86_FEATURE_TDX_GUEST can just be
a TDX guest.  How can we distinguish a normal TDX guest from an enlightened guest
running under a TDX hypervisor/paravisor (L2)?

From the userspace ABI's point of view, a "TDX guest" flag in /proc/cpuinfo just
means it is a TDX guest, nothing more.

Maybe in your case userspace only cares about L2, but what if in other
cases userspace wants to do different things for the two cases above?

The point is, from the userspace ABI's view, I think each individual feature the
kernel provides should have its own ABI (if userspace needs it).

Your case is more like a normal L2 VM running inside a TDX L1 hypervisor/paravisor
plus some L1 hypervisor/paravisor-specific enlightenments.  Therefore IMHO it's
more reasonable to introduce some L1 hypervisor/paravisor-specific entries as
userspace ABI.

For example, could we have something like the below in /sysfs (again, just an example)?

	# cat /sys/kernel/coco/hyperv/isolation_type
	tdx
	# cat /sys/kernel/coco/hyperv/paravisor
	true

Then your userspace can certainly do the desired things if it observes the above.

I am not familiar with SEV/SEV-ES/SEV-SNP guests, e.g., how they expose flags
in /proc/cpuinfo or whether there are any other /sysfs entries, but the above
should cover SEV* too I guess (e.g., the isolation_type could just return
"sev-snp").

> 
> For kernel only checks we can rely on platform-specific CC_ATTRS checked through
> intel_cc_platform_has.

IMHO CC_ATTR_TDX_MODULE_CALLS actually doesn't make a lot of sense, because a "TDX
module call" is obviously TDX-specific.  It doesn't make a lot of sense to add
vendor-specific CC attributes unless there's no better way.

But this is just my opinion; CC attributes should be something generic if
possible.

> 
> @Borislav: does that sound reasonable to you?
> @Kai, @Kirill, @Elena: can I get you to agree with this compromise, for userspace' sake?
> 

As said above, I totally agree that ideally the kernel should provide some unified
way to allow userspace to query whether it's a confidential VM or not.

Sure, /proc/cpuinfo can certainly be one of the ABIs, but my concern is that you
just want to use it for your convenience at the cost of adding something arguable
to it and, what's worse, then needing to hack the kernel to work around problems
that could be avoided in the first place.
  
Jeremi Piotrowski Dec. 8, 2023, 12:45 p.m. UTC | #33
On 07/12/2023 18:36, Reshetova, Elena wrote:
>>>> The TDVMCALLs are related to the I/O path (networking/block io) into the L2
>> guest, and
>>>> so they intentionally go straight to L0 and are never injected to L1. L1 is not
>>>> involved in that path at all.
>>>>
>>>> Using something different than TDVMCALLs here would lead to additional
>> traps to L1 and
>>>> just add latency/complexity.
>>>
>>> Looks by default you assume we should use TDX partitioning as "paravisor L1" +
>>> "L0 device I/O emulation".
>>>
>>
>> I don't actually want to impose this model on anyone, but this is the one that
>> could use some refactoring. I intend to rework these patches to not use a single
>> "td_partitioning_active" for decisions.
>>
>>> I think we are lacking background of this usage model and how it works.  For
>>> instance, typically L2 is created by L1, and L1 is responsible for L2's device
>>> I/O emulation.  I don't quite understand how could L0 emulate L2's device I/O?
>>>
>>> Can you provide more information?
>>
>> Let's differentiate between fast and slow I/O. The whole point of the paravisor in
>> L1 is to provide device emulation for slow I/O: TPM, RTC, NVRAM, IO-APIC, serial
>> ports.
> 
> Out of my curiosity and not really related to this discussion, but could you please
> elaborate on RTC part here? Do you actually host secure time in L1 to be provided
> to the L2? 
> 
> Best Regards,
> Elena.

Hi Elena,

I think this RTC is more for compatibility and to give the guest a way to initialize
the system clock. It could potentially be a secure time source in the future, but it
isn't right now. This is what the guest sees at the moment (might need some more
enlightenment):

# dmesg | grep -E -i 'clock|rtc|tsc'
[    0.000000] clocksource: hyperv_clocksource_tsc_page: mask: 0xffffffffffffffff max_cycles: 0x24e6a1710, max_idle_ns: 440795202120 ns
[    0.000001] tsc: Marking TSC unstable due to running on Hyper-V
[    0.003621] tsc: Detected 2100.000 MHz processor
[    0.411943] clocksource: refined-jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645519600211568 ns
[    0.887459] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[    0.895842] PM: RTC time: 16:31:58, date: 2023-12-07
[    1.043431] PTP clock support registered
[    1.096563] clocksource: Switched to clocksource hyperv_clocksource_tsc_page
[    1.384341] rtc_cmos 00:02: registered as rtc0
[    1.387469] rtc_cmos 00:02: setting system clock to 2023-12-07T16:31:58 UTC (1701966718)
[    1.392529] rtc_cmos 00:02: alarms up to one day, 114 bytes nvram
[    1.484243] sched_clock: Marking stable (1449873200, 33603000)->(3264982600, -1781506400)

Jeremi
  

Patch

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1d6b863c42b0..c7bbbaaf654d 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -8,6 +8,7 @@ 
 #include <linux/export.h>
 #include <linux/io.h>
 #include <asm/coco.h>
+#include <asm/hyperv-tlfs.h>
 #include <asm/tdx.h>
 #include <asm/vmx.h>
 #include <asm/insn.h>
@@ -37,6 +38,8 @@ 
 
 #define TDREPORT_SUBTYPE_0	0
 
+bool tdx_partitioning_active;
+
 /* Called from __tdx_hypercall() for unrecoverable failure */
 noinstr void __tdx_hypercall_failed(void)
 {
@@ -757,19 +760,38 @@  static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
 	return true;
 }
 
+
+static bool early_is_hv_tdx_partitioning(void)
+{
+	u32 eax, ebx, ecx, edx;
+	cpuid(HYPERV_CPUID_ISOLATION_CONFIG, &eax, &ebx, &ecx, &edx);
+	return eax & HV_PARAVISOR_PRESENT &&
+	       (ebx & HV_ISOLATION_TYPE) == HV_ISOLATION_TYPE_TDX;
+}
+
 void __init tdx_early_init(void)
 {
 	u64 cc_mask;
 	u32 eax, sig[3];
 
 	cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2],  &sig[1]);
-
-	if (memcmp(TDX_IDENT, sig, sizeof(sig)))
-		return;
+	if (memcmp(TDX_IDENT, sig, sizeof(sig))) {
+		tdx_partitioning_active = early_is_hv_tdx_partitioning();
+		if (!tdx_partitioning_active)
+			return;
+	}
 
 	setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
 
 	cc_vendor = CC_VENDOR_INTEL;
+
+	/*
+	 * Need to defer cc_mask and page visibility callback initializations
+	 * to a TD-partitioning aware implementation.
+	 */
+	if (tdx_partitioning_active)
+		goto exit;
+
 	tdx_parse_tdinfo(&cc_mask);
 	cc_set_mask(cc_mask);
 
@@ -820,5 +842,6 @@  void __init tdx_early_init(void)
 	 */
 	x86_cpuinit.parallel_bringup = false;
 
+exit:
 	pr_info("Guest detected\n");
 }
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 603e6d1e9d4a..fe22f8675859 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -52,6 +52,7 @@  bool tdx_early_handle_ve(struct pt_regs *regs);
 
 int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
 
+extern bool tdx_partitioning_active;
 #else
 
 static inline void tdx_early_init(void) { };
@@ -71,6 +72,8 @@  static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
 {
 	return -ENODEV;
 }
+
+#define tdx_partitioning_active false
 #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_TDX_H */