[v10,021/108] KVM: TDX: initialize VM with TDX specific parameters

Message ID ebb4beadff8e117d0fb643af4ee310b6608679bf.1667110240.git.isaku.yamahata@intel.com
State New
Headers
Series KVM TDX basic feature support |

Commit Message

Isaku Yamahata Oct. 30, 2022, 6:22 a.m. UTC
  From: Xiaoyao Li <xiaoyao.li@intel.com>

TDX requires additional parameters for TDX VM for confidential execution to
protect its confidentiality of its memory contents and its CPU state from
any other software, including VMM. When creating guest TD VM before
creating vcpu, the number of vcpu, TSC frequency (that is same among
vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
module. It means guest can trust those CPUIDs. and sha384 values for
measurement.

Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
assigns encryption key to the TDX guest for memory encryption.  TDX
encrypts memory per-guest bases.  It assigns device model passes per-VM
parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
(TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
can not change it.), attributes (production or debug), available extended
features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
measurements, and etc.

This subcommand is called before creating vcpu and KVM_SET_CPUID2, i.e.
cpuids configurations aren't available yet.  So CPUIDs configuration values
needs to be passed in struct kvm_init_vm.  It's device model responsibility
to make this cpuid config for KVM_TDX_INIT_VM and KVM_SET_CPUID2.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/tdx.h            |   3 +
 arch/x86/include/uapi/asm/kvm.h       |  31 +++
 arch/x86/kvm/vmx/tdx.c                | 296 ++++++++++++++++++++++----
 arch/x86/kvm/vmx/tdx.h                |  22 ++
 tools/arch/x86/include/uapi/asm/kvm.h |  33 +++
 5 files changed, 347 insertions(+), 38 deletions(-)
  

Comments

Wang, Lei Nov. 16, 2022, 5:34 a.m. UTC | #1
On 10/30/2022 2:22 PM, isaku.yamahata@intel.com wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
> 
> TDX requires additional parameters for TDX VM for confidential execution to
> protect its confidentiality of its memory contents and its CPU state from
> any other software, including VMM. When creating guest TD VM before
> creating vcpu, the number of vcpu, TSC frequency (that is same among
> vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
> module. It means guest can trust those CPUIDs. and sha384 values for
> measurement.
> 
> Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
> assigns encryption key to the TDX guest for memory encryption.  TDX
> encrypts memory per-guest bases.  It assigns device model passes per-VM
> parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
> (TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
> can not change it.), attributes (production or debug), available extended
> features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
> measurements, and etc.
> 
> This subcommand is called before creating vcpu and KVM_SET_CPUID2, i.e.
> cpuids configurations aren't available yet.  So CPUIDs configuration values
> needs to be passed in struct kvm_init_vm.  It's device model responsibility
> to make this cpuid config for KVM_TDX_INIT_VM and KVM_SET_CPUID2.
> 
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/tdx.h            |   3 +
>  arch/x86/include/uapi/asm/kvm.h       |  31 +++
>  arch/x86/kvm/vmx/tdx.c                | 296 ++++++++++++++++++++++----
>  arch/x86/kvm/vmx/tdx.h                |  22 ++
>  tools/arch/x86/include/uapi/asm/kvm.h |  33 +++
>  5 files changed, 347 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index cd304d323d33..05ac4bfc8f8a 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -131,6 +131,9 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
>  #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
>  
>  #ifdef CONFIG_INTEL_TDX_HOST
> +
> +/* -1 indicates CPUID leaf with no sub-leaves. */
> +#define TDX_CPUID_NO_SUBLEAF	((u32)-1)
>  struct tdx_cpuid_config {
>  	u32	leaf;
>  	u32	sub_leaf;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..26661879c031 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
>  /* Trust Domain eXtension sub-ioctl() commands. */
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>  
>  	KVM_TDX_CMD_NR_MAX,
>  };
> @@ -583,4 +584,34 @@ struct kvm_tdx_capabilities {
>  	struct kvm_tdx_cpuid_config cpuid_configs[0];
>  };
>  
> +struct kvm_tdx_init_vm {
> +	__u64 attributes;
> +	__u64 mrconfigid[6];	/* sha384 digest */
> +	__u64 mrowner[6];	/* sha384 digest */
> +	__u64 mrownerconfig[6];	/* sha348 digest */
> +	union {
> +		/*
> +		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> +		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
> +		 *
> +		 * This configuration supersedes KVM_SET_CPUID{,2}.
> +		 * The user space VMM, e.g. qemu, should make them consistent
> +		 * with this values.
> +		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> +		 * = 8KB.
> +		 */
> +		struct {
> +			struct kvm_cpuid2 cpuid;
> +			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
> +			struct kvm_cpuid_entry2 entries[];
> +		};
> +		/*
> +		 * For future extensibility.
> +		 * The size(struct kvm_tdx_init_vm) = 16KB.
> +		 * This should be enough given sizeof(TD_PARAMS) = 1024
> +		 */
> +		__u64 reserved[2029];
> +	};
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index d77709a6da51..54045e0576e7 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -284,6 +284,205 @@ static int tdx_do_tdh_mng_key_config(void *param)
>  int tdx_vm_init(struct kvm *kvm)
>  {
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +
> +	kvm_tdx->hkid = -1;
> +
> +	/*
> +	 * This function initializes only KVM software construct.  It doesn't
> +	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> +	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> +	 */
> +
> +	return 0;
> +}
> +
> +int tdx_dev_ioctl(void __user *argp)
> +{
> +	struct kvm_tdx_capabilities __user *user_caps;
> +	struct kvm_tdx_capabilities caps;
> +	struct kvm_tdx_cmd cmd;
> +
> +	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> +		     sizeof(struct tdx_cpuid_config));
> +
> +	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> +		return -EFAULT;
> +	if (cmd.flags || cmd.error || cmd.unused)
> +		return -EINVAL;
> +	/*
> +	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> +	 * mem_enc_ioctl().
> +	 */
> +	if (cmd.id != KVM_TDX_CAPABILITIES)
> +		return -EINVAL;
> +
> +	user_caps = (void __user *)cmd.data;
> +	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> +		return -EFAULT;
> +
> +	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> +		return -E2BIG;
> +
> +	caps = (struct kvm_tdx_capabilities) {
> +		.attrs_fixed0 = tdx_caps.attrs_fixed0,
> +		.attrs_fixed1 = tdx_caps.attrs_fixed1,
> +		.xfam_fixed0 = tdx_caps.xfam_fixed0,
> +		.xfam_fixed1 = tdx_caps.xfam_fixed1,
> +		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> +		.padding = 0,
> +	};
> +
> +	if (copy_to_user(user_caps, &caps, sizeof(caps)))
> +		return -EFAULT;
> +	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> +			 tdx_caps.nr_cpuid_configs *
> +			 sizeof(struct tdx_cpuid_config)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +/*
> + * cpuid entry lookup in TDX cpuid config way.
> + * The difference is how to specify index(subleaves).
> + * Specify index to TDX_CPUID_NO_SUBLEAF for CPUID leaf with no-subleaves.
> + */
> +static const struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(const struct kvm_cpuid2 *cpuid,
> +							   u32 function, u32 index)
> +{
> +	int i;
> +
> +	/* In TDX CPU CONFIG, TDX_CPUID_NO_SUBLEAF means index = 0. */
> +	if (index == TDX_CPUID_NO_SUBLEAF)
> +		index = 0;
> +
> +	for (i = 0; i < cpuid->nent; i++) {
> +		const struct kvm_cpuid_entry2 *e = &cpuid->entries[i];
> +
> +		if (e->function == function &&
> +		    (e->index == index ||
> +		     !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
> +			return e;
> +	}
> +	return NULL;
> +}
> +
> +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> +			struct kvm_tdx_init_vm *init_vm)
> +{
> +	const struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
> +	const struct kvm_cpuid_entry2 *entry;
> +	u64 guest_supported_xcr0;
> +	u64 guest_supported_xss;
> +	int max_pa;
> +	int i;
> +
> +	if (kvm->created_vcpus)
> +		return -EBUSY;
> +	td_params->max_vcpus = kvm->max_vcpus;
> +	td_params->attributes = init_vm->attributes;
> +	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> +		/*
> +		 * TODO: save/restore PMU related registers around TDENTER.
> +		 * Once it's done, remove this guard.
> +		 */
> +		pr_warn("TD doesn't support perfmon yet. KVM needs to save/restore "
> +			"host perf registers properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
> +		const struct tdx_cpuid_config *config = &tdx_caps.cpuid_configs[i];
> +		const struct kvm_cpuid_entry2 *entry =
> +			tdx_find_cpuid_entry(cpuid, config->leaf, config->sub_leaf);
> +		struct tdx_cpuid_value *value = &td_params->cpuid_values[i];
> +
> +		if (!entry)
> +			continue;
> +
> +		value->eax = entry->eax & config->eax;
> +		value->ebx = entry->ebx & config->ebx;
> +		value->ecx = entry->ecx & config->ecx;
> +		value->edx = entry->edx & config->edx;
> +	}
> +
> +	max_pa = 36;
> +	entry = tdx_find_cpuid_entry(cpuid, 0x80000008, 0);
> +	if (entry)
> +		max_pa = entry->eax & 0xff;
> +
> +	td_params->eptp_controls = VMX_EPTP_MT_WB;
> +	/*
> +	 * No CPU supports 4-level && max_pa > 48.
> +	 * "5-level paging and 5-level EPT" section 4.1 4-level EPT
> +	 * "4-level EPT is limited to translating 48-bit guest-physical
> +	 *  addresses."
> +	 * cpu_has_vmx_ept_5levels() check is just in case.
> +	 */
> +	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_5;
> +		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
> +	} else {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_4;
> +	}
> +
> +	/* Setup td_params.xfam */
> +	entry = tdx_find_cpuid_entry(cpuid, 0xd, 0);
> +	if (entry)
> +		guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32));
> +	else
> +		guest_supported_xcr0 = 0;
> +	guest_supported_xcr0 &= kvm_caps.supported_xcr0;
> +
> +	entry = tdx_find_cpuid_entry(cpuid, 0xd, 1);
> +	if (entry)
> +		guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32));
> +	else
> +		guest_supported_xss = 0;
> +	/* PT can be exposed to TD guest regardless of KVM's XSS support */
> +	guest_supported_xss &= (kvm_caps.supported_xss | XFEATURE_MASK_PT);
> +
> +	td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
> +	if (td_params->xfam & XFEATURE_MASK_LBR) {
> +		/*
> +		 * TODO: once KVM supports LBR(save/restore LBR related
> +		 * registers around TDENTER), remove this guard.
> +		 */
> +		pr_warn("TD doesn't support LBR yet. KVM needs to save/restore "
> +			"IA32_LBR_DEPTH properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (td_params->xfam & XFEATURE_MASK_XTILE) {
> +		/*
> +		 * TODO: once KVM supports AMX(save/restore AMX related
> +		 * registers around TDENTER), remove this guard.
> +		 */
> +		pr_warn("TD doesn't support AMX yet. KVM needs to save/restore "
> +			"IA32_XFD, IA32_XFD_ERR properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	td_params->tsc_frequency =
> +		TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
> +
> +#define MEMCPY_SAME_SIZE(dst, src)				\
> +	do {							\
> +		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
> +		memcpy((dst), (src), sizeof(dst));		\
> +	} while (0)
> +
> +	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
> +	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
> +	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
> +
> +	return 0;
> +}
> +
> +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct tdx_module_output out;
>  	cpumask_var_t packages;
>  	int ret, i;
>  	u64 err;
> @@ -390,10 +589,13 @@ int tdx_vm_init(struct kvm *kvm)
>  		tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
>  	}
>  
> -	/*
> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> -	 * ioctl() to define the configure CPUID values for the TD.
> -	 */
> +	err = tdh_mng_init(kvm_tdx->tdr.pa, __pa(td_params), &out);
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_INIT, err, &out);
> +		ret = -EIO;
> +		goto teardown;
> +	}
> +
>  	return 0;
>  
>  	/*
> @@ -431,50 +633,65 @@ int tdx_vm_init(struct kvm *kvm)
>  	return ret;
>  }
>  
> -int tdx_dev_ioctl(void __user *argp)
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  {
> -	struct kvm_tdx_capabilities __user *user_caps;
> -	struct kvm_tdx_capabilities caps;
> -	struct kvm_tdx_cmd cmd;
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_vm *init_vm = NULL;
> +	struct td_params *td_params = NULL;
> +	void *entries_end;
> +	int ret;
>  
> -	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> -		     sizeof(struct tdx_cpuid_config));
> +	BUILD_BUG_ON(sizeof(*init_vm) != 16 * 1024);
> +	BUILD_BUG_ON((sizeof(*init_vm) - offsetof(typeof(*init_vm), entries)) /
> +		     sizeof(init_vm->entries[0]) < KVM_MAX_CPUID_ENTRIES);
> +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
>  
> -	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> -		return -EFAULT;
> -	if (cmd.flags || cmd.error || cmd.unused)
> +	if (is_td_initialized(kvm))
>  		return -EINVAL;
> -	/*
> -	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> -	 * mem_enc_ioctl().
> -	 */
> -	if (cmd.id != KVM_TDX_CAPABILITIES)
> +
> +	if (cmd->flags)
>  		return -EINVAL;
>  
> -	user_caps = (void __user *)cmd.data;
> -	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> -		return -EFAULT;
> +	init_vm = kzalloc(sizeof(*init_vm), GFP_KERNEL);
> +	if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) {

Pointer 'init_vm' returned from call to function 'kzalloc' may be NULL and will
be dereferenced at this line.

Maybe a NULL-check here is needed?

> +		ret = -EFAULT;
> +		goto out;
> +	}
>  
> -	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> -		return -E2BIG;
> +	ret = -EINVAL;
> +	if (init_vm->cpuid.padding)
> +		goto out;
> +	/* init_vm->entries shouldn't overrun. */
> +	entries_end = init_vm->entries + init_vm->cpuid.nent;
> +	if (entries_end > (void *)(init_vm + 1))
> +		goto out;
> +	/* Unused part must be zero. */
> +	if (memchr_inv(entries_end, 0, (void *)(init_vm + 1) - entries_end))
> +		goto out;
>  
> -	caps = (struct kvm_tdx_capabilities) {
> -		.attrs_fixed0 = tdx_caps.attrs_fixed0,
> -		.attrs_fixed1 = tdx_caps.attrs_fixed1,
> -		.xfam_fixed0 = tdx_caps.xfam_fixed0,
> -		.xfam_fixed1 = tdx_caps.xfam_fixed1,
> -		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> -		.padding = 0,
> -	};
> +	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> +	if (!td_params) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
>  
> -	if (copy_to_user(user_caps, &caps, sizeof(caps)))
> -		return -EFAULT;
> -	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> -			 tdx_caps.nr_cpuid_configs *
> -			 sizeof(struct tdx_cpuid_config)))
> -		return -EFAULT;
> +	ret = setup_tdparams(kvm, td_params, init_vm);
> +	if (ret)
> +		goto out;
>  
> -	return 0;
> +	ret = __tdx_td_init(kvm, td_params);
> +	if (ret)
> +		goto out;
> +
> +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> +	kvm_tdx->attributes = td_params->attributes;
> +	kvm_tdx->xfam = td_params->xfam;
> +
> +out:
> +	/* kfree() accepts NULL. */
> +	kfree(init_vm);
> +	kfree(td_params);
> +	return ret;
>  }
>  
>  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> @@ -490,6 +707,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	mutex_lock(&kvm->lock);
>  
>  	switch (tdx_cmd.id) {
> +	case KVM_TDX_INIT_VM:
> +		r = tdx_td_init(kvm, &tdx_cmd);
> +		break;
>  	default:
>  		r = -EINVAL;
>  		goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 938314635b47..ff0ea9cad347 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -18,7 +18,11 @@ struct kvm_tdx {
>  	struct tdx_td_page tdr;
>  	struct tdx_td_page *tdcs;
>  
> +	u64 attributes;
> +	u64 xfam;
>  	int hkid;
> +
> +	u64 tsc_offset;
>  };
>  
>  struct vcpu_tdx {
> @@ -48,6 +52,11 @@ static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_tdx, vcpu);
>  }
>  
> +static inline bool is_td_initialized(struct kvm *kvm)
> +{
> +	return to_kvm_tdx(kvm)->hkid > 0;
> +}
> +
>  static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
>  {
>  #define VMCS_ENC_ACCESS_TYPE_MASK	0x1UL
> @@ -148,6 +157,19 @@ TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
>  TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
>  
> +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> +{
> +	struct tdx_module_output out;
> +	u64 err;
> +
> +	err = tdh_mng_rd(kvm_tdx->tdr.pa, TDCS_EXEC(field), &out);
> +	if (unlikely(err)) {
> +		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
> +		return 0;
> +	}
> +	return out.r8;
> +}
> +
>  #else
>  struct kvm_tdx {
>  	struct kvm kvm;
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..531a0033e530 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
>  /* Trust Domain eXtension sub-ioctl() commands. */
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>  
>  	KVM_TDX_CMD_NR_MAX,
>  };
> @@ -583,4 +584,36 @@ struct kvm_tdx_capabilities {
>  	struct kvm_tdx_cpuid_config cpuid_configs[0];
>  };
>  
> +struct kvm_tdx_init_vm {
> +	__u64 attributes;
> +	__u32 max_vcpus;
> +	__u32 padding;
> +	__u64 mrconfigid[6];    /* sha384 digest */
> +	__u64 mrowner[6];       /* sha384 digest */
> +	__u64 mrownerconfig[6]; /* sha348 digest */
> +	union {
> +		/*
> +		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> +		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
> +		 *
> +		 * This configuration supersedes KVM_SET_CPUID{,2}.
> +		 * The user space VMM, e.g. qemu, should make them consistent
> +		 * with this values.
> +		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> +		 * = 8KB.
> +		 */
> +		struct {
> +			struct kvm_cpuid2 cpuid;
> +			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
> +			struct kvm_cpuid_entry2 entries[];
> +		};
> +		/*
> +		 * For future extensibility.
> +		 * The size(struct kvm_tdx_init_vm) = 16KB.
> +		 * This should be enough given sizeof(TD_PARAMS) = 1024
> +		 */
> +		__u64 reserved[2028];
> +	};
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
  
Isaku Yamahata Nov. 17, 2022, 5:51 p.m. UTC | #2
On Wed, Nov 16, 2022 at 01:34:46PM +0800,
"Wang, Lei" <lei4.wang@intel.com> wrote:

> > @@ -431,50 +633,65 @@ int tdx_vm_init(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > -int tdx_dev_ioctl(void __user *argp)
> > +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> >  {
> > -	struct kvm_tdx_capabilities __user *user_caps;
> > -	struct kvm_tdx_capabilities caps;
> > -	struct kvm_tdx_cmd cmd;
> > +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > +	struct kvm_tdx_init_vm *init_vm = NULL;
> > +	struct td_params *td_params = NULL;
> > +	void *entries_end;
> > +	int ret;
> >  
> > -	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> > -		     sizeof(struct tdx_cpuid_config));
> > +	BUILD_BUG_ON(sizeof(*init_vm) != 16 * 1024);
> > +	BUILD_BUG_ON((sizeof(*init_vm) - offsetof(typeof(*init_vm), entries)) /
> > +		     sizeof(init_vm->entries[0]) < KVM_MAX_CPUID_ENTRIES);
> > +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
> >  
> > -	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> > -		return -EFAULT;
> > -	if (cmd.flags || cmd.error || cmd.unused)
> > +	if (is_td_initialized(kvm))
> >  		return -EINVAL;
> > -	/*
> > -	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> > -	 * mem_enc_ioctl().
> > -	 */
> > -	if (cmd.id != KVM_TDX_CAPABILITIES)
> > +
> > +	if (cmd->flags)
> >  		return -EINVAL;
> >  
> > -	user_caps = (void __user *)cmd.data;
> > -	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> > -		return -EFAULT;
> > +	init_vm = kzalloc(sizeof(*init_vm), GFP_KERNEL);
> > +	if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) {
> 
> Pointer 'init_vm' returned from call to function 'kzalloc' may be NULL and will
> be dereferenced at this line.
> 
> Maybe a NULL-check here is needed?

Yes, thank you for catching it.

Thanks,
  
Wang, Lei Jan. 4, 2023, 7:59 a.m. UTC | #3
On 10/30/2022 2:22 PM, isaku.yamahata@intel.com wrote:
> From: Xiaoyao Li <xiaoyao.li@intel.com>
> 
> TDX requires additional parameters for TDX VM for confidential execution to
> protect its confidentiality of its memory contents and its CPU state from
> any other software, including VMM. When creating guest TD VM before
> creating vcpu, the number of vcpu, TSC frequency (that is same among
> vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
> module. It means guest can trust those CPUIDs. and sha384 values for
> measurement.
> 
> Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
> assigns encryption key to the TDX guest for memory encryption.  TDX
> encrypts memory per-guest bases.  It assigns device model passes per-VM
> parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
> (TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
> can not change it.), attributes (production or debug), available extended
> features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
> measurements, and etc.
> 
> This subcommand is called before creating vcpu and KVM_SET_CPUID2, i.e.
> cpuids configurations aren't available yet.  So CPUIDs configuration values
> needs to be passed in struct kvm_init_vm.  It's device model responsibility

I suppose this should be kvm_tdx_init_vm.

> to make this cpuid config for KVM_TDX_INIT_VM and KVM_SET_CPUID2.
> 
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/tdx.h            |   3 +
>  arch/x86/include/uapi/asm/kvm.h       |  31 +++
>  arch/x86/kvm/vmx/tdx.c                | 296 ++++++++++++++++++++++----
>  arch/x86/kvm/vmx/tdx.h                |  22 ++
>  tools/arch/x86/include/uapi/asm/kvm.h |  33 +++
>  5 files changed, 347 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index cd304d323d33..05ac4bfc8f8a 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -131,6 +131,9 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
>  #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
>  
>  #ifdef CONFIG_INTEL_TDX_HOST
> +
> +/* -1 indicates CPUID leaf with no sub-leaves. */
> +#define TDX_CPUID_NO_SUBLEAF	((u32)-1)
>  struct tdx_cpuid_config {
>  	u32	leaf;
>  	u32	sub_leaf;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..26661879c031 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
>  /* Trust Domain eXtension sub-ioctl() commands. */
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>  
>  	KVM_TDX_CMD_NR_MAX,
>  };
> @@ -583,4 +584,34 @@ struct kvm_tdx_capabilities {
>  	struct kvm_tdx_cpuid_config cpuid_configs[0];
>  };
>  
> +struct kvm_tdx_init_vm {
> +	__u64 attributes;
> +	__u64 mrconfigid[6];	/* sha384 digest */
> +	__u64 mrowner[6];	/* sha384 digest */
> +	__u64 mrownerconfig[6];	/* sha348 digest */
> +	union {
> +		/*
> +		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> +		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
> +		 *
> +		 * This configuration supersedes KVM_SET_CPUID{,2}.
> +		 * The user space VMM, e.g. qemu, should make them consistent
> +		 * with this values.
> +		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> +		 * = 8KB.
> +		 */
> +		struct {
> +			struct kvm_cpuid2 cpuid;
> +			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
> +			struct kvm_cpuid_entry2 entries[];
> +		};
> +		/*
> +		 * For future extensibility.
> +		 * The size(struct kvm_tdx_init_vm) = 16KB.
> +		 * This should be enough given sizeof(TD_PARAMS) = 1024
> +		 */
> +		__u64 reserved[2029];
> +	};
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index d77709a6da51..54045e0576e7 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -284,6 +284,205 @@ static int tdx_do_tdh_mng_key_config(void *param)
>  int tdx_vm_init(struct kvm *kvm)
>  {
>  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +
> +	kvm_tdx->hkid = -1;
> +
> +	/*
> +	 * This function initializes only KVM software construct.  It doesn't
> +	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> +	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> +	 */
> +
> +	return 0;
> +}
> +
> +int tdx_dev_ioctl(void __user *argp)
> +{
> +	struct kvm_tdx_capabilities __user *user_caps;
> +	struct kvm_tdx_capabilities caps;
> +	struct kvm_tdx_cmd cmd;
> +
> +	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> +		     sizeof(struct tdx_cpuid_config));
> +
> +	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> +		return -EFAULT;
> +	if (cmd.flags || cmd.error || cmd.unused)
> +		return -EINVAL;
> +	/*
> +	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> +	 * mem_enc_ioctl().
> +	 */
> +	if (cmd.id != KVM_TDX_CAPABILITIES)
> +		return -EINVAL;
> +
> +	user_caps = (void __user *)cmd.data;
> +	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> +		return -EFAULT;
> +
> +	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> +		return -E2BIG;
> +
> +	caps = (struct kvm_tdx_capabilities) {
> +		.attrs_fixed0 = tdx_caps.attrs_fixed0,
> +		.attrs_fixed1 = tdx_caps.attrs_fixed1,
> +		.xfam_fixed0 = tdx_caps.xfam_fixed0,
> +		.xfam_fixed1 = tdx_caps.xfam_fixed1,
> +		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> +		.padding = 0,
> +	};
> +
> +	if (copy_to_user(user_caps, &caps, sizeof(caps)))
> +		return -EFAULT;
> +	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> +			 tdx_caps.nr_cpuid_configs *
> +			 sizeof(struct tdx_cpuid_config)))
> +		return -EFAULT;
> +
> +	return 0;
> +}

tdx_dev_ioctl() is introduced in previous patch with the same code added here,
which means this is just a place change and it will confuse reviewers. Is it
neccesary to do so?

> +
> +/*
> + * cpuid entry lookup in TDX cpuid config way.
> + * The difference is how to specify index(subleaves).
> + * Specify index to TDX_CPUID_NO_SUBLEAF for CPUID leaf with no-subleaves.
> + */
> +static const struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(const struct kvm_cpuid2 *cpuid,
> +							   u32 function, u32 index)
> +{
> +	int i;
> +
> +	/* In TDX CPU CONFIG, TDX_CPUID_NO_SUBLEAF means index = 0. */
> +	if (index == TDX_CPUID_NO_SUBLEAF)
> +		index = 0;
> +
> +	for (i = 0; i < cpuid->nent; i++) {
> +		const struct kvm_cpuid_entry2 *e = &cpuid->entries[i];
> +
> +		if (e->function == function &&
> +		    (e->index == index ||
> +		     !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
> +			return e;
> +	}
> +	return NULL;
> +}
> +
> +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> +			struct kvm_tdx_init_vm *init_vm)

Indentation is broken here, please fix it.

> +{
> +	const struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
> +	const struct kvm_cpuid_entry2 *entry;
> +	u64 guest_supported_xcr0;
> +	u64 guest_supported_xss;
> +	int max_pa;
> +	int i;
> +
> +	if (kvm->created_vcpus)
> +		return -EBUSY;
> +	td_params->max_vcpus = kvm->max_vcpus;
> +	td_params->attributes = init_vm->attributes;
> +	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> +		/*
> +		 * TODO: save/restore PMU related registers around TDENTER.
> +		 * Once it's done, remove this guard.
> +		 */
> +		pr_warn("TD doesn't support perfmon yet. KVM needs to save/restore "
> +			"host perf registers properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
> +		const struct tdx_cpuid_config *config = &tdx_caps.cpuid_configs[i];
> +		const struct kvm_cpuid_entry2 *entry =
> +			tdx_find_cpuid_entry(cpuid, config->leaf, config->sub_leaf);
> +		struct tdx_cpuid_value *value = &td_params->cpuid_values[i];
> +
> +		if (!entry)
> +			continue;
> +
> +		value->eax = entry->eax & config->eax;
> +		value->ebx = entry->ebx & config->ebx;
> +		value->ecx = entry->ecx & config->ecx;
> +		value->edx = entry->edx & config->edx;
> +	}
> +
> +	max_pa = 36;
> +	entry = tdx_find_cpuid_entry(cpuid, 0x80000008, 0);
> +	if (entry)
> +		max_pa = entry->eax & 0xff;
> +
> +	td_params->eptp_controls = VMX_EPTP_MT_WB;
> +	/*
> +	 * No CPU supports 4-level && max_pa > 48.
> +	 * "5-level paging and 5-level EPT" section 4.1 4-level EPT
> +	 * "4-level EPT is limited to translating 48-bit guest-physical
> +	 *  addresses."
> +	 * cpu_has_vmx_ept_5levels() check is just in case.
> +	 */
> +	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_5;
> +		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
> +	} else {
> +		td_params->eptp_controls |= VMX_EPTP_PWL_4;
> +	}
> +
> +	/* Setup td_params.xfam */
> +	entry = tdx_find_cpuid_entry(cpuid, 0xd, 0);
> +	if (entry)
> +		guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32));
> +	else
> +		guest_supported_xcr0 = 0;
> +	guest_supported_xcr0 &= kvm_caps.supported_xcr0;
> +
> +	entry = tdx_find_cpuid_entry(cpuid, 0xd, 1);
> +	if (entry)
> +		guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32));
> +	else
> +		guest_supported_xss = 0;
> +	/* PT can be exposed to TD guest regardless of KVM's XSS support */
> +	guest_supported_xss &= (kvm_caps.supported_xss | XFEATURE_MASK_PT);
> +
> +	td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
> +	if (td_params->xfam & XFEATURE_MASK_LBR) {
> +		/*
> +		 * TODO: once KVM supports LBR(save/restore LBR related
> +		 * registers around TDENTER), remove this guard.
> +		 */
> +		pr_warn("TD doesn't support LBR yet. KVM needs to save/restore "
> +			"IA32_LBR_DEPTH properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (td_params->xfam & XFEATURE_MASK_XTILE) {
> +		/*
> +		 * TODO: once KVM supports AMX(save/restore AMX related
> +		 * registers around TDENTER), remove this guard.
> +		 */
> +		pr_warn("TD doesn't support AMX yet. KVM needs to save/restore "
> +			"IA32_XFD, IA32_XFD_ERR properly.\n");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	td_params->tsc_frequency =
> +		TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
> +
> +#define MEMCPY_SAME_SIZE(dst, src)				\
> +	do {							\
> +		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
> +		memcpy((dst), (src), sizeof(dst));		\
> +	} while (0)
> +
> +	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
> +	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
> +	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
> +
> +	return 0;
> +}
> +
> +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params)
> +{
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct tdx_module_output out;
>  	cpumask_var_t packages;
>  	int ret, i;
>  	u64 err;
> @@ -390,10 +589,13 @@ int tdx_vm_init(struct kvm *kvm)
>  		tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
>  	}
>  
> -	/*
> -	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
> -	 * ioctl() to define the configure CPUID values for the TD.
> -	 */
> +	err = tdh_mng_init(kvm_tdx->tdr.pa, __pa(td_params), &out);
> +	if (WARN_ON_ONCE(err)) {
> +		pr_tdx_error(TDH_MNG_INIT, err, &out);
> +		ret = -EIO;
> +		goto teardown;
> +	}
> +
>  	return 0;
>  
>  	/*
> @@ -431,50 +633,65 @@ int tdx_vm_init(struct kvm *kvm)
>  	return ret;
>  }
>  
> -int tdx_dev_ioctl(void __user *argp)
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
>  {
> -	struct kvm_tdx_capabilities __user *user_caps;
> -	struct kvm_tdx_capabilities caps;
> -	struct kvm_tdx_cmd cmd;
> +	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +	struct kvm_tdx_init_vm *init_vm = NULL;
> +	struct td_params *td_params = NULL;
> +	void *entries_end;
> +	int ret;
>  
> -	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> -		     sizeof(struct tdx_cpuid_config));
> +	BUILD_BUG_ON(sizeof(*init_vm) != 16 * 1024);
> +	BUILD_BUG_ON((sizeof(*init_vm) - offsetof(typeof(*init_vm), entries)) /
> +		     sizeof(init_vm->entries[0]) < KVM_MAX_CPUID_ENTRIES);
> +	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
>  
> -	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> -		return -EFAULT;
> -	if (cmd.flags || cmd.error || cmd.unused)
> +	if (is_td_initialized(kvm))
>  		return -EINVAL;
> -	/*
> -	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> -	 * mem_enc_ioctl().
> -	 */
> -	if (cmd.id != KVM_TDX_CAPABILITIES)
> +
> +	if (cmd->flags)
>  		return -EINVAL;
>  
> -	user_caps = (void __user *)cmd.data;
> -	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> -		return -EFAULT;
> +	init_vm = kzalloc(sizeof(*init_vm), GFP_KERNEL);
> +	if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
>  
> -	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> -		return -E2BIG;
> +	ret = -EINVAL;
> +	if (init_vm->cpuid.padding)
> +		goto out;
> +	/* init_vm->entries shouldn't overrun. */
> +	entries_end = init_vm->entries + init_vm->cpuid.nent;
> +	if (entries_end > (void *)(init_vm + 1))
> +		goto out;
> +	/* Unused part must be zero. */
> +	if (memchr_inv(entries_end, 0, (void *)(init_vm + 1) - entries_end))
> +		goto out;
>  
> -	caps = (struct kvm_tdx_capabilities) {
> -		.attrs_fixed0 = tdx_caps.attrs_fixed0,
> -		.attrs_fixed1 = tdx_caps.attrs_fixed1,
> -		.xfam_fixed0 = tdx_caps.xfam_fixed0,
> -		.xfam_fixed1 = tdx_caps.xfam_fixed1,
> -		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> -		.padding = 0,
> -	};
> +	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> +	if (!td_params) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
>  
> -	if (copy_to_user(user_caps, &caps, sizeof(caps)))
> -		return -EFAULT;
> -	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> -			 tdx_caps.nr_cpuid_configs *
> -			 sizeof(struct tdx_cpuid_config)))
> -		return -EFAULT;
> +	ret = setup_tdparams(kvm, td_params, init_vm);
> +	if (ret)
> +		goto out;
>  
> -	return 0;
> +	ret = __tdx_td_init(kvm, td_params);
> +	if (ret)
> +		goto out;
> +
> +	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> +	kvm_tdx->attributes = td_params->attributes;
> +	kvm_tdx->xfam = td_params->xfam;
> +
> +out:
> +	/* kfree() accepts NULL. */
> +	kfree(init_vm);
> +	kfree(td_params);
> +	return ret;
>  }
>  
>  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> @@ -490,6 +707,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
>  	mutex_lock(&kvm->lock);
>  
>  	switch (tdx_cmd.id) {
> +	case KVM_TDX_INIT_VM:
> +		r = tdx_td_init(kvm, &tdx_cmd);
> +		break;
>  	default:
>  		r = -EINVAL;
>  		goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 938314635b47..ff0ea9cad347 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -18,7 +18,11 @@ struct kvm_tdx {
>  	struct tdx_td_page tdr;
>  	struct tdx_td_page *tdcs;
>  
> +	u64 attributes;
> +	u64 xfam;
>  	int hkid;
> +
> +	u64 tsc_offset;
>  };
>  
>  struct vcpu_tdx {
> @@ -48,6 +52,11 @@ static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_tdx, vcpu);
>  }
>  
> +static inline bool is_td_initialized(struct kvm *kvm)
> +{
> +	return to_kvm_tdx(kvm)->hkid > 0;
> +}

There is a similar function, is_hkid_assigned(), which is previously defined. Do
you think redefining a new function here will bring code redundency?

> +
>  static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
>  {
>  #define VMCS_ENC_ACCESS_TYPE_MASK	0x1UL
> @@ -148,6 +157,19 @@ TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
>  TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
>  TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
>  
> +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> +{
> +	struct tdx_module_output out;
> +	u64 err;
> +
> +	err = tdh_mng_rd(kvm_tdx->tdr.pa, TDCS_EXEC(field), &out);
> +	if (unlikely(err)) {
> +		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
> +		return 0;
> +	}
> +	return out.r8;
> +}
> +
>  #else
>  struct kvm_tdx {
>  	struct kvm kvm;
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..531a0033e530 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
>  /* Trust Domain eXtension sub-ioctl() commands. */
>  enum kvm_tdx_cmd_id {
>  	KVM_TDX_CAPABILITIES = 0,
> +	KVM_TDX_INIT_VM,
>  
>  	KVM_TDX_CMD_NR_MAX,
>  };
> @@ -583,4 +584,36 @@ struct kvm_tdx_capabilities {
>  	struct kvm_tdx_cpuid_config cpuid_configs[0];
>  };
>  
> +struct kvm_tdx_init_vm {
> +	__u64 attributes;
> +	__u32 max_vcpus;
> +	__u32 padding;
> +	__u64 mrconfigid[6];    /* sha384 digest */
> +	__u64 mrowner[6];       /* sha384 digest */
> +	__u64 mrownerconfig[6]; /* sha348 digest */
> +	union {
> +		/*
> +		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> +		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
> +		 *
> +		 * This configuration supersedes KVM_SET_CPUID{,2}.
> +		 * The user space VMM, e.g. qemu, should make them consistent
> +		 * with this values.
> +		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> +		 * = 8KB.
> +		 */
> +		struct {
> +			struct kvm_cpuid2 cpuid;
> +			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
> +			struct kvm_cpuid_entry2 entries[];
> +		};
> +		/*
> +		 * For future extensibility.
> +		 * The size(struct kvm_tdx_init_vm) = 16KB.
> +		 * This should be enough given sizeof(TD_PARAMS) = 1024
> +		 */
> +		__u64 reserved[2028];
> +	};
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
  
Isaku Yamahata Jan. 12, 2023, 3:12 a.m. UTC | #4
On Wed, Jan 04, 2023 at 03:59:08PM +0800,
"Wang, Lei" <lei4.wang@intel.com> wrote:

> On 10/30/2022 2:22 PM, isaku.yamahata@intel.com wrote:
> > From: Xiaoyao Li <xiaoyao.li@intel.com>
> > 
> > TDX requires additional parameters for TDX VM for confidential execution to
> > protect its confidentiality of its memory contents and its CPU state from
> > any other software, including VMM. When creating guest TD VM before
> > creating vcpu, the number of vcpu, TSC frequency (that is same among
> > vcpus. and it can't be changed.)  CPUIDs which is emulated by the TDX
> > module. It means guest can trust those CPUIDs. and sha384 values for
> > measurement.
> > 
> > Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest.  It
> > assigns encryption key to the TDX guest for memory encryption.  TDX
> > encrypts memory per-guest bases.  It assigns device model passes per-VM
> > parameters for the TDX guest.  The maximum number of vcpus, tsc frequency
> > (TDX guest has fised VM-wide TSC frequency. not per-vcpu.  The TDX guest
> > can not change it.), attributes (production or debug), available extended
> > features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
> > measurements, and etc.
> > 
> > This subcommand is called before creating vcpu and KVM_SET_CPUID2, i.e.
> > cpuids configurations aren't available yet.  So CPUIDs configuration values
> > needs to be passed in struct kvm_init_vm.  It's device model responsibility
> 
> I suppose this should be kvm_tdx_init_vm.
> 
> > to make this cpuid config for KVM_TDX_INIT_VM and KVM_SET_CPUID2.
> > 
> > Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/include/asm/tdx.h            |   3 +
> >  arch/x86/include/uapi/asm/kvm.h       |  31 +++
> >  arch/x86/kvm/vmx/tdx.c                | 296 ++++++++++++++++++++++----
> >  arch/x86/kvm/vmx/tdx.h                |  22 ++
> >  tools/arch/x86/include/uapi/asm/kvm.h |  33 +++
> >  5 files changed, 347 insertions(+), 38 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> > index cd304d323d33..05ac4bfc8f8a 100644
> > --- a/arch/x86/include/asm/tdx.h
> > +++ b/arch/x86/include/asm/tdx.h
> > @@ -131,6 +131,9 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
> >  #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
> >  
> >  #ifdef CONFIG_INTEL_TDX_HOST
> > +
> > +/* -1 indicates CPUID leaf with no sub-leaves. */
> > +#define TDX_CPUID_NO_SUBLEAF	((u32)-1)
> >  struct tdx_cpuid_config {
> >  	u32	leaf;
> >  	u32	sub_leaf;
> > diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> > index 2ad9666e02a5..26661879c031 100644
> > --- a/arch/x86/include/uapi/asm/kvm.h
> > +++ b/arch/x86/include/uapi/asm/kvm.h
> > @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
> >  /* Trust Domain eXtension sub-ioctl() commands. */
> >  enum kvm_tdx_cmd_id {
> >  	KVM_TDX_CAPABILITIES = 0,
> > +	KVM_TDX_INIT_VM,
> >  
> >  	KVM_TDX_CMD_NR_MAX,
> >  };
> > @@ -583,4 +584,34 @@ struct kvm_tdx_capabilities {
> >  	struct kvm_tdx_cpuid_config cpuid_configs[0];
> >  };
> >  
> > +struct kvm_tdx_init_vm {
> > +	__u64 attributes;
> > +	__u64 mrconfigid[6];	/* sha384 digest */
> > +	__u64 mrowner[6];	/* sha384 digest */
> > +	__u64 mrownerconfig[6];	/* sha348 digest */
> > +	union {
> > +		/*
> > +		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> > +		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
> > +		 *
> > +		 * This configuration supersedes KVM_SET_CPUID{,2}.
> > +		 * The user space VMM, e.g. qemu, should make them consistent
> > +		 * with this values.
> > +		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> > +		 * = 8KB.
> > +		 */
> > +		struct {
> > +			struct kvm_cpuid2 cpuid;
> > +			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
> > +			struct kvm_cpuid_entry2 entries[];
> > +		};
> > +		/*
> > +		 * For future extensibility.
> > +		 * The size(struct kvm_tdx_init_vm) = 16KB.
> > +		 * This should be enough given sizeof(TD_PARAMS) = 1024
> > +		 */
> > +		__u64 reserved[2029];
> > +	};
> > +};
> > +
> >  #endif /* _ASM_X86_KVM_H */
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index d77709a6da51..54045e0576e7 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -284,6 +284,205 @@ static int tdx_do_tdh_mng_key_config(void *param)
> >  int tdx_vm_init(struct kvm *kvm)
> >  {
> >  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > +
> > +	kvm_tdx->hkid = -1;
> > +
> > +	/*
> > +	 * This function initializes only KVM software construct.  It doesn't
> > +	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> > +	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> > +	 */
> > +
> > +	return 0;
> > +}
> > +
> > +int tdx_dev_ioctl(void __user *argp)
> > +{
> > +	struct kvm_tdx_capabilities __user *user_caps;
> > +	struct kvm_tdx_capabilities caps;
> > +	struct kvm_tdx_cmd cmd;
> > +
> > +	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> > +		     sizeof(struct tdx_cpuid_config));
> > +
> > +	if (copy_from_user(&cmd, argp, sizeof(cmd)))
> > +		return -EFAULT;
> > +	if (cmd.flags || cmd.error || cmd.unused)
> > +		return -EINVAL;
> > +	/*
> > +	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> > +	 * mem_enc_ioctl().
> > +	 */
> > +	if (cmd.id != KVM_TDX_CAPABILITIES)
> > +		return -EINVAL;
> > +
> > +	user_caps = (void __user *)cmd.data;
> > +	if (copy_from_user(&caps, user_caps, sizeof(caps)))
> > +		return -EFAULT;
> > +
> > +	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> > +		return -E2BIG;
> > +
> > +	caps = (struct kvm_tdx_capabilities) {
> > +		.attrs_fixed0 = tdx_caps.attrs_fixed0,
> > +		.attrs_fixed1 = tdx_caps.attrs_fixed1,
> > +		.xfam_fixed0 = tdx_caps.xfam_fixed0,
> > +		.xfam_fixed1 = tdx_caps.xfam_fixed1,
> > +		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> > +		.padding = 0,
> > +	};
> > +
> > +	if (copy_to_user(user_caps, &caps, sizeof(caps)))
> > +		return -EFAULT;
> > +	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> > +			 tdx_caps.nr_cpuid_configs *
> > +			 sizeof(struct tdx_cpuid_config)))
> > +		return -EFAULT;
> > +
> > +	return 0;
> > +}
> 
> tdx_dev_ioctl() is introduced in previous patch with the same code added here,
> which means this is just a place change and it will confuse reviewers. Is it
> neccesary to do so?

Right. Somehow diff was also confused. I'll try to twist the previous patch to
introduce __tdx_td_init() with the previous patch.


...

> > +static inline bool is_td_initialized(struct kvm *kvm)
> > +{
> > +	return to_kvm_tdx(kvm)->hkid > 0;
> > +}
> 
> There is a similar function, is_hkid_assigned(), which is previously defined. Do
> you think redefining a new function here will bring code redundency?

I'll drop this function and replace it with is_hkid_assigned().
  

Patch

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index cd304d323d33..05ac4bfc8f8a 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -131,6 +131,9 @@  static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
 #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
 
 #ifdef CONFIG_INTEL_TDX_HOST
+
+/* -1 indicates CPUID leaf with no sub-leaves. */
+#define TDX_CPUID_NO_SUBLEAF	((u32)-1)
 struct tdx_cpuid_config {
 	u32	leaf;
 	u32	sub_leaf;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2ad9666e02a5..26661879c031 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -538,6 +538,7 @@  struct kvm_pmu_event_filter {
 /* Trust Domain eXtension sub-ioctl() commands. */
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -583,4 +584,34 @@  struct kvm_tdx_capabilities {
 	struct kvm_tdx_cpuid_config cpuid_configs[0];
 };
 
+struct kvm_tdx_init_vm {
+	__u64 attributes;
+	__u64 mrconfigid[6];	/* sha384 digest */
+	__u64 mrowner[6];	/* sha384 digest */
+	__u64 mrownerconfig[6];	/* sha348 digest */
+	union {
+		/*
+		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
+		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
+		 *
+		 * This configuration supersedes KVM_SET_CPUID{,2}.
+		 * The user space VMM, e.g. qemu, should make them consistent
+		 * with this values.
+		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
+		 * = 8KB.
+		 */
+		struct {
+			struct kvm_cpuid2 cpuid;
+			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
+			struct kvm_cpuid_entry2 entries[];
+		};
+		/*
+		 * For future extensibility.
+		 * The size(struct kvm_tdx_init_vm) = 16KB.
+		 * This should be enough given sizeof(TD_PARAMS) = 1024
+		 */
+		__u64 reserved[2029];
+	};
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index d77709a6da51..54045e0576e7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -284,6 +284,205 @@  static int tdx_do_tdh_mng_key_config(void *param)
 int tdx_vm_init(struct kvm *kvm)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+	kvm_tdx->hkid = -1;
+
+	/*
+	 * This function initializes only KVM software construct.  It doesn't
+	 * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
+	 * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
+	 */
+
+	return 0;
+}
+
+int tdx_dev_ioctl(void __user *argp)
+{
+	struct kvm_tdx_capabilities __user *user_caps;
+	struct kvm_tdx_capabilities caps;
+	struct kvm_tdx_cmd cmd;
+
+	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
+		     sizeof(struct tdx_cpuid_config));
+
+	if (copy_from_user(&cmd, argp, sizeof(cmd)))
+		return -EFAULT;
+	if (cmd.flags || cmd.error || cmd.unused)
+		return -EINVAL;
+	/*
+	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
+	 * mem_enc_ioctl().
+	 */
+	if (cmd.id != KVM_TDX_CAPABILITIES)
+		return -EINVAL;
+
+	user_caps = (void __user *)cmd.data;
+	if (copy_from_user(&caps, user_caps, sizeof(caps)))
+		return -EFAULT;
+
+	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
+		return -E2BIG;
+
+	caps = (struct kvm_tdx_capabilities) {
+		.attrs_fixed0 = tdx_caps.attrs_fixed0,
+		.attrs_fixed1 = tdx_caps.attrs_fixed1,
+		.xfam_fixed0 = tdx_caps.xfam_fixed0,
+		.xfam_fixed1 = tdx_caps.xfam_fixed1,
+		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
+		.padding = 0,
+	};
+
+	if (copy_to_user(user_caps, &caps, sizeof(caps)))
+		return -EFAULT;
+	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
+			 tdx_caps.nr_cpuid_configs *
+			 sizeof(struct tdx_cpuid_config)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * cpuid entry lookup in TDX cpuid config way.
+ * The difference is how to specify index(subleaves).
+ * Specify index to TDX_CPUID_NO_SUBLEAF for CPUID leaf with no-subleaves.
+ */
+static const struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(const struct kvm_cpuid2 *cpuid,
+							   u32 function, u32 index)
+{
+	int i;
+
+	/* In TDX CPU CONFIG, TDX_CPUID_NO_SUBLEAF means index = 0. */
+	if (index == TDX_CPUID_NO_SUBLEAF)
+		index = 0;
+
+	for (i = 0; i < cpuid->nent; i++) {
+		const struct kvm_cpuid_entry2 *e = &cpuid->entries[i];
+
+		if (e->function == function &&
+		    (e->index == index ||
+		     !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+			return e;
+	}
+	return NULL;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+			struct kvm_tdx_init_vm *init_vm)
+{
+	const struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+	const struct kvm_cpuid_entry2 *entry;
+	u64 guest_supported_xcr0;
+	u64 guest_supported_xss;
+	int max_pa;
+	int i;
+
+	if (kvm->created_vcpus)
+		return -EBUSY;
+	td_params->max_vcpus = kvm->max_vcpus;
+	td_params->attributes = init_vm->attributes;
+	if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
+		/*
+		 * TODO: save/restore PMU related registers around TDENTER.
+		 * Once it's done, remove this guard.
+		 */
+		pr_warn("TD doesn't support perfmon yet. KVM needs to save/restore "
+			"host perf registers properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
+		const struct tdx_cpuid_config *config = &tdx_caps.cpuid_configs[i];
+		const struct kvm_cpuid_entry2 *entry =
+			tdx_find_cpuid_entry(cpuid, config->leaf, config->sub_leaf);
+		struct tdx_cpuid_value *value = &td_params->cpuid_values[i];
+
+		if (!entry)
+			continue;
+
+		value->eax = entry->eax & config->eax;
+		value->ebx = entry->ebx & config->ebx;
+		value->ecx = entry->ecx & config->ecx;
+		value->edx = entry->edx & config->edx;
+	}
+
+	max_pa = 36;
+	entry = tdx_find_cpuid_entry(cpuid, 0x80000008, 0);
+	if (entry)
+		max_pa = entry->eax & 0xff;
+
+	td_params->eptp_controls = VMX_EPTP_MT_WB;
+	/*
+	 * No CPU supports 4-level && max_pa > 48.
+	 * "5-level paging and 5-level EPT" section 4.1 4-level EPT
+	 * "4-level EPT is limited to translating 48-bit guest-physical
+	 *  addresses."
+	 * cpu_has_vmx_ept_5levels() check is just in case.
+	 */
+	if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
+		td_params->eptp_controls |= VMX_EPTP_PWL_5;
+		td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
+	} else {
+		td_params->eptp_controls |= VMX_EPTP_PWL_4;
+	}
+
+	/* Setup td_params.xfam */
+	entry = tdx_find_cpuid_entry(cpuid, 0xd, 0);
+	if (entry)
+		guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32));
+	else
+		guest_supported_xcr0 = 0;
+	guest_supported_xcr0 &= kvm_caps.supported_xcr0;
+
+	entry = tdx_find_cpuid_entry(cpuid, 0xd, 1);
+	if (entry)
+		guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32));
+	else
+		guest_supported_xss = 0;
+	/* PT can be exposed to TD guest regardless of KVM's XSS support */
+	guest_supported_xss &= (kvm_caps.supported_xss | XFEATURE_MASK_PT);
+
+	td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
+	if (td_params->xfam & XFEATURE_MASK_LBR) {
+		/*
+		 * TODO: once KVM supports LBR(save/restore LBR related
+		 * registers around TDENTER), remove this guard.
+		 */
+		pr_warn("TD doesn't support LBR yet. KVM needs to save/restore "
+			"IA32_LBR_DEPTH properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (td_params->xfam & XFEATURE_MASK_XTILE) {
+		/*
+		 * TODO: once KVM supports AMX(save/restore AMX related
+		 * registers around TDENTER), remove this guard.
+		 */
+		pr_warn("TD doesn't support AMX yet. KVM needs to save/restore "
+			"IA32_XFD, IA32_XFD_ERR properly.\n");
+		return -EOPNOTSUPP;
+	}
+
+	td_params->tsc_frequency =
+		TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+
+#define MEMCPY_SAME_SIZE(dst, src)				\
+	do {							\
+		BUILD_BUG_ON(sizeof(dst) != sizeof(src));	\
+		memcpy((dst), (src), sizeof(dst));		\
+	} while (0)
+
+	MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+	MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+	MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+	return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params)
+{
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct tdx_module_output out;
 	cpumask_var_t packages;
 	int ret, i;
 	u64 err;
@@ -390,10 +589,13 @@  int tdx_vm_init(struct kvm *kvm)
 		tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
 	}
 
-	/*
-	 * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a dedicated
-	 * ioctl() to define the configure CPUID values for the TD.
-	 */
+	err = tdh_mng_init(kvm_tdx->tdr.pa, __pa(td_params), &out);
+	if (WARN_ON_ONCE(err)) {
+		pr_tdx_error(TDH_MNG_INIT, err, &out);
+		ret = -EIO;
+		goto teardown;
+	}
+
 	return 0;
 
 	/*
@@ -431,50 +633,65 @@  int tdx_vm_init(struct kvm *kvm)
 	return ret;
 }
 
-int tdx_dev_ioctl(void __user *argp)
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
 {
-	struct kvm_tdx_capabilities __user *user_caps;
-	struct kvm_tdx_capabilities caps;
-	struct kvm_tdx_cmd cmd;
+	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+	struct kvm_tdx_init_vm *init_vm = NULL;
+	struct td_params *td_params = NULL;
+	void *entries_end;
+	int ret;
 
-	BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
-		     sizeof(struct tdx_cpuid_config));
+	BUILD_BUG_ON(sizeof(*init_vm) != 16 * 1024);
+	BUILD_BUG_ON((sizeof(*init_vm) - offsetof(typeof(*init_vm), entries)) /
+		     sizeof(init_vm->entries[0]) < KVM_MAX_CPUID_ENTRIES);
+	BUILD_BUG_ON(sizeof(struct td_params) != 1024);
 
-	if (copy_from_user(&cmd, argp, sizeof(cmd)))
-		return -EFAULT;
-	if (cmd.flags || cmd.error || cmd.unused)
+	if (is_td_initialized(kvm))
 		return -EINVAL;
-	/*
-	 * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
-	 * mem_enc_ioctl().
-	 */
-	if (cmd.id != KVM_TDX_CAPABILITIES)
+
+	if (cmd->flags)
 		return -EINVAL;
 
-	user_caps = (void __user *)cmd.data;
-	if (copy_from_user(&caps, user_caps, sizeof(caps)))
-		return -EFAULT;
+	init_vm = kzalloc(sizeof(*init_vm), GFP_KERNEL);
+	if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) {
+		ret = -EFAULT;
+		goto out;
+	}
 
-	if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
-		return -E2BIG;
+	ret = -EINVAL;
+	if (init_vm->cpuid.padding)
+		goto out;
+	/* init_vm->entries shouldn't overrun. */
+	entries_end = init_vm->entries + init_vm->cpuid.nent;
+	if (entries_end > (void *)(init_vm + 1))
+		goto out;
+	/* Unused part must be zero. */
+	if (memchr_inv(entries_end, 0, (void *)(init_vm + 1) - entries_end))
+		goto out;
 
-	caps = (struct kvm_tdx_capabilities) {
-		.attrs_fixed0 = tdx_caps.attrs_fixed0,
-		.attrs_fixed1 = tdx_caps.attrs_fixed1,
-		.xfam_fixed0 = tdx_caps.xfam_fixed0,
-		.xfam_fixed1 = tdx_caps.xfam_fixed1,
-		.nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
-		.padding = 0,
-	};
+	td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+	if (!td_params) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-	if (copy_to_user(user_caps, &caps, sizeof(caps)))
-		return -EFAULT;
-	if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
-			 tdx_caps.nr_cpuid_configs *
-			 sizeof(struct tdx_cpuid_config)))
-		return -EFAULT;
+	ret = setup_tdparams(kvm, td_params, init_vm);
+	if (ret)
+		goto out;
 
-	return 0;
+	ret = __tdx_td_init(kvm, td_params);
+	if (ret)
+		goto out;
+
+	kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+	kvm_tdx->attributes = td_params->attributes;
+	kvm_tdx->xfam = td_params->xfam;
+
+out:
+	/* kfree() accepts NULL. */
+	kfree(init_vm);
+	kfree(td_params);
+	return ret;
 }
 
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
@@ -490,6 +707,9 @@  int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 	mutex_lock(&kvm->lock);
 
 	switch (tdx_cmd.id) {
+	case KVM_TDX_INIT_VM:
+		r = tdx_td_init(kvm, &tdx_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 938314635b47..ff0ea9cad347 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -18,7 +18,11 @@  struct kvm_tdx {
 	struct tdx_td_page tdr;
 	struct tdx_td_page *tdcs;
 
+	u64 attributes;
+	u64 xfam;
 	int hkid;
+
+	u64 tsc_offset;
 };
 
 struct vcpu_tdx {
@@ -48,6 +52,11 @@  static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_tdx, vcpu);
 }
 
+static inline bool is_td_initialized(struct kvm *kvm)
+{
+	return to_kvm_tdx(kvm)->hkid > 0;
+}
+
 static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
 {
 #define VMCS_ENC_ACCESS_TYPE_MASK	0x1UL
@@ -148,6 +157,19 @@  TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
 TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
 TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
 
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+	struct tdx_module_output out;
+	u64 err;
+
+	err = tdh_mng_rd(kvm_tdx->tdr.pa, TDCS_EXEC(field), &out);
+	if (unlikely(err)) {
+		pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+		return 0;
+	}
+	return out.r8;
+}
+
 #else
 struct kvm_tdx {
 	struct kvm kvm;
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 2ad9666e02a5..531a0033e530 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -538,6 +538,7 @@  struct kvm_pmu_event_filter {
 /* Trust Domain eXtension sub-ioctl() commands. */
 enum kvm_tdx_cmd_id {
 	KVM_TDX_CAPABILITIES = 0,
+	KVM_TDX_INIT_VM,
 
 	KVM_TDX_CMD_NR_MAX,
 };
@@ -583,4 +584,36 @@  struct kvm_tdx_capabilities {
 	struct kvm_tdx_cpuid_config cpuid_configs[0];
 };
 
+struct kvm_tdx_init_vm {
+	__u64 attributes;
+	__u32 max_vcpus;
+	__u32 padding;
+	__u64 mrconfigid[6];    /* sha384 digest */
+	__u64 mrowner[6];       /* sha384 digest */
+	__u64 mrownerconfig[6]; /* sha348 digest */
+	union {
+		/*
+		 * KVM_TDX_INIT_VM is called before vcpu creation, thus before
+		 * KVM_SET_CPUID2.  CPUID configurations needs to be passed.
+		 *
+		 * This configuration supersedes KVM_SET_CPUID{,2}.
+		 * The user space VMM, e.g. qemu, should make them consistent
+		 * with this values.
+		 * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
+		 * = 8KB.
+		 */
+		struct {
+			struct kvm_cpuid2 cpuid;
+			/* 8KB with KVM_MAX_CPUID_ENTRIES. */
+			struct kvm_cpuid_entry2 entries[];
+		};
+		/*
+		 * For future extensibility.
+		 * The size(struct kvm_tdx_init_vm) = 16KB.
+		 * This should be enough given sizeof(TD_PARAMS) = 1024
+		 */
+		__u64 reserved[2028];
+	};
+};
+
 #endif /* _ASM_X86_KVM_H */