[v9,4/6] KVM: x86: Introduce untag_addr() in kvm_x86_ops

Message ID 20230606091842.13123-5-binbin.wu@linux.intel.com
State New
Series Linear Address Masking (LAM) KVM Enabling

Commit Message

Binbin Wu June 6, 2023, 9:18 a.m. UTC
Introduce a new optional interface untag_addr() in kvm_x86_ops to untag
the metadata from a linear address. Implement the LAM version in VMX.

When a feature like Intel Linear Address Masking (LAM) or AMD Upper
Address Ignore (UAI) is enabled, a linear address may be tagged with
metadata. The linear address must be checked for modified canonicality
and untagged in instruction emulation and VM-exit handlers when LAM or
UAI is applicable.

Introduce untag_addr() in kvm_x86_ops to hide the vendor-specific code.
Pass 'flags' so the common emulator path need not distinguish the
processor vendor for cases whose untag policies may differ in the
future.

For VMX, the LAM version is implemented.

Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
Tested-by: Xuelian Guo <xuelian.guo@intel.com>
Reviewed-by: Chao Gao <chao.gao@intel.com>
---
 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  2 +
 arch/x86/kvm/kvm_emulate.h         |  1 +
 arch/x86/kvm/vmx/vmx.c             | 73 ++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.h             |  2 +
 5 files changed, 79 insertions(+)
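
As context, a minimal sketch of how common x86 code can consume the new
optional hook (the wrapper below is an assumption, not part of this
patch; the hook name and signature are taken from the diff):

/*
 * Assumed wrapper, for illustration only: a KVM_X86_OP_OPTIONAL hook
 * can be invoked via static_call_cond(), which compiles to a nop on
 * vendors that leave the hook NULL (e.g. SVM here).
 */
static void kvm_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)
{
	static_call_cond(kvm_x86_untag_addr)(vcpu, gva, flags);
}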
  

Comments

Sean Christopherson June 28, 2023, 12:15 a.m. UTC | #1
On Tue, Jun 06, 2023, Binbin Wu wrote:
> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
> index 5b9ec610b2cb..c2091e24a6b9 100644
> --- a/arch/x86/kvm/kvm_emulate.h
> +++ b/arch/x86/kvm/kvm_emulate.h
> @@ -91,6 +91,7 @@ struct x86_instruction_info {
>  /* x86-specific emulation flags */
>  #define X86EMUL_F_FETCH			BIT(0)
>  #define X86EMUL_F_WRITE			BIT(1)
> +#define X86EMUL_F_SKIPLAM		BIT(2)

See my comments in the LASS series about describing the access, not dictating
the end behavior.
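
For reference, a sketch of access-describing flags along those lines
(the IMPLICIT/INVLPG names come up later in this thread, so treating
them as the eventual set is an assumption):

/* x86-specific emulation flags: describe the access, not the policy. */
#define X86EMUL_F_FETCH			BIT(0)
#define X86EMUL_F_WRITE			BIT(1)
#define X86EMUL_F_IMPLICIT		BIT(2)	/* implicit system access */
#define X86EMUL_F_INVLPG		BIT(3)	/* address operand of INVLPG */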

>  
>  struct x86_emulate_ops {
>  	void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 52dcf3c00bb8..82a225d1000e 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8133,6 +8133,77 @@ static void vmx_vm_destroy(struct kvm *kvm)
>  	free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
>  }
>  
> +#define LAM_S57_EN_MASK (X86_CR4_LAM_SUP | X86_CR4_LA57)
> +static int lam_sign_extend_bit(struct kvm_vcpu *vcpu, gva_t addr)
> +{
> +	u64 cr3, cr4;
> +
> +	/*
> +	 * The LAM identification of a pointer as user or supervisor is
> +	 * based solely on the value of pointer bit 63.
> +	 */
> +	if (!(addr >> 63)) {

BIT_ULL(63)

> +		cr3 = kvm_read_cr3(vcpu);

Use the perfectly good helper added earlier in the series:

		cr3_lam = kvm_get_active_lam_bits();

That has the added bonus of avoiding a VMREAD of CR3 when LAM is disabled in CR4.
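
The helper isn't shown in this patch; a minimal sketch of its presumed
shape, assuming it short-circuits on guest CPUID before touching CR3:

static inline u64 kvm_get_active_lam_bits(struct kvm_vcpu *vcpu)
{
	/*
	 * Assumed implementation: skip the CR3 read (a likely VMREAD)
	 * entirely when the guest can't use LAM.
	 */
	if (!guest_cpuid_has(vcpu, X86_FEATURE_LAM))
		return 0;

	return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
}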

> +		if (cr3 & X86_CR3_LAM_U57)
> +			return 56;
> +		if (cr3 & X86_CR3_LAM_U48)
> +			return 47;
> +	} else {
> +		cr4 = kvm_read_cr4_bits(vcpu, LAM_S57_EN_MASK);
> +		if (cr4 == LAM_S57_EN_MASK)
> +			return 56;
> +		if (cr4 & X86_CR4_LAM_SUP)
> +			return 47;

This is way too complicated for a simple thing.  Burying multiple bits in a #define
and then relying on specific bits being in the mask is unnecessarily subtle.

And this whole helper shouldn't exist.  There's one caller, and will only ever
be one caller.  Defining magic numbers, i.e. using -1 to signal "disabled", makes
it that much harder to read the code.

More below.

> +	}
> +	return -1;
> +}
> +
> +/*
> + * Only called in 64-bit mode.
> + *
> + * LAM has a modified canonical check when applicable:
> + * LAM_S48                : [ 1 ][ metadata ][ 1 ]
> + *                            63               47
> + * LAM_U48                : [ 0 ][ metadata ][ 0 ]
> + *                            63               47
> + * LAM_S57                : [ 1 ][ metadata ][ 1 ]
> + *                            63               56
> + * LAM_U57 + 5-lvl paging : [ 0 ][ metadata ][ 0 ]
> + *                            63               56
> + * LAM_U57 + 4-lvl paging : [ 0 ][ metadata ][ 0...0 ]
> + *                            63               56..47
> + *
> + * Untag the metadata bits by sign-extending the value of bit 47 (LAM48) or
> + * bit 56 (LAM57). The resulting address after untag isn't guaranteed to be
> + * canonical. Callers should perform the original canonical check and raise
> + * #GP/#SS if the address is non-canonical.
> + *
> + * Note that KVM masks the metadata in addresses, performs the (original)
> + * canonicality checking and then walks the page tables. This is slightly
> + * different from hardware behavior but achieves the same effect.
> + * Specifically, if LAM is enabled, the processor performs a modified
> + * canonicality checking where the metadata are ignored instead of
> + * masked. After the modified canonicality checking, the processor masks
> + * the metadata before passing addresses for paging translation.
> + */
> +void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)

Rather than modify the pointer, return the untagged address.  That's more flexible
as it allows using the result in if-statements and whatnot.  That might not ever
come into play, but there's no good reason to use an in/out param in a void
function.
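
For example, the return value composes directly with the caller's
canonical check (hypothetical call site):

	if (is_noncanonical_address(vmx_get_untagged_addr(vcpu, gva, flags), vcpu))
		kvm_inject_gp(vcpu, 0);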

> +{
> +	int sign_ext_bit;
> +
> +	/*
> +	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
> +	 * If not set, the vCPU doesn't support LAM.
> +	 */
> +	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||

This is unnecessary, KVM should never allow the LAM bits in CR3 to be set if LAM
isn't supported.

> +	    (flags & X86EMUL_F_SKIPLAM) || WARN_ON_ONCE(!is_64_bit_mode(vcpu)))

Same comments as the LASS series, don't WARN, just put the check here.

> +		return;
> +
> +	sign_ext_bit = lam_sign_extend_bit(vcpu, *gva);
> +	if (sign_ext_bit > 0)
> +		*gva = (sign_extend64(*gva, sign_ext_bit) & ~BIT_ULL(63)) |
> +		       (*gva & BIT_ULL(63));


Something like this?  The early return in the user path is a bit forced, e.g. it
could also be:

	if (cr3_bits & X86_CR3_LAM_U57)
		lam_bit = 56;
	else if (cr3_bits & X86_CR3_LAM_U48)
		lam_bit = 47;
	else
		return gva;

but IMO making the CR3 and CR4 paths somewhat symmetrical is valuable.

gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva,
			    unsigned int flags)
{
	unsigned long cr3_bits;
	int lam_bit;

	if (flags & (X86EMUL_F_FETCH | X86EMUL_F_BRANCH_INVLPG | X86EMUL_F_IMPLICIT))
		return gva;

	if (!is_64_bit_mode(vcpu))
		return gva;

	/*
	 * Bit 63 determines if the address should be treated as user address
	 * or a supervisor address.
	 */
	if (!(gva & BIT_ULL(63))) {
		cr3_bits = kvm_get_active_lam_bits(vcpu);
		if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
			return gva;

		/* LAM_U48 is ignored if LAM_U57 is set. */
		lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
	} else {
		if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
			return gva;

		lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
	}
	return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
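
Presumably wired up as the VMX implementation of the renamed hook
(wiring assumed):

	.get_untagged_addr = vmx_get_untagged_addr,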
  
Binbin Wu June 29, 2023, 6:12 a.m. UTC | #2
On 6/28/2023 8:15 AM, Sean Christopherson wrote:
> On Tue, Jun 06, 2023, Binbin Wu wrote:
>> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
>> index 5b9ec610b2cb..c2091e24a6b9 100644
>> --- a/arch/x86/kvm/kvm_emulate.h
>> +++ b/arch/x86/kvm/kvm_emulate.h
>> @@ -91,6 +91,7 @@ struct x86_instruction_info {
>>   /* x86-specific emulation flags */
>>   #define X86EMUL_F_FETCH			BIT(0)
>>   #define X86EMUL_F_WRITE			BIT(1)
>> +#define X86EMUL_F_SKIPLAM		BIT(2)
> See my comments in the LASS series about describing the access, not dictating
> the end behavior.

The suggestion does decouple the common emulator code from the
specific feature, thanks.

>
>>   
>>   struct x86_emulate_ops {
>>   	void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 52dcf3c00bb8..82a225d1000e 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -8133,6 +8133,77 @@ static void vmx_vm_destroy(struct kvm *kvm)
>>   	free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
>>   }
>>   
>> +#define LAM_S57_EN_MASK (X86_CR4_LAM_SUP | X86_CR4_LA57)
>> +static int lam_sign_extend_bit(struct kvm_vcpu *vcpu, gva_t addr)
>> +{
>> +	u64 cr3, cr4;
>> +
>> +	/*
>> +	 * The LAM identification of a pointer as user or supervisor is
>> +	 * based solely on the value of pointer bit 63.
>> +	 */
>> +	if (!(addr >> 63)) {
> BIT_ULL(63)
>
>> +		cr3 = kvm_read_cr3(vcpu);
> Use the perfectly good helper added earlier in the series:
>
> 		cr3_lam = kvm_get_active_lam_bits();
Good suggestion. Thanks.

>
> That has the added bonus of avoiding a VMREAD of CR3 when LAM is disabled in CR4.
Why? I don't get the point.

>
>> +		if (cr3 & X86_CR3_LAM_U57)
>> +			return 56;
>> +		if (cr3 & X86_CR3_LAM_U48)
>> +			return 47;
>> +	} else {
>> +		cr4 = kvm_read_cr4_bits(vcpu, LAM_S57_EN_MASK);
>> +		if (cr4 == LAM_S57_EN_MASK)
>> +			return 56;
>> +		if (cr4 & X86_CR4_LAM_SUP)
>> +			return 47;
> This is way too complicated for a simple thing.  Burying multiple bits in a #define
> and then relying on specific bits being in the mask is unnecessarily subtle.
>
> And this whole helper shouldn't exist.  There's one caller, and will only ever
> be one caller.  Defining magic numbers, i.e. using -1 to signal "disabled", makes
> it that much harder to read the code.
>
> More below.
>
>> +	}
>> +	return -1;
>> +}
>> +
>> +/*
>> + * Only called in 64-bit mode.
>> + *
>> + * LAM has a modified canonical check when applicable:
>> + * LAM_S48                : [ 1 ][ metadata ][ 1 ]
>> + *                            63               47
>> + * LAM_U48                : [ 0 ][ metadata ][ 0 ]
>> + *                            63               47
>> + * LAM_S57                : [ 1 ][ metadata ][ 1 ]
>> + *                            63               56
>> + * LAM_U57 + 5-lvl paging : [ 0 ][ metadata ][ 0 ]
>> + *                            63               56
>> + * LAM_U57 + 4-lvl paging : [ 0 ][ metadata ][ 0...0 ]
>> + *                            63               56..47
>> + *
>> + * Untag the metadata bits by sign-extending the value of bit 47 (LAM48) or
>> + * bit 56 (LAM57). The resulting address after untag isn't guaranteed to be
>> + * canonical. Callers should perform the original canonical check and raise
>> + * #GP/#SS if the address is non-canonical.
>> + *
>> + * Note that KVM masks the metadata in addresses, performs the (original)
>> + * canonicality checking and then walks the page tables. This is slightly
>> + * different from hardware behavior but achieves the same effect.
>> + * Specifically, if LAM is enabled, the processor performs a modified
>> + * canonicality checking where the metadata are ignored instead of
>> + * masked. After the modified canonicality checking, the processor masks
>> + * the metadata before passing addresses for paging translation.
>> + */
>> +void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)
> Rather than modify the pointer, return the untagged address.  That's more flexible
> as it allows using the result in if-statements and whatnot.  That might not ever
> come into play, but there's no good reason to use an in/out param in a void
> function.
In an earlier version, it did return the untagged address.
In this version, I changed it to an in/out param to make the interface
optional and avoid adding a dummy one in SVM.
Can that be a reason?


>
>> +{
>> +	int sign_ext_bit;
>> +
>> +	/*
>> +	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
>> +	 * If not set, the vCPU doesn't support LAM.
>> +	 */
>> +	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||
> This is unnecessary, KVM should never allow the LAM bits in CR3 to be set if LAM
> isn't supported.

OK.
>
>> +	    (flags & X86EMUL_F_SKIPLAM) || WARN_ON_ONCE(!is_64_bit_mode(vcpu)))
> Same comments as the LASS series, don't WARN, just put the check here.
OK.

>
>> +		return;
>> +
>> +	sign_ext_bit = lam_sign_extend_bit(vcpu, *gva);
>> +	if (sign_ext_bit > 0)
>> +		*gva = (sign_extend64(*gva, sign_ext_bit) & ~BIT_ULL(63)) |
>> +		       (*gva & BIT_ULL(63));
>
> Something like this?  The early return in the user path is a bit forced, e.g. it
> could also be:
>
> 	if (cr3_bits & X86_CR3_LAM_U57)
> 		lam_bit = 56;
> 	else if (cr3_bits & X86_CR3_LAM_U48)
> 		lam_bit = 47;
> 	else
> 		return gva;
>
> but IMO making the CR3 and CR4 paths somewhat symmetrical is valuable.
>
> gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva,
> 			    unsigned int flags)
> {
> 	unsigned long cr3_bits;
> 	int lam_bit;
>
> 	if (flags & (X86EMUL_F_FETCH | X86EMUL_F_BRANCH_INVLPG | X86EMUL_F_IMPLICIT))
Thanks for the suggestion. Overall, it looks good to me.

Suppose "X86EMUL_F_BRANCH_INVLPG" should be two flags, for branch and
invlpg, right?

And for LAM, X86EMUL_F_IMPLICIT will not be used, because for implicit
accesses to memory management registers or descriptors the linear base
addresses still need to be canonical, and no hooks will be added to
untag the addresses in those paths.
So I will probably remove the check for X86EMUL_F_IMPLICIT here.

> 		return gva;
>
> 	if (!is_64_bit_mode(vcpu))
> 		return gva;
>
> 	/*
> 	 * Bit 63 determines if the address should be treated as user address
> 	 * or a supervisor address.
> 	 */
> 	if (!(gva & BIT_ULL(63))) {
> 		cr3_bits = kvm_get_active_lam_bits(vcpu);
> 		if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
> 			return gva;
>
> 		/* LAM_U48 is ignored if LAM_U57 is set. */
> 		lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
> 	} else {
> 		if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
> 			return gva;
>
> 		lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
> 	}
> 	return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
> }
>
  
Chao Gao June 29, 2023, 6:57 a.m. UTC | #3
On Thu, Jun 29, 2023 at 02:12:27PM +0800, Binbin Wu wrote:
>> > +	/*
>> > +	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
>> > +	 * If not set, the vCPU doesn't support LAM.
>> > +	 */
>> > +	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||
>> This is unnecessary, KVM should never allow the LAM bits in CR3 to be set if LAM
>> isn't supported.

A corner case is:

If EPT is enabled, CR3 writes are not trapped. Guests can then set the
LAM bits in CR3 if hardware supports LAM, regardless of whether or not
the guest enumerates LAM.
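
That is, the LAM-bit legality check naturally lives in the MOV-to-CR3
emulation path, which is bypassed when CR3 writes aren't intercepted.
A sketch, with assumed names:

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	/* Only reached when CR3 writes are intercepted, i.e. not with EPT. */
	if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
		return 1;	/* caller injects #GP */

	/* ... rest of kvm_set_cr3() ... */
}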
  
Binbin Wu June 29, 2023, 7:22 a.m. UTC | #4
On 6/29/2023 2:57 PM, Chao Gao wrote:
> On Thu, Jun 29, 2023 at 02:12:27PM +0800, Binbin Wu wrote:
>>>> +	/*
>>>> +	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
>>>> +	 * If not set, the vCPU doesn't support LAM.
>>>> +	 */
>>>> +	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||
>>> This is unnecessary, KVM should never allow the LAM bits in CR3 to be set if LAM
>>> isn't supported.
> A corner case is:
>
> If EPT is enabled, CR3 writes are not trapped. Guests can then set the
> LAM bits in CR3 if hardware supports LAM, regardless of whether or not
> the guest enumerates LAM.
I recalled the main reason why I added the check.
It's there to avoid the subsequent checks on CR3 & CR4, which may cause
an additional VMREAD.

Also, about the virtualization hole: if the guest can enable the LAM
bits in CR3 in non-root mode without causing any problem, that means
the hardware supports LAM. Should KVM continue to untag the address
according to the CR3 setting?
Skipping the untagging will probably cause guest failures, though of
course the guest itself is to blame.
But untagging the address seems to do no harm?
  
David Laight June 29, 2023, 8:30 a.m. UTC | #5
From: Binbin Wu
> Sent: 29 June 2023 07:12
...
> >> +void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)
> >
> > Rather than modify the pointer, return the untagged address.  That's more flexible
> > as it allows using the result in if-statements and whatnot.  That might not ever
> > come into play, but there's no good reason to use an in/out param in a void
> > function.
>
> In an earlier version, it did return the untagged address.
> In this version, I changed it to an in/out param to make the interface
> optional and avoid adding a dummy one in SVM.
> Can that be a reason?

You are always going to need a 'dummy' version.
If it ends up being 'x = x' the compiler will just optimise
it away.

But for a real function you'll get much better code from:
	x = fn(x);
than
	fn(&x);

It also lets you use 'void *' (etc.) to avoid casts, which
can easily hide bugs.

	David
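
A minimal illustration of the codegen point, with hypothetical helpers:

/* Hypothetical helpers, purely to show the difference in generated code. */
extern unsigned long untag(unsigned long addr);
extern void untag_inout(unsigned long *addr);

void example(unsigned long x)
{
	x = untag(x);		/* x can stay in a register across the call */
	untag_inout(&x);	/* x must be addressable, so it is spilled to the stack */
}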

  
Sean Christopherson June 29, 2023, 3:16 p.m. UTC | #6
On Thu, Jun 29, 2023, Binbin Wu wrote:
> On 6/28/2023 8:15 AM, Sean Christopherson wrote:
> > On Tue, Jun 06, 2023, Binbin Wu wrote:
> > Use the perfectly good helper added earlier in the series:
> > 
> > 		cr3_lam = kvm_get_active_lam_bits();
> Good suggestion. Thanks.
> 
> > 
> > That has the added bonus of avoiding a VMREAD of CR3 when LAM is disabled in CR4.
> Why? I don't get the point.

Sorry, typo on my end.  When LAM is disabled in guest CPUID, not CR4.

> > > +void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)
> > Rather than modify the pointer, return the untagged address.  That's more flexible
> > as it allows using the result in if-statements and whatnot.  That might not ever
> > come into play, but there's no good reason to use an in/out param in a void
> > function.
> In an earlier version, it did return the untagged address.
> In this version, I changed it to an in/out param to make the interface
> optional and avoid adding a dummy one in SVM.
> Can that be a reason?

Hmm, no.  You can achieve the same by doing:

	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	if (!kvm_x86_ops.get_untagged_addr)
		return addr;

	return static_call(kvm_x86_get_untagged_addr)(vcpu, addr, flags);

> > gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva,
> > 			    unsigned int flags)
> > {
> > 	unsigned long cr3_bits, cr4_bits;
> > 	int lam_bit;
> > 
> > 	if (flags & (X86EMUL_F_FETCH | X86EMUL_F_BRANCH_INVLPG | X86EMUL_F_IMPLICIT))
> Thanks for the suggestion. Overall, it looks good to me.
> 
> Suppose "X86EMUL_F_BRANCH_INVLPG" should be two flags, for branch and
> invlpg, right?

Yeah, typo again.  Should just be X86EMUL_F_INVLPG, because unlike LASS, LAM
ignores all FETCH types.

> And for LAM, X86EMUL_F_IMPLICIT will not be used, because for implicit
> accesses to memory management registers or descriptors the linear base
> addresses still need to be canonical, and no hooks will be added to
> untag the addresses in those paths.
> So I will probably remove the check for X86EMUL_F_IMPLICIT here.

No, please keep it, e.g. so that changes in the emulator don't lead to breakage,
and to document that they are exempt.

If you want, you could do WARN_ON_ONCE() for the IMPLICIT case, but I don't know
that that's worthwhile, e.g. nothing will go wrong if KVM tries to untag an
implicit access, and deliberately avoiding the call may make it annoying to
consolidate code in the future.
  
Sean Christopherson June 29, 2023, 3:33 p.m. UTC | #7
On Thu, Jun 29, 2023, Binbin Wu wrote:
> On 6/29/2023 2:57 PM, Chao Gao wrote:
> > On Thu, Jun 29, 2023 at 02:12:27PM +0800, Binbin Wu wrote:
> > > > > +	/*
> > > > > +	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
> > > > > +	 * If not set, the vCPU doesn't support LAM.
> > > > > +	 */
> > > > > +	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||
> > > > This is unnecessary, KVM should never allow the LAM bits in CR3 to be set if LAM
> > > > isn't supported.
> > A corner case is:
> > 
> > If EPT is enabled, CR3 writes are not trapped. Guests can then set the
> > LAM bits in CR3 if hardware supports LAM, regardless of whether or not
> > the guest enumerates LAM.

Argh, that's a really obnoxious virtualization hole.

> I recalled the main reason why I added the check.
> It's there to avoid the subsequent checks on CR3 & CR4, which may cause
> an additional VMREAD.

FWIW, that will (and should) be handled by kvm_get_active_lam_bits().  Hmm, though
since CR4.LAM_SUP is a separate thing, that should probably be
kvm_get_active_cr3_lam_bits().

> Also, about the virtualization hole: if the guest can enable the LAM
> bits in CR3 in non-root mode without causing any problem, that means
> the hardware supports LAM. Should KVM continue to untag the address
> according to the CR3 setting?

Hrm, no, KVM should honor the architecture.  The virtualization hole is bad enough
as it is, I don't want to KVM to actively make it worse.

> Skipping the untagging will probably cause guest failures, though of
> course the guest itself is to blame.

Yeah, guest's fault.  The fact that the guest won't get all the #GPs it should
is unfortunate, but intercepting all writes to CR3 just to close the hole is sadly
a really bad tradeoff.

> But untagging the address seems to do no harm?

In and of itself, not really.  But I don't want to set the precedent in KVM that
user LAM is supported regardless of guest CPUID.

Another problem with the virtualization hole is that the guest will be able
to induce VM-Fail when KVM is running as L1, because L0 will likely enforce the
CR3 checks on VM-Enter but not intercept MOV CR3.  I.e. the guest can get an
illegal value into vmcs.GUEST_CR3.  We could add code to explicitly detect that
case to help triage such failures, but I don't know that it's worth the code, e.g.

	if (exit_reason.failed_vmentry) {
		if (boot_cpu_has(X86_FEATURE_LAM) &&
		    !guest_can_use(vcpu, X86_FEATURE_LAM) &&
		    (kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)))
		    	pr_warn_ratelimited("Guest abused LAM virtualization hole\n");
		else
			dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}
  
Binbin Wu June 29, 2023, 5:26 p.m. UTC | #8
On 6/29/2023 11:16 PM, Sean Christopherson wrote:
>> And for LAM, X86EMUL_F_IMPLICIT will not be used, because for implicit
>> accesses to memory management registers or descriptors the linear base
>> addresses still need to be canonical, and no hooks will be added to
>> untag the addresses in those paths.
>> So I will probably remove the check for X86EMUL_F_IMPLICIT here.
> No, please keep it, e.g. so that changes in the emulator don't lead to breakage,
> and to document that they are exempt.
>
> If you want, you could do WARN_ON_ONCE() for the IMPLICIT case, but I don't know
> that that's worthwhile, e.g. nothing will go wrong if KVM tries to untag an
> implicit access, and deliberately avoiding the call may make it annoying to
> consolidate code in the future.
Right.
On second thought, X86EMUL_F_IMPLICIT should be kept in case SVM has a
different implementation and needs to untag addresses for IMPLICIT cases.
  

Patch

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 13bc212cd4bc..c0cebe671d41 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -52,6 +52,7 @@  KVM_X86_OP(cache_reg)
 KVM_X86_OP(get_rflags)
 KVM_X86_OP(set_rflags)
 KVM_X86_OP(get_if_flag)
+KVM_X86_OP_OPTIONAL(untag_addr)
 KVM_X86_OP(flush_tlb_all)
 KVM_X86_OP(flush_tlb_current)
 KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 46471dd9cc1b..62a72560fa65 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1588,6 +1588,8 @@  struct kvm_x86_ops {
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 	bool (*get_if_flag)(struct kvm_vcpu *vcpu);
 
+	void (*untag_addr)(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags);
+
 	void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
 	void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
 	int  (*flush_remote_tlbs)(struct kvm *kvm);
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 5b9ec610b2cb..c2091e24a6b9 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -91,6 +91,7 @@  struct x86_instruction_info {
 /* x86-specific emulation flags */
 #define X86EMUL_F_FETCH			BIT(0)
 #define X86EMUL_F_WRITE			BIT(1)
+#define X86EMUL_F_SKIPLAM		BIT(2)
 
 struct x86_emulate_ops {
 	void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 52dcf3c00bb8..82a225d1000e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8133,6 +8133,77 @@  static void vmx_vm_destroy(struct kvm *kvm)
 	free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
 }
 
+#define LAM_S57_EN_MASK (X86_CR4_LAM_SUP | X86_CR4_LA57)
+static int lam_sign_extend_bit(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	u64 cr3, cr4;
+
+	/*
+	 * The LAM identification of a pointer as user or supervisor is
+	 * based solely on the value of pointer bit 63.
+	 */
+	if (!(addr >> 63)) {
+		cr3 = kvm_read_cr3(vcpu);
+		if (cr3 & X86_CR3_LAM_U57)
+			return 56;
+		if (cr3 & X86_CR3_LAM_U48)
+			return 47;
+	} else {
+		cr4 = kvm_read_cr4_bits(vcpu, LAM_S57_EN_MASK);
+		if (cr4 == LAM_S57_EN_MASK)
+			return 56;
+		if (cr4 & X86_CR4_LAM_SUP)
+			return 47;
+	}
+	return -1;
+}
+
+/*
+ * Only called in 64-bit mode.
+ *
+ * LAM has a modified canonical check when applicable:
+ * LAM_S48                : [ 1 ][ metadata ][ 1 ]
+ *                            63               47
+ * LAM_U48                : [ 0 ][ metadata ][ 0 ]
+ *                            63               47
+ * LAM_S57                : [ 1 ][ metadata ][ 1 ]
+ *                            63               56
+ * LAM_U57 + 5-lvl paging : [ 0 ][ metadata ][ 0 ]
+ *                            63               56
+ * LAM_U57 + 4-lvl paging : [ 0 ][ metadata ][ 0...0 ]
+ *                            63               56..47
+ *
+ * Untag the metadata bits by sign-extending the value of bit 47 (LAM48) or
+ * bit 56 (LAM57). The resulting address after untag isn't guaranteed to be
+ * canonical. Callers should perform the original canonical check and raise
+ * #GP/#SS if the address is non-canonical.
+ *
+ * Note that KVM masks the metadata in addresses, performs the (original)
+ * canonicality checking and then walks the page tables. This is slightly
+ * different from hardware behavior but achieves the same effect.
+ * Specifically, if LAM is enabled, the processor performs a modified
+ * canonicality checking where the metadata are ignored instead of
+ * masked. After the modified canonicality checking, the processor masks
+ * the metadata before passing addresses for paging translation.
+ */
+void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags)
+{
+	int sign_ext_bit;
+
+	/*
+	 * Check LAM_U48 in cr3_ctrl_bits to avoid guest_cpuid_has().
+	 * If not set, the vCPU doesn't support LAM.
+	 */
+	if (!(vcpu->arch.cr3_ctrl_bits & X86_CR3_LAM_U48) ||
+	    (flags & X86EMUL_F_SKIPLAM) || WARN_ON_ONCE(!is_64_bit_mode(vcpu)))
+		return;
+
+	sign_ext_bit = lam_sign_extend_bit(vcpu, *gva);
+	if (sign_ext_bit > 0)
+		*gva = (sign_extend64(*gva, sign_ext_bit) & ~BIT_ULL(63)) |
+		       (*gva & BIT_ULL(63));
+}
+
 static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.name = KBUILD_MODNAME,
 
@@ -8181,6 +8252,8 @@  static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.set_rflags = vmx_set_rflags,
 	.get_if_flag = vmx_get_if_flag,
 
+	.untag_addr = vmx_untag_addr,
+
 	.flush_tlb_all = vmx_flush_tlb_all,
 	.flush_tlb_current = vmx_flush_tlb_current,
 	.flush_tlb_gva = vmx_flush_tlb_gva,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9e66531861cf..c4bbd3024fa8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -433,6 +433,8 @@  void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
 
+void vmx_untag_addr(struct kvm_vcpu *vcpu, gva_t *gva, u32 flags);
+
 static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
 					     int type, bool value)
 {