[RFC,2/3] KVM: arm64: Allow userspace to trap SMCCC sub-ranges

Message ID 20221110015327.3389351-3-oliver.upton@linux.dev
State New
Headers
Series [RFC,1/3] KVM: arm64: Use a generalized accessor for SMCCC args |

Commit Message

Oliver Upton Nov. 10, 2022, 1:53 a.m. UTC
  As the SMCCC (and related specifications) march towards an
'everything and the kitchen sink' interface for interacting with a
system, it is less likely that KVM will implement every supported
feature.

Add a capability that allows userspace to trap hypercall ranges,
allowing the VMM to mix-and-match between calls handled in userspace vs.
KVM.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/include/asm/kvm_host.h |  5 ++++
 arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
 arch/arm64/kvm/arm.c              | 10 +++++++
 arch/arm64/kvm/hypercalls.c       | 48 +++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  1 +
 5 files changed, 79 insertions(+)
  

Comments

Marc Zyngier Nov. 10, 2022, 12:22 p.m. UTC | #1
On Thu, 10 Nov 2022 01:53:26 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> As the SMCCC (and related specifications) march towards an
> 'everything and the kitchen sink' interface for interacting with a
> system, it is less likely that KVM will implement every supported
> feature.
> 
> Add a capability that allows userspace to trap hypercall ranges,
> allowing the VMM to mix-and-match between calls handled in userspace vs.
> KVM.
> 
> Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> ---
>  arch/arm64/include/asm/kvm_host.h |  5 ++++
>  arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
>  arch/arm64/kvm/arm.c              | 10 +++++++
>  arch/arm64/kvm/hypercalls.c       | 48 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |  1 +
>  5 files changed, 79 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index e33ed7c09a28..cc3872f1900c 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -52,6 +52,9 @@
>  
>  #define KVM_HAVE_MMU_RWLOCK
>  
> +#define KVM_ARM_USER_HYPERCALL_FLAGS	\
> +		GENMASK_ULL(KVM_ARM_USER_HYPERCALL_FLAGS_COUNT - 1, 0)
> +
>  /*
>   * Mode of operation configurable with kvm-arm.mode early param.
>   * See Documentation/admin-guide/kernel-parameters.txt for more information.
> @@ -104,11 +107,13 @@ struct kvm_arch_memory_slot {
>  /**
>   * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
>   *
> + * @user_trap_bmap: Bitmap of SMCCC function ranges trapped to userspace
>   * @std_bmap: Bitmap of standard secure service calls
>   * @std_hyp_bmap: Bitmap of standard hypervisor service calls
>   * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
>   */
>  struct kvm_smccc_features {
> +	unsigned long user_trap_bmap;

nit: I strongly object to the word 'trap'. By definition, this is a
trap. The difference here is that you *forward* something to userspace
instead of implementing it in the kernel.

>  	unsigned long std_bmap;
>  	unsigned long std_hyp_bmap;
>  	unsigned long vendor_hyp_bmap;
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index 316917b98707..07fa3f597e61 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -370,6 +370,21 @@ enum {
>  #endif
>  };
>  
> +enum {
> +	KVM_ARM_USER_HYPERCALL_OWNER_ARCH		= 0,
> +	KVM_ARM_USER_HYPERCALL_OWNER_CPU		= 1,
> +	KVM_ARM_USER_HYPERCALL_OWNER_SIP		= 2,
> +	KVM_ARM_USER_HYPERCALL_OWNER_OEM		= 3,
> +	KVM_ARM_USER_HYPERCALL_OWNER_STANDARD		= 4,
> +	KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP	= 5,
> +	KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP		= 6,
> +	KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP	= 7,
> +	KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS		= 8,
> +#ifdef __KERNEL__
> +	KVM_ARM_USER_HYPERCALL_FLAGS_COUNT,
> +#endif
> +};
> +
>  /* Device Control API: ARM VGIC */
>  #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
>  #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 6f0b56e7f8c7..6e8a222fc295 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -100,6 +100,13 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  		r = 0;
>  		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
>  		break;
> +	case KVM_CAP_ARM_USER_HYPERCALLS:
> +		if (cap->args[0] & ~KVM_ARM_USER_HYPERCALL_FLAGS)
> +			return -EINVAL;
> +
> +		r = 0;
> +		kvm->arch.smccc_feat.user_trap_bmap = cap->args[0];
> +		break;
>  	default:
>  		r = -EINVAL;
>  		break;
> @@ -285,6 +292,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_ARM_PTRAUTH_GENERIC:
>  		r = system_has_full_ptr_auth();
>  		break;
> +	case KVM_CAP_ARM_USER_HYPERCALLS:
> +		r = KVM_ARM_USER_HYPERCALL_FLAGS;
> +		break;
>  	default:
>  		r = 0;
>  	}
> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> index 62ce45d0d957..22a23b12201d 100644
> --- a/arch/arm64/kvm/hypercalls.c
> +++ b/arch/arm64/kvm/hypercalls.c
> @@ -92,6 +92,49 @@ static bool kvm_hvc_call_default_allowed(u32 func_id)
>  	}
>  }
>  
> +static bool kvm_hvc_call_user_trapped(struct kvm_vcpu *vcpu, u32 func_id)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long *bmap = &kvm->arch.smccc_feat.user_trap_bmap;
> +
> +	switch (ARM_SMCCC_OWNER_NUM(func_id)) {
> +	case ARM_SMCCC_OWNER_ARCH:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_ARCH, bmap);
> +	case ARM_SMCCC_OWNER_CPU:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_CPU, bmap);
> +	case ARM_SMCCC_OWNER_SIP:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_SIP, bmap);
> +	case ARM_SMCCC_OWNER_OEM:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_OEM, bmap);
> +	case ARM_SMCCC_OWNER_STANDARD:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD, bmap);
> +	case ARM_SMCCC_OWNER_STANDARD_HYP:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP, bmap);
> +	case ARM_SMCCC_OWNER_VENDOR_HYP:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP, bmap);
> +	case ARM_SMCCC_OWNER_TRUSTED_APP ... ARM_SMCCC_OWNER_TRUSTED_APP_END:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP, bmap);
> +	case ARM_SMCCC_OWNER_TRUSTED_OS ... ARM_SMCCC_OWNER_TRUSTED_OS_END:
> +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS, bmap);
> +	default:
> +		return false;
> +	}

You have multiple problems here:

- the granularity is way too coarse. You want to express arbitrary
  ranges, and not necessarily grab a whole owner range.

- you have now an overlap between ranges that are handled in the
  kernel (PSCI, spectre mitigations) and ranges that userspace wants
  to observe. Not good.

If we are going down this road, this can only be done at the
*function* level. And userspace must know that the kernel will refuse
to forward some ranges.

So obviously, this cannot be a simple bitmap. Making it a radix tree
(or an xarray, which is basically the same thing) could work. And the
filtering request from userspace can be similar to what we have for
the PMU filters.

> +}
> +
> +static void kvm_hvc_prepare_user_trap(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_run *run = vcpu->run;
> +
> +	run->exit_reason	= KVM_EXIT_HYPERCALL;
> +	run->hypercall.nr	= smccc_get_function(vcpu);
> +	run->hypercall.args[0]	= smccc_get_arg(vcpu, 1);
> +	run->hypercall.args[1]	= smccc_get_arg(vcpu, 2);
> +	run->hypercall.args[2]	= smccc_get_arg(vcpu, 3);
> +	run->hypercall.args[3]	= smccc_get_arg(vcpu, 4);
> +	run->hypercall.args[4]	= smccc_get_arg(vcpu, 5);
> +	run->hypercall.args[5]	= smccc_get_arg(vcpu, 6);

All of which is readily available through the ONE_REG interface. I'm
mildly reluctant to expose another interface that disclose the same
information (yes, I understand the performance impact).

Thanks,

	M.
  
Oliver Upton Nov. 10, 2022, 9:13 p.m. UTC | #2
On Thu, Nov 10, 2022 at 12:22:12PM +0000, Marc Zyngier wrote:
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index e33ed7c09a28..cc3872f1900c 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -52,6 +52,9 @@
> >  
> >  #define KVM_HAVE_MMU_RWLOCK
> >  
> > +#define KVM_ARM_USER_HYPERCALL_FLAGS	\
> > +		GENMASK_ULL(KVM_ARM_USER_HYPERCALL_FLAGS_COUNT - 1, 0)
> > +
> >  /*
> >   * Mode of operation configurable with kvm-arm.mode early param.
> >   * See Documentation/admin-guide/kernel-parameters.txt for more information.
> > @@ -104,11 +107,13 @@ struct kvm_arch_memory_slot {
> >  /**
> >   * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
> >   *
> > + * @user_trap_bmap: Bitmap of SMCCC function ranges trapped to userspace
> >   * @std_bmap: Bitmap of standard secure service calls
> >   * @std_hyp_bmap: Bitmap of standard hypervisor service calls
> >   * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
> >   */
> >  struct kvm_smccc_features {
> > +	unsigned long user_trap_bmap;
> 
> nit: I strongly object to the word 'trap'. By definition, this is a
> trap. The difference here is that you *forward* something to userspace
> instead of implementing it in the kernel.

I think you're being polite calling this a 'nit' :-)

Naming came about lazily to shorten some names, but completely breaks
the notion of what a trap is. Oops.

> > diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> > index 62ce45d0d957..22a23b12201d 100644
> > --- a/arch/arm64/kvm/hypercalls.c
> > +++ b/arch/arm64/kvm/hypercalls.c
> > @@ -92,6 +92,49 @@ static bool kvm_hvc_call_default_allowed(u32 func_id)
> >  	}
> >  }
> >  
> > +static bool kvm_hvc_call_user_trapped(struct kvm_vcpu *vcpu, u32 func_id)
> > +{
> > +	struct kvm *kvm = vcpu->kvm;
> > +	unsigned long *bmap = &kvm->arch.smccc_feat.user_trap_bmap;
> > +
> > +	switch (ARM_SMCCC_OWNER_NUM(func_id)) {
> > +	case ARM_SMCCC_OWNER_ARCH:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_ARCH, bmap);
> > +	case ARM_SMCCC_OWNER_CPU:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_CPU, bmap);
> > +	case ARM_SMCCC_OWNER_SIP:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_SIP, bmap);
> > +	case ARM_SMCCC_OWNER_OEM:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_OEM, bmap);
> > +	case ARM_SMCCC_OWNER_STANDARD:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD, bmap);
> > +	case ARM_SMCCC_OWNER_STANDARD_HYP:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP, bmap);
> > +	case ARM_SMCCC_OWNER_VENDOR_HYP:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP, bmap);
> > +	case ARM_SMCCC_OWNER_TRUSTED_APP ... ARM_SMCCC_OWNER_TRUSTED_APP_END:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP, bmap);
> > +	case ARM_SMCCC_OWNER_TRUSTED_OS ... ARM_SMCCC_OWNER_TRUSTED_OS_END:
> > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS, bmap);
> > +	default:
> > +		return false;
> > +	}
> 
> You have multiple problems here:
> 
> - the granularity is way too coarse. You want to express arbitrary
>   ranges, and not necessarily grab a whole owner range.
> 
> - you have now an overlap between ranges that are handled in the
>   kernel (PSCI, spectre mitigations) and ranges that userspace wants
>   to observe. Not good.

We need to come to agreement on what degree of mix-and-match should be
supported.

Spectre really ought to be in the kernel, and I don't think anyone is
particularly excited about reimplementing PSCI. Right now my interest
in this starts and ends with forwarding the vendor-specific hypercall
range to userspace, allowing something like Hyper-V PV on KVM.

> If we are going down this road, this can only be done at the
> *function* level. And userspace must know that the kernel will refuse
> to forward some ranges.

The goal of what I was trying to get at is that either the kernel or
userspace takes ownership of a range that has an ABI, but not both. i.e.
you really wouldn't want some VMM or cloud provider trapping portions of
KVM's vendor-specific range while still reporting a 'vanilla' ABI at the
time of discovery. Same goes for PSCI, TRNG, etc.

> So obviously, this cannot be a simple bitmap. Making it a radix tree
> (or an xarray, which is basically the same thing) could work. And the
> filtering request from userspace can be similar to what we have for
> the PMU filters.

Right, we'll need a more robust data structure for all this.

My only concern is that communicating the hypercall filter between
user/kernel with a set of ranges or function numbers is that we could be
mutating what KVM *doesn't* already implement into an ABI of sorts.

i.e. suppose that userspace wants to filter function(s) in an
unallocated/unused range of function numbers. Later down the line KVM
adds support for a new shiny thing and the filter becomes a subset of a
now allocated range of calls. We then reject the filter due to the
incongruence.

> > +}
> > +
> > +static void kvm_hvc_prepare_user_trap(struct kvm_vcpu *vcpu)
> > +{
> > +	struct kvm_run *run = vcpu->run;
> > +
> > +	run->exit_reason	= KVM_EXIT_HYPERCALL;
> > +	run->hypercall.nr	= smccc_get_function(vcpu);
> > +	run->hypercall.args[0]	= smccc_get_arg(vcpu, 1);
> > +	run->hypercall.args[1]	= smccc_get_arg(vcpu, 2);
> > +	run->hypercall.args[2]	= smccc_get_arg(vcpu, 3);
> > +	run->hypercall.args[3]	= smccc_get_arg(vcpu, 4);
> > +	run->hypercall.args[4]	= smccc_get_arg(vcpu, 5);
> > +	run->hypercall.args[5]	= smccc_get_arg(vcpu, 6);
> 
> All of which is readily available through the ONE_REG interface. I'm
> mildly reluctant to expose another interface that disclose the same
> information (yes, I understand the performance impact).

I can drop this bit for now, always easy to add it back in and advertize
with a flag if the overhead is too great.

--
Thanks,
Oliver
  
Marc Zyngier Nov. 11, 2022, 8:26 a.m. UTC | #3
On Thu, 10 Nov 2022 21:13:54 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Thu, Nov 10, 2022 at 12:22:12PM +0000, Marc Zyngier wrote:
> > > +static bool kvm_hvc_call_user_trapped(struct kvm_vcpu *vcpu, u32 func_id)
> > > +{
> > > +	struct kvm *kvm = vcpu->kvm;
> > > +	unsigned long *bmap = &kvm->arch.smccc_feat.user_trap_bmap;
> > > +
> > > +	switch (ARM_SMCCC_OWNER_NUM(func_id)) {
> > > +	case ARM_SMCCC_OWNER_ARCH:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_ARCH, bmap);
> > > +	case ARM_SMCCC_OWNER_CPU:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_CPU, bmap);
> > > +	case ARM_SMCCC_OWNER_SIP:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_SIP, bmap);
> > > +	case ARM_SMCCC_OWNER_OEM:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_OEM, bmap);
> > > +	case ARM_SMCCC_OWNER_STANDARD:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD, bmap);
> > > +	case ARM_SMCCC_OWNER_STANDARD_HYP:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP, bmap);
> > > +	case ARM_SMCCC_OWNER_VENDOR_HYP:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP, bmap);
> > > +	case ARM_SMCCC_OWNER_TRUSTED_APP ... ARM_SMCCC_OWNER_TRUSTED_APP_END:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP, bmap);
> > > +	case ARM_SMCCC_OWNER_TRUSTED_OS ... ARM_SMCCC_OWNER_TRUSTED_OS_END:
> > > +		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS, bmap);
> > > +	default:
> > > +		return false;
> > > +	}
> > 
> > You have multiple problems here:
> > 
> > - the granularity is way too coarse. You want to express arbitrary
> >   ranges, and not necessarily grab a whole owner range.
> > 
> > - you have now an overlap between ranges that are handled in the
> >   kernel (PSCI, spectre mitigations) and ranges that userspace wants
> >   to observe. Not good.
> 
> We need to come to agreement on what degree of mix-and-match should be
> supported.
> 
> Spectre really ought to be in the kernel, and I don't think anyone is
> particularly excited about reimplementing PSCI. Right now my interest
> in this starts and ends with forwarding the vendor-specific hypercall
> range to userspace, allowing something like Hyper-V PV on KVM.
> 
> > If we are going down this road, this can only be done at the
> > *function* level. And userspace must know that the kernel will refuse
> > to forward some ranges.
> 
> The goal of what I was trying to get at is that either the kernel or
> userspace takes ownership of a range that has an ABI, but not both. i.e.
> you really wouldn't want some VMM or cloud provider trapping portions of
> KVM's vendor-specific range while still reporting a 'vanilla' ABI at the
> time of discovery. Same goes for PSCI, TRNG, etc.

But I definitely think this is one of the major use cases. For
example, there is value in taking PSCI to userspace in order to
implement a newer version of the spec, or to support sub-features that
KVM doesn't (want to) implement. I don't think this changes the ABI
from the guest perspective.

pKVM also has a use case for this where userspace gets a notification
of the hypercall that a guest has performed to share memory.

Communication with a TEE also is on the cards, as would be a FFA
implementation. All of this could be implemented in KVM, or in
userspace, depending what users of these misfeatures want to do.

> 
> > So obviously, this cannot be a simple bitmap. Making it a radix tree
> > (or an xarray, which is basically the same thing) could work. And the
> > filtering request from userspace can be similar to what we have for
> > the PMU filters.
> 
> Right, we'll need a more robust data structure for all this.
> 
> My only concern is that communicating the hypercall filter between
> user/kernel with a set of ranges or function numbers is that we could be
> mutating what KVM *doesn't* already implement into an ABI of sorts.
> 
> i.e. suppose that userspace wants to filter function(s) in an
> unallocated/unused range of function numbers. Later down the line KVM
> adds support for a new shiny thing and the filter becomes a subset of a
> now allocated range of calls. We then reject the filter due to the
> incongruence.

But isn't the problem to ask for ranges that are unallocated the first
place? What semantic can userspace give to such a thing other than
replying "not implemented", which is what the kernel would do anyway?

The more interesting problem is when you want to emulate another
hypervisor, and that the vendor spaces overlap (a very likely
outcome). Somehow, this means overriding all the KVM-specific
hypercalls, and let userspace deal with it. But again, this can be
done on a per function basis.

Thanks,

	M.
  
Oliver Upton Nov. 11, 2022, 11:39 p.m. UTC | #4
On Fri, Nov 11, 2022 at 08:26:02AM +0000, Marc Zyngier wrote:
> On Thu, 10 Nov 2022 21:13:54 +0000, Oliver Upton <oliver.upton@linux.dev> wrote:
> > The goal of what I was trying to get at is that either the kernel or
> > userspace takes ownership of a range that has an ABI, but not both. i.e.
> > you really wouldn't want some VMM or cloud provider trapping portions of
> > KVM's vendor-specific range while still reporting a 'vanilla' ABI at the
> > time of discovery. Same goes for PSCI, TRNG, etc.
> 
> But I definitely think this is one of the major use cases. For
> example, there is value in taking PSCI to userspace in order to
> implement a newer version of the spec, or to support sub-features that
> KVM doesn't (want to) implement. I don't think this changes the ABI from
> the guest perspective.

I disagree for the implications of partially trapping the 'Vendor
Specific Hypervisor Service'. If the UID for the range still reports KVM
but userspace decided to add some new widget, then from the guest
perspective that widget is now part of KVM's own ABI with the guest.

Trapping the whole range is a bit of a hack to workaround the need for
more complicated verification of a hypercall filter.

But for everything else, I'm fine with arbitrary function filtering.
Userspace is always welcome to shoot itself in the foot.

> pKVM also has a use case for this where userspace gets a notification
> of the hypercall that a guest has performed to share memory.

Is that hypercall in the 'Vendor Specific Hypervisor Service' range?

> Communication with a TEE also is on the cards, as would be a FFA
> implementation. All of this could be implemented in KVM, or in
> userspace, depending what users of these misfeatures want to do.

I'm very hopeful that by forwarding all of this to userspace we can get
out of the business of implementing every darn spec that comes along.

> > > So obviously, this cannot be a simple bitmap. Making it a radix tree
> > > (or an xarray, which is basically the same thing) could work. And the
> > > filtering request from userspace can be similar to what we have for
> > > the PMU filters.
> > 
> > Right, we'll need a more robust data structure for all this.
> > 
> > My only concern is that communicating the hypercall filter between
> > user/kernel with a set of ranges or function numbers is that we could be
> > mutating what KVM *doesn't* already implement into an ABI of sorts.
> > 
> > i.e. suppose that userspace wants to filter function(s) in an
> > unallocated/unused range of function numbers. Later down the line KVM
> > adds support for a new shiny thing and the filter becomes a subset of a
> > now allocated range of calls. We then reject the filter due to the
> > incongruence.
> 
> But isn't the problem to ask for ranges that are unallocated the first
> place? What semantic can userspace give to such a thing other than
> replying "not implemented", which is what the kernel would do anyway?

... assuming we know about all defined services in the kernel. I don't
care about it too much, but there is the case about a new userspace on
an old kernel.

> The more interesting problem is when you want to emulate another
> hypervisor, and that the vendor spaces overlap (a very likely
> outcome). Somehow, this means overriding all the KVM-specific
> hypercalls, and let userspace deal with it. But again, this can be
> done on a per function basis.

Other hypercalls removed, would you be in favor of an all-or-nothing
rule for KVM's vendor range implementation? Of course, that could still
be enforced on whatever UAPI approach we take.

--
Thanks,
Oliver
  
Marc Zyngier Nov. 14, 2022, 11:36 a.m. UTC | #5
On Fri, 11 Nov 2022 23:39:09 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> On Fri, Nov 11, 2022 at 08:26:02AM +0000, Marc Zyngier wrote:
> > On Thu, 10 Nov 2022 21:13:54 +0000, Oliver Upton <oliver.upton@linux.dev> wrote:
> > > The goal of what I was trying to get at is that either the kernel or
> > > userspace takes ownership of a range that has an ABI, but not both. i.e.
> > > you really wouldn't want some VMM or cloud provider trapping portions of
> > > KVM's vendor-specific range while still reporting a 'vanilla' ABI at the
> > > time of discovery. Same goes for PSCI, TRNG, etc.
> > 
> > But I definitely think this is one of the major use cases. For
> > example, there is value in taking PSCI to userspace in order to
> > implement a newer version of the spec, or to support sub-features that
> > KVM doesn't (want to) implement. I don't think this changes the ABI from
> > the guest perspective.
> 
> I disagree for the implications of partially trapping the 'Vendor
> Specific Hypervisor Service'. If the UID for the range still reports KVM
> but userspace decided to add some new widget, then from the guest
> perspective that widget is now part of KVM's own ABI with the guest.

But that's what I mean by "I don't think this changes the ABI from the
guest perspective". The guest cannot know who is doing the emulation
anyway, so it is userspace's duty to preserve the illusion. At the
end of the day, this is only a configuration mechanism, and it is no
different from all other configuration bits (i.e. they need to be
identical on both side for migration).

> Trapping the whole range is a bit of a hack to workaround the need for
> more complicated verification of a hypercall filter.

We already need these things for architected hypercalls. Once we have
the infrastructure, it doesn't matter anymore which range this is for.

> 
> But for everything else, I'm fine with arbitrary function filtering.
> Userspace is always welcome to shoot itself in the foot.
> 
> > pKVM also has a use case for this where userspace gets a notification
> > of the hypercall that a guest has performed to share memory.
> 
> Is that hypercall in the 'Vendor Specific Hypervisor Service' range?

Yes. It is get another KVM hypercall.

> 
> > Communication with a TEE also is on the cards, as would be a FFA
> > implementation. All of this could be implemented in KVM, or in
> > userspace, depending what users of these misfeatures want to do.
> 
> I'm very hopeful that by forwarding all of this to userspace we can get
> out of the business of implementing every darn spec that comes along.

Good luck. All the TEEs have private, home grown APIs, and every
vendor will want to implement their own crap (i.e. there is no spec).

	M.
  
Will Deacon Nov. 18, 2022, 2:56 p.m. UTC | #6
Hey Oliver,

On Thu, Nov 10, 2022 at 01:53:26AM +0000, Oliver Upton wrote:
> As the SMCCC (and related specifications) march towards an
> 'everything and the kitchen sink' interface for interacting with a
> system, it is less likely that KVM will implement every supported
> feature.
> 
> Add a capability that allows userspace to trap hypercall ranges,
> allowing the VMM to mix-and-match between calls handled in userspace vs.
> KVM.
> 
> Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> ---
>  arch/arm64/include/asm/kvm_host.h |  5 ++++
>  arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
>  arch/arm64/kvm/arm.c              | 10 +++++++
>  arch/arm64/kvm/hypercalls.c       | 48 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |  1 +
>  5 files changed, 79 insertions(+)

[...]

> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 6f0b56e7f8c7..6e8a222fc295 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -100,6 +100,13 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  		r = 0;
>  		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
>  		break;
> +	case KVM_CAP_ARM_USER_HYPERCALLS:
> +		if (cap->args[0] & ~KVM_ARM_USER_HYPERCALL_FLAGS)
> +			return -EINVAL;

Why not use KVM_CAP_EXIT_HYPERCALL for this? At some point during pKVM
development, we used that to notify the VMM about memory being shared
back from the guest but we eventually dropped it as the notification to
userspace ended up not being needed:

https://android-kvm.googlesource.com/linux/+/dbd2861832dfc4c8a3103214b3c212ee7ace1c44%5E%21/
https://android-kvm.googlesource.com/linux/+/2a3afc6da99c0e0cb62be1687153ee572903aa80%5E%21/

I'm not saying that what we did was necessarily better, but it seems a bit
simpler and I figured it might be useful to point you to it.

Will
  
Oliver Upton Nov. 18, 2022, 5:04 p.m. UTC | #7
On Fri, Nov 18, 2022 at 02:56:38PM +0000, Will Deacon wrote:
> On Thu, Nov 10, 2022 at 01:53:26AM +0000, Oliver Upton wrote:
> > As the SMCCC (and related specifications) march towards an
> > 'everything and the kitchen sink' interface for interacting with a
> > system, it is less likely that KVM will implement every supported
> > feature.
> > 
> > Add a capability that allows userspace to trap hypercall ranges,
> > allowing the VMM to mix-and-match between calls handled in userspace vs.
> > KVM.
> > 
> > Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
> > ---
> >  arch/arm64/include/asm/kvm_host.h |  5 ++++
> >  arch/arm64/include/uapi/asm/kvm.h | 15 ++++++++++
> >  arch/arm64/kvm/arm.c              | 10 +++++++
> >  arch/arm64/kvm/hypercalls.c       | 48 +++++++++++++++++++++++++++++++
> >  include/uapi/linux/kvm.h          |  1 +
> >  5 files changed, 79 insertions(+)
> 
> [...]
> 
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 6f0b56e7f8c7..6e8a222fc295 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -100,6 +100,13 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >  		r = 0;
> >  		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
> >  		break;
> > +	case KVM_CAP_ARM_USER_HYPERCALLS:
> > +		if (cap->args[0] & ~KVM_ARM_USER_HYPERCALL_FLAGS)
> > +			return -EINVAL;
> 
> Why not use KVM_CAP_EXIT_HYPERCALL for this?

Err... I hilariously hijacked its UAPI for the exit but added a new cap
for it :)

I think the direction going forward will be to provide userspace with a
range-based filter such that (to a degree) we can arbitrarily forward
hypercalls to userspace, allowing for a mix-and-match approach.

> At some point during pKVM
> development, we used that to notify the VMM about memory being shared
> back from the guest but we eventually dropped it as the notification to
> userspace ended up not being needed:
> 
> https://android-kvm.googlesource.com/linux/+/dbd2861832dfc4c8a3103214b3c212ee7ace1c44%5E%21/
> https://android-kvm.googlesource.com/linux/+/2a3afc6da99c0e0cb62be1687153ee572903aa80%5E%21/
> 
> I'm not saying that what we did was necessarily better, but it seems a bit
> simpler and I figured it might be useful to point you to it.

Yeah, this is certainly a lot cleaner than what I've proposed here. And
frankly, for my immediate interest (forwarding vendor hypercalls to
userspace), this would fit the bill. OTOH, I was hoping that something
a bit more flexible could move the onus of implementing every darn spec
onto userspace (where possible).

I know you said pKVM has no need for userspace notifications at this
moment, but could user hypercalls be useful again going forward?

--
Thanks,
Oliver
  

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e33ed7c09a28..cc3872f1900c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -52,6 +52,9 @@ 
 
 #define KVM_HAVE_MMU_RWLOCK
 
+#define KVM_ARM_USER_HYPERCALL_FLAGS	\
+		GENMASK_ULL(KVM_ARM_USER_HYPERCALL_FLAGS_COUNT - 1, 0)
+
 /*
  * Mode of operation configurable with kvm-arm.mode early param.
  * See Documentation/admin-guide/kernel-parameters.txt for more information.
@@ -104,11 +107,13 @@  struct kvm_arch_memory_slot {
 /**
  * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
  *
+ * @user_trap_bmap: Bitmap of SMCCC function ranges trapped to userspace
  * @std_bmap: Bitmap of standard secure service calls
  * @std_hyp_bmap: Bitmap of standard hypervisor service calls
  * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
  */
 struct kvm_smccc_features {
+	unsigned long user_trap_bmap;
 	unsigned long std_bmap;
 	unsigned long std_hyp_bmap;
 	unsigned long vendor_hyp_bmap;
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 316917b98707..07fa3f597e61 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -370,6 +370,21 @@  enum {
 #endif
 };
 
+enum {
+	KVM_ARM_USER_HYPERCALL_OWNER_ARCH		= 0,
+	KVM_ARM_USER_HYPERCALL_OWNER_CPU		= 1,
+	KVM_ARM_USER_HYPERCALL_OWNER_SIP		= 2,
+	KVM_ARM_USER_HYPERCALL_OWNER_OEM		= 3,
+	KVM_ARM_USER_HYPERCALL_OWNER_STANDARD		= 4,
+	KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP	= 5,
+	KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP		= 6,
+	KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP	= 7,
+	KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS		= 8,
+#ifdef __KERNEL__
+	KVM_ARM_USER_HYPERCALL_FLAGS_COUNT,
+#endif
+};
+
 /* Device Control API: ARM VGIC */
 #define KVM_DEV_ARM_VGIC_GRP_ADDR	0
 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS	1
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 6f0b56e7f8c7..6e8a222fc295 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -100,6 +100,13 @@  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		r = 0;
 		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
 		break;
+	case KVM_CAP_ARM_USER_HYPERCALLS:
+		if (cap->args[0] & ~KVM_ARM_USER_HYPERCALL_FLAGS)
+			return -EINVAL;
+
+		r = 0;
+		kvm->arch.smccc_feat.user_trap_bmap = cap->args[0];
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -285,6 +292,9 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_PTRAUTH_GENERIC:
 		r = system_has_full_ptr_auth();
 		break;
+	case KVM_CAP_ARM_USER_HYPERCALLS:
+		r = KVM_ARM_USER_HYPERCALL_FLAGS;
+		break;
 	default:
 		r = 0;
 	}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 62ce45d0d957..22a23b12201d 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -92,6 +92,49 @@  static bool kvm_hvc_call_default_allowed(u32 func_id)
 	}
 }
 
+static bool kvm_hvc_call_user_trapped(struct kvm_vcpu *vcpu, u32 func_id)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *bmap = &kvm->arch.smccc_feat.user_trap_bmap;
+
+	switch (ARM_SMCCC_OWNER_NUM(func_id)) {
+	case ARM_SMCCC_OWNER_ARCH:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_ARCH, bmap);
+	case ARM_SMCCC_OWNER_CPU:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_CPU, bmap);
+	case ARM_SMCCC_OWNER_SIP:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_SIP, bmap);
+	case ARM_SMCCC_OWNER_OEM:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_OEM, bmap);
+	case ARM_SMCCC_OWNER_STANDARD:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD, bmap);
+	case ARM_SMCCC_OWNER_STANDARD_HYP:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_STANDARD_HYP, bmap);
+	case ARM_SMCCC_OWNER_VENDOR_HYP:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_VENDOR_HYP, bmap);
+	case ARM_SMCCC_OWNER_TRUSTED_APP ... ARM_SMCCC_OWNER_TRUSTED_APP_END:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_APP, bmap);
+	case ARM_SMCCC_OWNER_TRUSTED_OS ... ARM_SMCCC_OWNER_TRUSTED_OS_END:
+		return test_bit(KVM_ARM_USER_HYPERCALL_OWNER_TRUSTED_OS, bmap);
+	default:
+		return false;
+	}
+}
+
+static void kvm_hvc_prepare_user_trap(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	run->exit_reason	= KVM_EXIT_HYPERCALL;
+	run->hypercall.nr	= smccc_get_function(vcpu);
+	run->hypercall.args[0]	= smccc_get_arg(vcpu, 1);
+	run->hypercall.args[1]	= smccc_get_arg(vcpu, 2);
+	run->hypercall.args[2]	= smccc_get_arg(vcpu, 3);
+	run->hypercall.args[3]	= smccc_get_arg(vcpu, 4);
+	run->hypercall.args[4]	= smccc_get_arg(vcpu, 5);
+	run->hypercall.args[5]	= smccc_get_arg(vcpu, 6);
+}
+
 static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id)
 {
 	struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
@@ -128,6 +171,11 @@  int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	u32 feature;
 	gpa_t gpa;
 
+	if (kvm_hvc_call_user_trapped(vcpu, func_id)) {
+		kvm_hvc_prepare_user_trap(vcpu);
+		return 0;
+	}
+
 	if (!kvm_hvc_call_allowed(vcpu, func_id))
 		goto out;
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0d5d4419139a..ea6c1983a545 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1178,6 +1178,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
+#define KVM_CAP_ARM_USER_HYPERCALLS 224
 
 #ifdef KVM_CAP_IRQ_ROUTING