[03/16] KVM: x86: set gfn-to-pfn cache length consistently with VM word size

Message ID 20221027161849.2989332-4-pbonzini@redhat.com
State New
Series KVM: x86: Always use non-compat vcpu_runstate_info size for gfn=>pfn cache

Commit Message

Paolo Bonzini Oct. 27, 2022, 4:18 p.m. UTC
  KVM unconditionally uses the "full" 64-bit size of the vcpu_runstate_info
struct when activating the runstate cache in kvm_xen_vcpu_set_attr(), but
sizing the cache according to the VM's current word size matches what Xen
does.  While KVM did always use the 64-bit size when activating the cache,
what matters is that Xen does not look beyond the size of the 32-bit struct
if the vCPU was initialized in 32-bit mode.  If the guest sets up the
runstate info of a 32-bit VM so that the struct ends at the end of a page,
the 64-bit struct size passed to kvm_gpc_activate() will cause the ioctl or
hypercall to fail, because gfn-to-pfn caches can only be set up for data
that fits in a single page.
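
(For reference, the single-page restriction comes from the generic gfn-to-pfn
cache code; the refresh path rejects any range that crosses a page along the
lines of the following simplified sketch, which is not the exact upstream code:)

	page_offset = gpa & ~PAGE_MASK;
	/* The cached range must fit within a single page. */
	if (page_offset + len > PAGE_SIZE)
		return -EINVAL;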

Nevertheless, keeping the Xen word size constant throughout the life
of the gpc cache, i.e. not using a different size at check()+refresh()
than at activate(), is desirable because it makes the length/size of
the cache immutable.  This in turn yields a cleaner set of APIs and
avoids potential bugs that could occur if check() were invoked with
a different size than refresh().

So, use the short size at activation time as well.  This means
re-activating the cache if the guest requests the hypercall page
multiple times with different word sizes (this can happen when
kexec-ing, for example).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/xen.c | 47 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)
  

Comments

Sean Christopherson Oct. 27, 2022, 5:22 p.m. UTC | #1
On Thu, Oct 27, 2022, Paolo Bonzini wrote:
> So, use the short size at activation time as well.  This means
> re-activating the cache if the guest requests the hypercall page
> multiple times with different word sizes (this can happen when
> kexec-ing, for example).

I don't understand the motivation for allowing a conditionally valid GPA.  I see
a lot of complexity and sub-optimal error handling for a use case that no one
cares about.  Realistically, userspace is never going to provide a GPA that only
works some of the time, because doing otherwise is just asking for a dead guest.

> +static int kvm_xen_reactivate_runstate_gpcs(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu;
> +	unsigned long i;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		if (vcpu->arch.xen.runstate_cache.active) {

This is not safe when called from kvm_xen_write_hypercall_page(), which doesn't
acquire kvm->lock and thus doesn't guard against a concurrent call via
kvm_xen_vcpu_set_attr().  That's likely a bug in itself, but even if that issue
is fixed, I don't see how this yields a better uAPI than forcing userspace to
provide an address that is valid for all modes.

If the address becomes bad when the guest changes the hypercall page, the guest
is completely hosed through no fault of its own, whereas limiting the misalignment
detection to KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR means that any "bad" address
will result in immediate failure, i.e. makes it so that errors are 100% userspace
misconfiguration bugs.

> +			int r = kvm_xen_activate_runstate_gpc(vcpu,
> +					vcpu->arch.xen.runstate_cache.gpa);
> +			if (r < 0)

Returning immediately is wrong, as later vCPUs will have a valid, active cache
that hasn't been verified for 64-bit mode.

> +				return r;
> +		}
> +	}
> +	return 0;
> +}
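
For illustration, one way to avoid the early return is to keep iterating over
all vCPUs and deactivate any cache that can no longer be re-activated.  This
is only a sketch, not part of the posted series; kvm_xen_activate_runstate_gpc()
is the helper introduced by the patch:

	static int kvm_xen_reactivate_runstate_gpcs(struct kvm *kvm)
	{
		struct kvm_vcpu *vcpu;
		unsigned long i;
		int ret = 0;

		kvm_for_each_vcpu(i, vcpu, kvm) {
			struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.runstate_cache;

			if (!gpc->active)
				continue;

			/* Keep going so every vCPU is re-checked, not just the first. */
			if (kvm_xen_activate_runstate_gpc(vcpu, gpc->gpa) < 0) {
				kvm_gpc_deactivate(kvm, gpc);
				ret = -EINVAL;
			}
		}
		return ret;
	}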
> +
>  void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
>  {
>  	struct kvm_vcpu_xen *vx = &v->arch.xen;
> @@ -212,11 +243,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
>  	if (!vx->runstate_cache.active)
>  		return;
>  
> -	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
> -		user_len = sizeof(struct vcpu_runstate_info);
> -	else
> -		user_len = sizeof(struct compat_vcpu_runstate_info);
> -
> +	user_len = kvm_xen_runstate_info_size(v->kvm);
>  	read_lock_irqsave(&gpc->lock, flags);
>  	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
>  					   user_len)) {
> @@ -461,7 +488,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
>  			mutex_lock(&kvm->lock);
>  			kvm->arch.xen.long_mode = !!data->u.long_mode;
>  			mutex_unlock(&kvm->lock);
> -			r = 0;
> +			r = kvm_xen_reactivate_runstate_gpcs(kvm);

Needs to be called under kvm->lock.  This path also needs to acquire kvm->srcu.
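
For illustration, the locking being described would look roughly like this in
the KVM_XEN_ATTR_TYPE_LONG_MODE handler (exact placement is an assumption):

	int idx, r;

	mutex_lock(&kvm->lock);
	kvm->arch.xen.long_mode = !!data->u.long_mode;

	idx = srcu_read_lock(&kvm->srcu);
	r = kvm_xen_reactivate_runstate_gpcs(kvm);
	srcu_read_unlock(&kvm->srcu, idx);

	mutex_unlock(&kvm->lock);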

>  		}
>  		break;
>  
> @@ -596,9 +623,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
>  			break;
>  		}
>  
> -		r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
> -				     NULL, KVM_HOST_USES_PFN, data->u.gpa,
> -				     sizeof(struct vcpu_runstate_info));
> +		r = kvm_xen_activate_runstate_gpc(vcpu, data->u.gpa);
>  		break;
>  
>  	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
> @@ -843,9 +868,13 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
>  	u32 page_num = data & ~PAGE_MASK;
>  	u64 page_addr = data & PAGE_MASK;
>  	bool lm = is_long_mode(vcpu);
> +	int r;
>  
>  	/* Latch long_mode for shared_info pages etc. */
>  	vcpu->kvm->arch.xen.long_mode = lm;
> +	r = kvm_xen_reactivate_runstate_gpcs(kvm);
> +	if (r < 0)
> +		return 1;

Aren't we just making up behavior at this point?  Injecting a #GP into the guest
for what is a completely legal operation from the guest's perspective seems all
kinds of wrong.
  
Paolo Bonzini Oct. 28, 2022, 11:36 a.m. UTC | #2
On 10/27/22 19:22, Sean Christopherson wrote:
> On Thu, Oct 27, 2022, Paolo Bonzini wrote:
>> So, use the short size at activation time as well.  This means
>> re-activating the cache if the guest requests the hypercall page
>> multiple times with different word sizes (this can happen when
>> kexec-ing, for example).
> 
> I don't understand the motivation for allowing a conditionally valid GPA.  I see
> a lot of complexity and sub-optimal error handling for a use case that no one
> cares about.  Realistically, userspace is never going to provide a GPA that only
> works some of the time, because doing otherwise is just asking for a dead guest.

We _should_ be following the Xen API, which does not even say that the
areas have to fit in a single page.  In fact, even Linux's

         struct vcpu_register_runstate_memory_area area;

         area.addr.v = &per_cpu(xen_runstate, cpu);
         if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                                xen_vcpu_nr(cpu), &area))

could fail or not just depending on the linker's whims, if I'm not
very confused.

Other data structures *do* have to fit in a page, but the runstate area
does not and it's exactly the one where the cache comes the most handy.
For this I'm going to wait for David to answer.

That said, the whole gpc API is really messy and needs to be cleaned
up beyond what this series does.  For example,

         read_lock_irqsave(&gpc->lock, flags);
         while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
                                            sizeof(x))) {
                 read_unlock_irqrestore(&gpc->lock, flags);

                 if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, sizeof(x)))
                         return;

                 read_lock_irqsave(&gpc->lock, flags);
         }
	...
         read_unlock_irqrestore(&gpc->lock, flags);

should really be simplified to

	khva = kvm_gpc_lock(gpc);
	if (IS_ERR(khva))
		return;
	...
	kvm_gpc_unlock(gpc);

Only the special preempt-notifier cases would have to use check/refresh
by hand.  If needed they could even pass whatever length they want to
__kvm_gpc_refresh with, explicitly marking that it's a here-be-dragons __API.
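
For illustration, kvm_gpc_lock()/kvm_gpc_unlock() could be layered on the
existing check/refresh pair along these lines.  The gpc->len field is an
assumption standing in for the length that would become immutable at
activation time, and the signatures are only a sketch:

	static void *kvm_gpc_lock(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
				  unsigned long *flags)
	{
		read_lock_irqsave(&gpc->lock, *flags);
		while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, gpc->len)) {
			read_unlock_irqrestore(&gpc->lock, *flags);

			if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, gpc->len))
				return ERR_PTR(-EFAULT);

			read_lock_irqsave(&gpc->lock, *flags);
		}
		/* Return the kernel mapping of the cached guest data. */
		return gpc->khva;
	}

	static void kvm_gpc_unlock(struct gfn_to_pfn_cache *gpc, unsigned long flags)
	{
		read_unlock_irqrestore(&gpc->lock, flags);
	}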

Also, because we're using the gpc from non-preemptible regions, the rwlock
critical sections should be enclosed in preempt_disable()/preempt_enable().
Fortunately they're pretty small.

For now I think the best course of action is to quickly get the bugfix
patches to Linus, and for 6.2 drop this one but otherwise keep the length
in kvm_gpc_activate().

> Aren't we just making up behavior at this point?  Injecting a #GP into the guest
> for what is a completely legal operation from the guest's perspective seems all
> kinds of wrong.

Yeah, it is and this patch was a steaming pile...  Though, while this
one is particularly egregious because it comes from an MSR write, a lot
of error returns in xen.c are inviting userspace to make up error
conditions that aren't there in Xen.  In fact activate should probably
ignore a refresh EFAULT altogether.

Paolo
  
Sean Christopherson Oct. 28, 2022, 4:31 p.m. UTC | #3
On Fri, Oct 28, 2022, Paolo Bonzini wrote:
> On 10/27/22 19:22, Sean Christopherson wrote:
> > On Thu, Oct 27, 2022, Paolo Bonzini wrote:
> > > So, use the short size at activation time as well.  This means
> > > re-activating the cache if the guest requests the hypercall page
> > > multiple times with different word sizes (this can happen when
> > > kexec-ing, for example).
> > 
> > I don't understand the motivation for allowing a conditionally valid GPA.  I see
> > a lot of complexity and sub-optimal error handling for a use case that no one
> > cares about.  Realistically, userspace is never going to provide a GPA that only
> > works some of the time, because doing otherwise is just asking for a dead guest.
> 
> We _should_ be following the Xen API, which does not even say that the
> areas have to fit in a single page.

Ah, I didn't realize these are hypercall => userspace => ioctl() paths.

> In fact, even Linux's
> 
>         struct vcpu_register_runstate_memory_area area;
> 
>         area.addr.v = &per_cpu(xen_runstate, cpu);
>         if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
>                                xen_vcpu_nr(cpu), &area))
> 
> could fail or not just depending on the linker's whims, if I'm not
> very confused.
> 
> Other data structures *do* have to fit in a page, but the runstate area
> does not and it's exactly the one where the cache comes the most handy.
> For this I'm going to wait for David to answer.
> 
> That said, the whole gpc API is really messy 

No argument there.

> and needs to be cleaned up beyond what this series does.  For example,
> 
>         read_lock_irqsave(&gpc->lock, flags);
>         while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
>                                            sizeof(x))) {
>                 read_unlock_irqrestore(&gpc->lock, flags);
> 
>                 if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, sizeof(x)))
>                         return;
> 
>                 read_lock_irqsave(&gpc->lock, flags);
>         }
> 	...
>         read_unlock_irqrestore(&gpc->lock, flags);
> 
> should really be simplified to
> 
> 	khva = kvm_gpc_lock(gpc);
> 	if (IS_ERR(khva))
> 		return;
> 	...
> 	kvm_gpc_unlock(gpc);
> 
> Only the special preempt-notifier cases would have to use check/refresh
> by hand.  If needed they could even pass whatever length they want to
> __kvm_gpc_refresh with, explicit marking that it's a here-be-dragons __API.
> 
> Also because we're using the gpc from non-preemptible regions the rwlock
> critical sections should be enclosed in preempt_disable()/preempt_enable().
> Fortunately they're pretty small.
> 
> For now I think the best course of action is to quickly get the bugfix
> patches to Linus, and for 6.2 drop this one but otherwise keep the length
> in kvm_gpc_activate().

Works for me.
  
David Woodhouse Nov. 13, 2022, 1:32 p.m. UTC | #4
On Fri, 2022-10-28 at 13:36 +0200, Paolo Bonzini wrote:
> We _should_ be following the Xen API, which does not even say that the
> areas have to fit in a single page.  In fact, even Linux's
> 
>          struct vcpu_register_runstate_memory_area area;
> 
>          area.addr.v = &per_cpu(xen_runstate, cpu);
>          if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
>                                 xen_vcpu_nr(cpu), &area))
> 
> could fail or not just depending on the linker's whims, if I'm not
> very confused.
> 
> Other data structures *do* have to fit in a page, but the runstate area
> does not and it's exactly the one where the cache comes the most handy.
> For this I'm going to wait for David to answer.

Yeah, I recall vetting a bunch of these to ensure that it's safe to
assume they fit within a single page... but that clearly isn't true
for the runstate area.

As things stand, I believe a guest is perfectly entitled to provide a
region which crosses a page boundary, and Xen copes with that. But as
you say, KVM doesn't.

However, I don't think this *is* the case where the cache comes in the
most handy. The cache is really useful where we have to do *atomic*
operations on guest addresses, and doing so directly is a whole lot
nicer than futex-style try-it-and-fail-gracefully operations on
userspace addresses.

For the runstate area, I think we can live with using a gfn_to_hva
cache instead, and writing via the userspace address (with appropriate
atomicity for the RUNSTATE_runnable case as we have at the moment
gating the refresh).
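
For illustration, that would roughly mean going back to the gfn_to_hva helpers;
the runstate_ghc field and the local 'rs' copy below are made up for the sketch,
and error handling is elided:

	/* Registration, e.g. from KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: */
	r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_ghc,
				      data->u.gpa, user_len);

	/* Update path: the cached write revalidates the userspace address and
	 * falls back to kvm_write_guest() if the area crosses a page. */
	if (kvm_write_guest_cached(v->kvm, &vx->runstate_ghc, &rs, user_len))
		return;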
  
David Woodhouse Nov. 13, 2022, 1:36 p.m. UTC | #5
On Sun, 2022-11-13 at 13:32 +0000, David Woodhouse wrote:
> For the runstate area, I think we can live with using a gfn_to_hva
> cache instead, and writing via the userspace address (with appropriate
> atomicity for the RUNSTATE_runnable case as we have at the moment
> gating the refresh).

Which mostly involves just reverting commit a795cd43c5b5 I think?

IIRC the reason for that commit was mostly consistency with other
things that really *did* want to be switched to gpc.
  
Durrant, Paul Nov. 14, 2022, 11:27 a.m. UTC | #6
On Sun, 2022-11-13 at 13:36 +0000, David Woodhouse wrote:
> On Sun, 2022-11-13 at 13:32 +0000, David Woodhouse wrote:
> > For the runstate area, I think we can live with using a gfn_to_hva
> > cache instead, and writing via the userspace address (with appropriate
> > atomicity for the RUNSTATE_runnable case as we have at the moment
> > gating the refresh).
> 
> Which mostly involves just reverting commit a795cd43c5b5 I think?
> 
> IIRC the reason for that commit was mostly consistency with other
> things that really *did* want to be switched to gpc.

A straight reversion would re-introduce the page-spanning check in kvm_xen_vcpu_set_attr() but I think we can just leave that out.

  Paul
  
Sean Christopherson Nov. 14, 2022, 4:33 p.m. UTC | #7
On Mon, Nov 14, 2022, Woodhouse, David wrote:
> Most other data structures, including the pvclock info (both Xen and
> native KVM), could potentially cross page boundaries. And isn't that
> also true for things that we'd want to use the GPC for in nesting?

Off the top of my head, no.  Except for MSR and I/O permission bitmaps, which
are >4KiB, things that are referenced by physical address are <=4KiB and must be
naturally aligned.  nVMX does temporarily map L1's MSR bitmap, but that could be
split into two separate mappings if necessary.

> For the runstate info I suggested reverting commit a795cd43c5b5 but
> that doesn't actually work because it still has the same problem. Even
> the gfn-to-hva cache still only really works for a single page, and
> things like kvm_write_guest_offset_cached() will fall back to using
> kvm_write_guest() in the case where it crosses a page boundary.
> 
> I'm wondering if the better fix is to allow the GPC to map more than
> one page.

I agree that KVM should drop the "no page splits" restriction, but I don't think
that would necessarily solve all KVM Xen issues.  KVM still needs to precisely
handle the "correct" struct size, e.g. if one of the structs is placed at the very
end of the page such that the smaller compat version doesn't split a page but the
64-bit version does.
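
A concrete example of that corner case, assuming the usual layouts
(sizeof(struct vcpu_runstate_info) == 48 on 64-bit, 44 for the packed compat
version), with the runstate area registered at page offset 0xfd4:

	0xfd4 + 44 = 0x1000  ->  compat struct ends exactly at the page boundary
	0xfd4 + 48 = 0x1004  ->  64-bit struct spills 4 bytes onto the next page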
  
Paolo Bonzini Nov. 14, 2022, 4:58 p.m. UTC | #8
On 11/14/22 15:53, Woodhouse, David wrote:
> Most other data structures, including the pvclock info (both Xen and
> native KVM), could potentially cross page boundaries. And isn't that
> also true for things that we'd want to use the GPC for in nesting?

Yes, for kvmclock we likely got away with it because Linux page-aligns 
it (and has been since 2013: commit ed55705dd, originally done for 
vsyscall support).  I have checked OpenBSD and FreeBSD and I think they 
do as well.

I am very very tempted to remove support for "old-style" kvmclock MSRs 
and retroactively declare new-style MSRs to accept only 32-byte aligned 
addresses.  However that doesn't solve the problem.

Paolo
  
David Woodhouse Nov. 14, 2022, 5:36 p.m. UTC | #9
On Mon, 2022-11-14 at 16:33 +0000, Sean Christopherson wrote:
> On Mon, Nov 14, 2022, Woodhouse, David wrote:
> > Most other data structures, including the pvclock info (both Xen and
> > native KVM), could potentially cross page boundaries. And isn't that
> > also true for things that we'd want to use the GPC for in nesting?
> 
> Off the top of my head, no.  Except for MSR and I/O permission bitmaps, which
> are >4KiB, things that are referenced by physical address are <=4KiB and must be
> naturally aligned.  nVMX does temporarily map L1's MSR bitmap, but that could be
> split into two separate mappings if necessary.
> 
> > For the runstate info I suggested reverting commit a795cd43c5b5 but
> > that doesn't actually work because it still has the same problem. Even
> > the gfn-to-hva cache still only really works for a single page, and
> > things like kvm_write_guest_offset_cached() will fall back to using
> > kvm_write_guest() in the case where it crosses a page boundary.
> > 
> > I'm wondering if the better fix is to allow the GPC to map more than
> > one page.
> 
> I agree that KVM should drop the "no page splits" restriction, but I don't think
> that would necessarily solve all KVM Xen issues.  KVM still needs to precisely
> handle the "correct" struct size, e.g. if one of the structs is placed at the very
> end of the page such that the smaller compat version doesn't split a page but the
> 64-bit version does.

I think we can be more explicit that the guest 'long' mode shall never
change while anything is mapped. Xen automatically detects that a guest
is in 64-bit mode very early on, either in the first 'fill the
hypercall page' MSR write, or when setting HVM_PARAM_CALLBACK_IRQ to
configure interrupt routing.

Strictly speaking, a guest could put itself into 32-bit mode and set
HVM_PARAM_CALLBACK_IRQ *again*. Xen would only update the wallclock
time in that case, and makes no attempt to convert anything else. I
don't think we need to replicate that.

On kexec/soft reset it could go back to 32-bit mode, but the soft reset
unmaps everything so that's OK.

I looked at making the GPC handle multiple pages but can't see how to
sanely do it for the IOMEM case. vmap() takes a list of *pages* not
PFNs, and memremap_pages() is... overly complex.

But if we can reduce it to *just* the runstate info that potentially
needs >1 page, then we can probably handle that with using two GPC (or
maybe GHC) caches for it.
  
David Woodhouse Nov. 15, 2022, 12:16 a.m. UTC | #10
On Mon, 2022-11-14 at 11:27 +0000, Durrant, Paul wrote:
> > On Sun, 2022-11-13 at 13:32 +0000, David Woodhouse wrote:
> > > For the runstate area, I think we can live with using a gfn_to_hva
> > > cache instead, and writing via the userspace address (with appropriate
> > > atomicity for the RUNSTATE_runnable case as we have at the moment
> > > gating the refresh).
> > 
> > Which mostly involves just reverting commit a795cd43c5b5 I think?
> > 
> > IIRC the reason for that commit was mostly consistency with other
> > things that really *did* want to be switched to gpc.
> 
> A straight reversion would re-introduce the page-spanning check in
> kvm_xen_vcpu_set_attr() but I think we can just leave that out.

That check matches the behaviour of kvm_gfn_to_hva_cache_init() which
deliberately clears ghc->memslot if it crosses a page. (Although I
don't see why; it only really needs to do that if it crosses over onto
a second *memslot*, surely?)

So we'd then hit this in kvm_xen_update_runstate_guest():

       /* We made sure it fits in a single page */
       BUG_ON(!ghc->memslot);

We'd need a fallback code path for the page-crossing (or memslot-
crossing) case, and the natural way to do that is to just use
kvm_write_guest(). In the RUNSTATE_runnable case where we can't sleep,
can we get away with using pagefault_disable() around a call to
kvm_write_guest()? 
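
For reference, the pattern being asked about would look roughly like this
(variable names are illustrative):

	/* With page faults disabled, the copy_to_user() inside
	 * kvm_write_guest() fails with -EFAULT instead of sleeping, so the
	 * update is simply skipped in the RUNSTATE_runnable case. */
	pagefault_disable();
	ret = kvm_write_guest(v->kvm, gpa, &rs, user_len);
	pagefault_enable();
	if (ret)
		return;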

The worst case here is kind of awful; it's a compat guest with the
first half of its ->state_entry_time being on the first page, and the
second half of it being on the second page.

I'm playing with using a second GPC for the overrun onto the second
page. Debating if it's already too ugly to live before I even fix up
the actual copying part...

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81114a376c4e..3fc08f416aa3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -647,6 +647,7 @@ struct kvm_vcpu_xen {
 	struct gfn_to_pfn_cache vcpu_info_cache;
 	struct gfn_to_pfn_cache vcpu_time_info_cache;
 	struct gfn_to_pfn_cache runstate_cache;
+	struct gfn_to_pfn_cache runstate2_cache;
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b8e9628fbf5..e297569bbc8d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -201,35 +201,59 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
-	struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
+	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
 	uint64_t *user_times;
 	unsigned long flags;
-	size_t user_len;
+	size_t user_len1, user_len2 = 0;
 	int *user_state;
 
 	kvm_xen_update_runstate(v, state);
 
-	if (!vx->runstate_cache.active)
+	if (!gpc1->active)
 		return;
 
 	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-		user_len = sizeof(struct vcpu_runstate_info);
+		user_len1 = sizeof(struct vcpu_runstate_info);
 	else
-		user_len = sizeof(struct compat_vcpu_runstate_info);
+		user_len1 = sizeof(struct compat_vcpu_runstate_info);
 
-	read_lock_irqsave(&gpc->lock, flags);
-	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
-					   user_len)) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+	if ((gpc1->gpa & ~PAGE_MASK) + user_len1 >= PAGE_SIZE) {
+		user_len2 = user_len1;
+		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+		user_len2 -= user_len1;
+	}
+
+ retry:
+	read_lock_irqsave(&gpc1->lock, flags);
+	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa,
+					   user_len1)) {
+		read_unlock_irqrestore(&gpc1->lock, flags);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (state == RUNSTATE_runnable)
 			return;
 
-		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(&gpc1->lock, flags);
+	}
+	if (user_len2) {
+		read_lock(&gpc2->lock);
+		if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+			read_unlock(&gpc2->lock);
+			read_unlock_irqrestore(&gpc1->lock, flags);
+
+			if (state == RUNSTATE_runnable)
+				return;
+
+			if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc2,
+							 gpc2->gpa, user_len2))
+				return;
+
+			goto retry;
+		}
 	}
 
 	/*
@@ -584,23 +608,52 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		break;
 
-	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+		size_t sz;
+
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
 			break;
 		}
 		if (data->u.gpa == GPA_INVALID) {
+			r = 0;
+		deactivate_out:
 			kvm_gpc_deactivate(vcpu->kvm,
 					   &vcpu->arch.xen.runstate_cache);
-			r = 0;
+		deactivate2_out:
+			kvm_gpc_deactivate(vcpu->kvm,
+					   &vcpu->arch.xen.runstate2_cache);
 			break;
 		}
 
-		r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
-				     NULL, KVM_HOST_USES_PFN, data->u.gpa,
-				     sizeof(struct vcpu_runstate_info));
+		if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+			sz = sizeof(struct vcpu_runstate_info);
+		else
+			sz = sizeof(struct compat_vcpu_runstate_info);
+
+		/* Handle structures which cross a page boundary by using two GPCs */
+		if ((data->u.gpa & ~PAGE_MASK) + sz <= PAGE_SIZE) {
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     sizeof(struct vcpu_runstate_info));
+			goto deactivate2_out;
+		} else {
+			/* Map the end of the first page... */
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     PAGE_SIZE - (data->u.gpa & ~PAGE_MASK));
+			if (r)
+				goto deactivate2_out;
+			/* ... and the start of the second. */
+			sz -= PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+					     NULL, KVM_HOST_USES_PFN,
+					     (data->u.gpa + PAGE_SIZE) & PAGE_MASK, sz);
+			if (r)
+				goto deactivate_out;
+		}
 		break;
-
+	}
 	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
@@ -1834,6 +1887,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 
 	kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
 }
@@ -1844,6 +1898,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 		kvm_xen_stop_timer(vcpu);
 
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
  
David Woodhouse Nov. 16, 2022, 10:49 p.m. UTC | #11
On Mon, 2022-11-14 at 16:16 -0800, David Woodhouse wrote:
> 
> I'm playing with using a second GPC for the overrun onto the second
> page. Debating if it's already too ugly to live before I even fix up
> the actual copying part...

Well it certainly didn't get any *prettier*. Utterly untested other
than building it, so it's certainly going to be broken, but as an
illustration.

I can't see a sane way to get the two pages vmapped consecutively,
given that they might be IOMEM. So I can't see how to make a single GPC
do this "nicely", and I think we have to declare that the runstate area
is the only case that actually needs this, then do it this way as a
special case... even though it's fugly?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81114a376c4e..3fc08f416aa3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -647,6 +647,7 @@ struct kvm_vcpu_xen {
 	struct gfn_to_pfn_cache vcpu_info_cache;
 	struct gfn_to_pfn_cache vcpu_time_info_cache;
 	struct gfn_to_pfn_cache runstate_cache;
+	struct gfn_to_pfn_cache runstate2_cache;
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b8e9628fbf5..14ba45b541bf 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -198,38 +198,101 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 	vx->runstate_entry_time = now;
 }
 
+/*
+ * The guest region is arbitrarily aligned, and could be split across
+ * two pages.
+ *
+ * d1: Pointer to kernel map of first byte of region.
+ * d2: Pointer to kernel map of first byte of second page.
+ * l1: length of first range [ == PAGE_SIZE - (d1 & ~PAGE_MASK) ]
+ * src: Source pointer.
+ * len: Source length to be copied.
+ * dst_ofs: Destination offset within the guest region.
+ */
+static inline void memcpy_to_runstate(void *d1, void *d2, size_t l1,
+				      void *src, size_t len, size_t dst_ofs)
+{
+	size_t copylen;
+
+	if (dst_ofs < l1) {
+		copylen = min(l1 - dst_ofs, len);
+		memcpy(d1 + dst_ofs, src, copylen);
+		if (copylen == len)
+			return;
+
+		src += copylen;
+		dst_ofs += copylen;
+		len -= copylen;
+	}
+
+	BUG_ON(dst_ofs < l1);
+	memcpy(d2 + dst_ofs - l1, src, len);
+}
+
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
-	struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
-	uint64_t *user_times;
+	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
 	unsigned long flags;
-	size_t user_len;
-	int *user_state;
+	size_t user_len, user_len1, user_len2;
+	size_t times_ofs;
+	u8 *update_bit;
 
 	kvm_xen_update_runstate(v, state);
 
-	if (!vx->runstate_cache.active)
+	if (!gpc1->active)
 		return;
 
-	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
 		user_len = sizeof(struct vcpu_runstate_info);
-	else
+		times_ofs = offsetof(struct vcpu_runstate_info,
+				     state_entry_time);
+	} else {
 		user_len = sizeof(struct compat_vcpu_runstate_info);
+		times_ofs = offsetof(struct compat_vcpu_runstate_info,
+				     state_entry_time);
+	}
 
-	read_lock_irqsave(&gpc->lock, flags);
-	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
-					   user_len)) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+		user_len2 = user_len - user_len1;
+	} else {
+		user_len1 = user_len;
+		user_len2 = 0;
+	}
+	BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+	read_lock_irqsave(&gpc1->lock, flags);
+	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa,
+					   user_len1)) {
+		read_unlock_irqrestore(&gpc1->lock, flags);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (state == RUNSTATE_runnable)
 			return;
 
-		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(&gpc1->lock, flags);
+	}
+	if (user_len2) {
+		read_lock(&gpc2->lock);
+		if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+			read_unlock(&gpc2->lock);
+			read_unlock_irqrestore(&gpc1->lock, flags);
+
+			if (state == RUNSTATE_runnable)
+				return;
+
+			if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc2,
+							 gpc2->gpa, user_len2))
+				return;
+
+			goto retry;
+		}
 	}
 
 	/*
@@ -252,25 +315,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 #endif
 
-	user_state = gpc->khva;
-
-	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-		user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
-						  state_entry_time);
-	else
-		user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
-						  state_entry_time);
-
 	/*
-	 * First write the updated state_entry_time at the appropriate
-	 * location determined by 'offset'.
+	 * The XEN_RUNSTATE_UPDATE bit is the top bit of the state_entry_time
+	 * field. We need to set it (and write-barrier) before the rest.
 	 */
 	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
-		     sizeof(user_times[0]));
+		     sizeof(uint64_t));
 	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
-		     sizeof(user_times[0]));
+		     sizeof(uint64_t));
+	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
 
-	user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+	if (user_len1 >= times_ofs + sizeof(uint64_t))
+		update_bit = ((u8 *)gpc1->khva) + times_ofs + sizeof(u64) - 1;
+	else
+		update_bit = ((u8 *)gpc2->khva) + times_ofs + sizeof(u64) - 1 -
+			user_len1;
+
+	*update_bit |= (XEN_RUNSTATE_UPDATE >> 56);
 	smp_wmb();
 
 	/*
@@ -284,7 +345,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
 		     sizeof(vx->current_runstate));
 
-	*user_state = vx->current_runstate;
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   &vx->current_runstate, sizeof(vx->current_runstate),
+			   offsetof(struct vcpu_runstate_info, state));
 
 	/*
 	 * Write the actual runstate times immediately after the
@@ -299,19 +362,28 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
 		     sizeof(vx->runstate_times));
 
-	memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
-	smp_wmb();
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   vx->runstate_times, sizeof(vx->runstate_times),
+			   times_ofs + sizeof(u64));
 
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   &vx->runstate_entry_time, sizeof(vx->runstate_entry_time) - 1,
+			   times_ofs);
+	smp_wmb();
 	/*
 	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
 	 * runstate_entry_time field.
 	 */
-	user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+	*update_bit = vx->runstate_entry_time >> 56;
 	smp_wmb();
 
-	read_unlock_irqrestore(&gpc->lock, flags);
+	if (user_len2)
+		read_unlock_irqrestore(&gpc2->lock, flags);
+	read_unlock_irqrestore(&gpc1->lock, flags);
 
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+	if (user_len2)
+		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
 }
 
 static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -584,23 +656,52 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		break;
 
-	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+		size_t sz;
+
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
 			break;
 		}
 		if (data->u.gpa == GPA_INVALID) {
+			r = 0;
+		deactivate_out:
 			kvm_gpc_deactivate(vcpu->kvm,
 					   &vcpu->arch.xen.runstate_cache);
-			r = 0;
+		deactivate2_out:
+			kvm_gpc_deactivate(vcpu->kvm,
+					   &vcpu->arch.xen.runstate2_cache);
 			break;
 		}
 
-		r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
-				     NULL, KVM_HOST_USES_PFN, data->u.gpa,
-				     sizeof(struct vcpu_runstate_info));
+		if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+			sz = sizeof(struct vcpu_runstate_info);
+		else
+			sz = sizeof(struct compat_vcpu_runstate_info);
+
+		/* Handle structures which cross a page boundary by using two GPCs */
+		if ((data->u.gpa & ~PAGE_MASK) + sz <= PAGE_SIZE) {
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     sizeof(struct vcpu_runstate_info));
+			goto deactivate2_out;
+		} else {
+			/* Map the end of the first page... */
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     PAGE_SIZE - (data->u.gpa & ~PAGE_MASK));
+			if (r)
+				goto deactivate2_out;
+			/* ... and the start of the second. */
+			sz -= PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+					     NULL, KVM_HOST_USES_PFN,
+					     (data->u.gpa + PAGE_SIZE) & PAGE_MASK, sz);
+			if (r)
+				goto deactivate_out;
+		}
 		break;
-
+	}
 	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
@@ -1834,6 +1935,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 
 	kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
 }
@@ -1844,6 +1946,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 		kvm_xen_stop_timer(vcpu);
 
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
  

Patch

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index b2be60c6efa4..512b4afa6785 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -198,6 +198,37 @@  static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 	vx->runstate_entry_time = now;
 }
 
+static inline size_t kvm_xen_runstate_info_size(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return sizeof(struct vcpu_runstate_info);
+	else
+		return sizeof(struct compat_vcpu_runstate_info);
+}
+
+static int kvm_xen_activate_runstate_gpc(struct kvm_vcpu *vcpu, unsigned long gpa)
+{
+	size_t user_len = kvm_xen_runstate_info_size(vcpu->kvm);
+	return kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+				NULL, KVM_HOST_USES_PFN, gpa, user_len);
+}
+
+static int kvm_xen_reactivate_runstate_gpcs(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.xen.runstate_cache.active) {
+			int r = kvm_xen_activate_runstate_gpc(vcpu,
+					vcpu->arch.xen.runstate_cache.gpa);
+			if (r < 0)
+				return r;
+		}
+	}
+	return 0;
+}
+
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
@@ -212,11 +243,7 @@  void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	if (!vx->runstate_cache.active)
 		return;
 
-	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-		user_len = sizeof(struct vcpu_runstate_info);
-	else
-		user_len = sizeof(struct compat_vcpu_runstate_info);
-
+	user_len = kvm_xen_runstate_info_size(v->kvm);
 	read_lock_irqsave(&gpc->lock, flags);
 	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
 					   user_len)) {
@@ -461,7 +488,7 @@  int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 			mutex_lock(&kvm->lock);
 			kvm->arch.xen.long_mode = !!data->u.long_mode;
 			mutex_unlock(&kvm->lock);
-			r = 0;
+			r = kvm_xen_reactivate_runstate_gpcs(kvm);
 		}
 		break;
 
@@ -596,9 +623,7 @@  int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			break;
 		}
 
-		r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
-				     NULL, KVM_HOST_USES_PFN, data->u.gpa,
-				     sizeof(struct vcpu_runstate_info));
+		r = kvm_xen_activate_runstate_gpc(vcpu, data->u.gpa);
 		break;
 
 	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -843,9 +868,13 @@  int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 	u32 page_num = data & ~PAGE_MASK;
 	u64 page_addr = data & PAGE_MASK;
 	bool lm = is_long_mode(vcpu);
+	int r;
 
 	/* Latch long_mode for shared_info pages etc. */
 	vcpu->kvm->arch.xen.long_mode = lm;
+	r = kvm_xen_reactivate_runstate_gpcs(kvm);
+	if (r < 0)
+		return 1;
 
 	/*
 	 * If Xen hypercall intercept is enabled, fill the hypercall