[03/27] drm/i915/gvt: Incorporate KVM memslot info into check for 2MiB GTT entry

Message ID 20221223005739.1295925-4-seanjc@google.com
State New
Headers
Series drm/i915/gvt: KVM: KVMGT fixes and page-track cleanups |

Commit Message

Sean Christopherson Dec. 23, 2022, 12:57 a.m. UTC
  Honor KVM's max allowed page size when determining whether or not a 2MiB
GTT shadow page can be created for the guest.  Querying KVM's max allowed
size is somewhat odd as there's no strict requirement that KVM's memslots
and VFIO's mappings are configured with the same gfn=>hva mapping, but
the check will be accurate if userspace wants to have a functional guest,
and at the very least checking KVM's memslots guarantees that the entire
2MiB range has been exposed to the guest.

Note, KVM may also restrict the mapping size for reasons that aren't
relevant to KVMGT, e.g. for KVM's iTLB multi-hit workaround or if the gfn
is write-tracked (KVM's write-tracking only handles writes from vCPUs).
However, such scenarios are unlikely to occur with a well-behaved guest,
and at worst will result in sub-optimal performance.

Fixes: b901b252b6cf ("drm/i915/gvt: Add 2M huge gtt support")
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_page_track.h |  2 ++
 arch/x86/kvm/mmu/page_track.c         | 18 ++++++++++++++++++
 drivers/gpu/drm/i915/gvt/gtt.c        | 10 +++++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)
  

Comments

Yan Zhao Dec. 28, 2022, 5:42 a.m. UTC | #1
On Fri, Dec 23, 2022 at 12:57:15AM +0000, Sean Christopherson wrote:
> Honor KVM's max allowed page size when determining whether or not a 2MiB
> GTT shadow page can be created for the guest.  Querying KVM's max allowed
> size is somewhat odd as there's no strict requirement that KVM's memslots
> and VFIO's mappings are configured with the same gfn=>hva mapping, but
Without vIOMMU, VFIO's mapping is configured with the same as KVM's
memslots, i.e. with the same gfn==>HVA mapping


> the check will be accurate if userspace wants to have a functional guest,
> and at the very least checking KVM's memslots guarantees that the entire
> 2MiB range has been exposed to the guest.

I think just check the entrie 2MiB GFN range are all within KVM memslot is
enough.
If for some reason, KVM maps a 2MiB range in 4K sizes, KVMGT can still map
it in IOMMU size in 2MiB size as long as the PFNs are continous and the
whole range is all exposed to guest.
Actually normal device passthrough with VFIO-PCI also maps GFNs in a
similar way, i.e. maps a guest visible range in as large size as
possible as long as the PFN is continous. 
> 
> Note, KVM may also restrict the mapping size for reasons that aren't
> relevant to KVMGT, e.g. for KVM's iTLB multi-hit workaround or if the gfn
Will iTLB multi-hit affect DMA?
AFAIK, IOMMU mappings currently never sets exec bit (and I'm told this bit is
under discussion to be removed).


> is write-tracked (KVM's write-tracking only handles writes from vCPUs).
> However, such scenarios are unlikely to occur with a well-behaved guest,
> and at worst will result in sub-optimal performance.
> Fixes: b901b252b6cf ("drm/i915/gvt: Add 2M huge gtt support")
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/include/asm/kvm_page_track.h |  2 ++
>  arch/x86/kvm/mmu/page_track.c         | 18 ++++++++++++++++++
>  drivers/gpu/drm/i915/gvt/gtt.c        | 10 +++++++++-
>  3 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
> index eb186bc57f6a..3f72c7a172fc 100644
> --- a/arch/x86/include/asm/kvm_page_track.h
> +++ b/arch/x86/include/asm/kvm_page_track.h
> @@ -51,6 +51,8 @@ void kvm_page_track_cleanup(struct kvm *kvm);
>  
>  bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
>  int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
> +enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
> +					       enum pg_level max_level);
>  
>  void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
>  int kvm_page_track_create_memslot(struct kvm *kvm,
> diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
> index 2e09d1b6249f..69ea16c31859 100644
> --- a/arch/x86/kvm/mmu/page_track.c
> +++ b/arch/x86/kvm/mmu/page_track.c
> @@ -300,3 +300,21 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
>  			n->track_flush_slot(kvm, slot, n);
>  	srcu_read_unlock(&head->track_srcu, idx);
>  }
> +
> +enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
> +					       enum pg_level max_level)
> +{
> +	struct kvm_memory_slot *slot;
> +	int idx;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
> +		max_level = PG_LEVEL_4K;
> +	else
> +		max_level = kvm_mmu_max_slot_mapping_level(slot, gfn, max_level);
> +	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	return max_level;
> +}
> +EXPORT_SYMBOL_GPL(kvm_page_track_max_mapping_level);
> diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
> index d0fca53a3563..6736d7bd94ea 100644
> --- a/drivers/gpu/drm/i915/gvt/gtt.c
> +++ b/drivers/gpu/drm/i915/gvt/gtt.c
> @@ -1178,14 +1178,22 @@ static int is_2MB_gtt_possible(struct intel_vgpu *vgpu,
>  	struct intel_gvt_gtt_entry *entry)
>  {
>  	const struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
> +	unsigned long gfn = ops->get_pfn(entry);
>  	kvm_pfn_t pfn;
> +	int max_level;
>  
>  	if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
>  		return 0;
>  
>  	if (!vgpu->attached)
>  		return -EINVAL;
> -	pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry));
> +
> +	max_level = kvm_page_track_max_mapping_level(vgpu->vfio_device.kvm,
> +						     gfn, PG_LEVEL_2M);
> +	if (max_level < PG_LEVEL_2M)
> +		return 0;
> +
> +	pfn = gfn_to_pfn(vgpu->vfio_device.kvm, gfn);
>  	if (is_error_noslot_pfn(pfn))
>  		return -EINVAL;
>  
> -- 
> 2.39.0.314.g84b9a713c41-goog
>
  
Sean Christopherson Jan. 3, 2023, 9:13 p.m. UTC | #2
On Wed, Dec 28, 2022, Yan Zhao wrote:
> On Fri, Dec 23, 2022 at 12:57:15AM +0000, Sean Christopherson wrote:
> > Honor KVM's max allowed page size when determining whether or not a 2MiB
> > GTT shadow page can be created for the guest.  Querying KVM's max allowed
> > size is somewhat odd as there's no strict requirement that KVM's memslots
> > and VFIO's mappings are configured with the same gfn=>hva mapping, but
> Without vIOMMU, VFIO's mapping is configured with the same as KVM's
> memslots, i.e. with the same gfn==>HVA mapping

But that's controlled by userspace, correct?

> > the check will be accurate if userspace wants to have a functional guest,
> > and at the very least checking KVM's memslots guarantees that the entire
> > 2MiB range has been exposed to the guest.
> 
> I think just check the entrie 2MiB GFN range are all within KVM memslot is
> enough.

Strictly speaking, no.  E.g. if a 2MiB region is covered with multiple memslots
and the memslots have different properties.

> If for some reason, KVM maps a 2MiB range in 4K sizes, KVMGT can still map
> it in IOMMU size in 2MiB size as long as the PFNs are continous and the
> whole range is all exposed to guest.

I agree that practically speaking this will hold true, but if KVMGT wants to honor
KVM's memslots then checking that KVM allows a hugepage is correct.  Hrm, but on
the flip side, KVMGT ignores read-only memslot flags, so KVMGT is already ignoring
pieces of KVM's memslots.

I have no objection to KVMGT defining its ABI such that KVMGT is allowed to create
2MiB so long as (a) the GFN is contiguous according to VFIO, and (b) that the entire
2MiB range is exposed to the guest.

That said, being fully permissive also seems wasteful, e.g. KVM would need to
explicitly support straddling multiple memslots.

As a middle ground, what about tweaking kvm_page_track_is_valid_gfn() to take a
range, and then checking that the range is contained in a single memslot?

E.g. something like:

bool kvm_page_track_is_contiguous_gfn_range(struct kvm *kvm, gfn_t gfn,
					    unsigned long nr_pages)
{
	struct kvm_memory_slot *memslot;
	bool ret;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	memslot = gfn_to_memslot(kvm, gfn);
	ret = kvm_is_visible_memslot(memslot) &&
	      gfn + nr_pages <= memslot->base_gfn + memslot->npages;
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}

> Actually normal device passthrough with VFIO-PCI also maps GFNs in a
> similar way, i.e. maps a guest visible range in as large size as
> possible as long as the PFN is continous. 
> > 
> > Note, KVM may also restrict the mapping size for reasons that aren't
> > relevant to KVMGT, e.g. for KVM's iTLB multi-hit workaround or if the gfn
> Will iTLB multi-hit affect DMA?

I highly doubt it, I can't imagine an IOMMU would have a dedicated instruction
TLB :-)

> AFAIK, IOMMU mappings currently never sets exec bit (and I'm told this bit is
> under discussion to be removed).
  
Yan Zhao Jan. 5, 2023, 3:07 a.m. UTC | #3
On Tue, Jan 03, 2023 at 09:13:54PM +0000, Sean Christopherson wrote:
> On Wed, Dec 28, 2022, Yan Zhao wrote:
> > On Fri, Dec 23, 2022 at 12:57:15AM +0000, Sean Christopherson wrote:
> > > Honor KVM's max allowed page size when determining whether or not a 2MiB
> > > GTT shadow page can be created for the guest.  Querying KVM's max allowed
> > > size is somewhat odd as there's no strict requirement that KVM's memslots
> > > and VFIO's mappings are configured with the same gfn=>hva mapping, but
> > Without vIOMMU, VFIO's mapping is configured with the same as KVM's
> > memslots, i.e. with the same gfn==>HVA mapping
> 
> But that's controlled by userspace, correct?

Yes, controlled by QEMU.
VFIO in kernel has no idea of whether vIOMMU is enabled or not.
KVMGT currently is known not working with vIOMMU with shadow mode on
(in this mode, VFIO maps gIOVA ==> HVA ==> HPA) .

> 
> > > the check will be accurate if userspace wants to have a functional guest,
> > > and at the very least checking KVM's memslots guarantees that the entire
> > > 2MiB range has been exposed to the guest.
> > 
> > I think just check the entrie 2MiB GFN range are all within KVM memslot is
> > enough.
> 
> Strictly speaking, no.  E.g. if a 2MiB region is covered with multiple memslots
> and the memslots have different properties.
> 
> > If for some reason, KVM maps a 2MiB range in 4K sizes, KVMGT can still map
> > it in IOMMU size in 2MiB size as long as the PFNs are continous and the
> > whole range is all exposed to guest.
> 
> I agree that practically speaking this will hold true, but if KVMGT wants to honor
> KVM's memslots then checking that KVM allows a hugepage is correct.  Hrm, but on
> the flip side, KVMGT ignores read-only memslot flags, so KVMGT is already ignoring
> pieces of KVM's memslots.
KVMGT calls dma_map_page() with DMA_BIDIRECTIONAL after checking gvt_pin_guest_page().
Though for a read-only memslot, DMA_TO_DEVICE should be used instead
(see dma_info_to_prot()),
as gvt_pin_guest_page() checks (IOMMU_READ | IOMMU_WRITE) permission for each page,
it actually ensures that the pinned GFN is not in a read-only memslot.
So, it should be fine.

> 
> I have no objection to KVMGT defining its ABI such that KVMGT is allowed to create
> 2MiB so long as (a) the GFN is contiguous according to VFIO, and (b) that the entire
> 2MiB range is exposed to the guest.
> 
sorry. I may not put it clearly enough.
for a normal device pass-through via VFIO-PCI, VFIO maps IOMMU mappings in this way:

(a) fault in PFNs in a GFN range within the same memslot (VFIO saves dma_list, which is
the same as memslot list when vIOMMU is not on or not in shadow mode).
(b) map continuous PFNs into iommu driver (honour ro attribute and can > 2MiB as long as
PFNs are continuous).
(c) IOMMU driver decides to map in 2MiB or in 4KiB according to its setting.

For KVMGT, gvt_dma_map_page() first calls gvt_pin_guest_page() which
(a) calls vfio_pin_pages() to check each GFN is within allowed dma_list with
(IOMMU_READ | IOMMU_WRITE) permission and fault-in page. 
(b) checks PFNs are continuous in 2MiB,

Though checking kvm_page_track_max_mapping_level() is also fine, it makes DMA
mapping size unnecessarily smaller.

> That said, being fully permissive also seems wasteful, e.g. KVM would need to
> explicitly support straddling multiple memslots.
> 
> As a middle ground, what about tweaking kvm_page_track_is_valid_gfn() to take a
> range, and then checking that the range is contained in a single memslot?
> 
> E.g. something like:
> 
> bool kvm_page_track_is_contiguous_gfn_range(struct kvm *kvm, gfn_t gfn,
> 					    unsigned long nr_pages)
> {
> 	struct kvm_memory_slot *memslot;
> 	bool ret;
> 	int idx;
> 
> 	idx = srcu_read_lock(&kvm->srcu);
> 	memslot = gfn_to_memslot(kvm, gfn);
> 	ret = kvm_is_visible_memslot(memslot) &&
> 	      gfn + nr_pages <= memslot->base_gfn + memslot->npages;
> 	srcu_read_unlock(&kvm->srcu, idx);
> 
> 	return ret;
> }

Yes, it's good.
But as explained above, gvt_dma_map_page() checks in an equivalent way.
Maybe checking kvm_page_track_is_contiguous_gfn_range() is also not
required?
> 
> > Actually normal device passthrough with VFIO-PCI also maps GFNs in a
> > similar way, i.e. maps a guest visible range in as large size as
> > possible as long as the PFN is continous. 
> > > 
> > > Note, KVM may also restrict the mapping size for reasons that aren't
> > > relevant to KVMGT, e.g. for KVM's iTLB multi-hit workaround or if the gfn
> > Will iTLB multi-hit affect DMA?
> 
> I highly doubt it, I can't imagine an IOMMU would have a dedicated instruction
> TLB :-)
I can double check it with IOMMU hardware experts.
But if DMA would tamper instruction TLB, it should have been reported
as an issue with normal VFIO pass-through?

> > AFAIK, IOMMU mappings currently never sets exec bit (and I'm told this bit is
> > under discussion to be removed).
  
Sean Christopherson Jan. 5, 2023, 5:40 p.m. UTC | #4
On Thu, Jan 05, 2023, Yan Zhao wrote:
> On Tue, Jan 03, 2023 at 09:13:54PM +0000, Sean Christopherson wrote:
> > On Wed, Dec 28, 2022, Yan Zhao wrote:
> > > On Fri, Dec 23, 2022 at 12:57:15AM +0000, Sean Christopherson wrote:
> > > > Honor KVM's max allowed page size when determining whether or not a 2MiB
> > > > GTT shadow page can be created for the guest.  Querying KVM's max allowed
> > > > size is somewhat odd as there's no strict requirement that KVM's memslots
> > > > and VFIO's mappings are configured with the same gfn=>hva mapping, but
> > > Without vIOMMU, VFIO's mapping is configured with the same as KVM's
> > > memslots, i.e. with the same gfn==>HVA mapping
> > 
> > But that's controlled by userspace, correct?
> 
> Yes, controlled by QEMU.

...

> > Strictly speaking, no.  E.g. if a 2MiB region is covered with multiple memslots
> > and the memslots have different properties.
> > 
> > > If for some reason, KVM maps a 2MiB range in 4K sizes, KVMGT can still map
> > > it in IOMMU size in 2MiB size as long as the PFNs are continous and the
> > > whole range is all exposed to guest.
> > 
> > I agree that practically speaking this will hold true, but if KVMGT wants to honor
> > KVM's memslots then checking that KVM allows a hugepage is correct.  Hrm, but on
> > the flip side, KVMGT ignores read-only memslot flags, so KVMGT is already ignoring
> > pieces of KVM's memslots.
> KVMGT calls dma_map_page() with DMA_BIDIRECTIONAL after checking gvt_pin_guest_page().
> Though for a read-only memslot, DMA_TO_DEVICE should be used instead
> (see dma_info_to_prot()),
> as gvt_pin_guest_page() checks (IOMMU_READ | IOMMU_WRITE) permission for each page,
> it actually ensures that the pinned GFN is not in a read-only memslot.
> So, it should be fine.
> 
> > 
> > I have no objection to KVMGT defining its ABI such that KVMGT is allowed to create
> > 2MiB so long as (a) the GFN is contiguous according to VFIO, and (b) that the entire
> > 2MiB range is exposed to the guest.
> > 
> sorry. I may not put it clearly enough.
> for a normal device pass-through via VFIO-PCI, VFIO maps IOMMU mappings in this way:
> 
> (a) fault in PFNs in a GFN range within the same memslot (VFIO saves dma_list, which is
> the same as memslot list when vIOMMU is not on or not in shadow mode).
> (b) map continuous PFNs into iommu driver (honour ro attribute and can > 2MiB as long as
> PFNs are continuous).
> (c) IOMMU driver decides to map in 2MiB or in 4KiB according to its setting.
> 
> For KVMGT, gvt_dma_map_page() first calls gvt_pin_guest_page() which
> (a) calls vfio_pin_pages() to check each GFN is within allowed dma_list with
> (IOMMU_READ | IOMMU_WRITE) permission and fault-in page. 
> (b) checks PFNs are continuous in 2MiB,
> 
> Though checking kvm_page_track_max_mapping_level() is also fine, it makes DMA
> mapping size unnecessarily smaller.

Yeah, I got all that.  What I'm trying to say, and why I asked about whether or
not userspace controls the mappings, is that AFAIK there is nothing in the kernel
that coordinates mappings between VFIO and KVM.  So, very technically, userspace
could map a 2MiB range contiguous in VFIO but not in KVM, or RW in VFIO but RO in KVM.

I can't imagine there's a real use case for doing so, and arguably there's no
requirement that KVMGT honor KVM's memslot.  But because KVMGT taps into KVM's
page-tracking, KVMGT _does_ honor KVM's memslots to some extent because KVMGT
needs to know whether or not a given GFN can be write-protected.

I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
and permissions, and that the only requirement for KVM memslots is that GTT page
tables need to be visible in KVM's memslots.  But if that's the ABI, then
intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
("drm/i915/gvt: validate gfn before set shadow page entry").

In other words, pick either VFIO or KVM.  Checking that X is valid according to
KVM and then mapping X through VFIO is confusing and makes assumptions about how
userspace configures KVM and VFIO.  It works because QEMU always configures KVM
and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
unaware readers because the code is technically flawed.

On a related topic, ppgtt_populate_shadow_entry() should check the validity of the
gfn.  If I'm reading the code correctly, checking only in ppgtt_populate_spt() fails
to handle the case where the guest creates a bogus mapping when writing an existing
GTT PT.

Combing all my trains of thought, what about this as an end state for this series?
(completely untested at this point).  Get rid of the KVM mapping size checks,
verify the validity of the entire range being mapped, and add a FIXME to complain
about using KVM instead of VFIO to determine the validity of ranges.

static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn,
				   enum intel_gvt_gtt_type type)
{
	unsigned long nr_pages;

	if (!vgpu->attached)
		return false;

	if (type == GTT_TYPE_PPGTT_PTE_64K_ENTRY)
		nr_pages = I915_GTT_PAGE_SIZE_64K >> PAGE_SHIFT;
	else if (type == GTT_TYPE_PPGTT_PTE_2M_ENTRY)
		nr_pages = I915_GTT_PAGE_SIZE_2M >> PAGE_SHIFT;
	else
		nr_pages = 1;

	/*
	 * FIXME: Probe VFIO, not KVM.  VFIO is the source of truth for KVMGT
	 * mappings and permissions, KVM's involvement is purely to handle
	 * write-tracking of GTT page tables.
	 */
	return kvm_page_track_is_contiguous_gfn_range(vgpu->vfio_device.kvm,
						      gfn, nr_pages);
}

static int try_map_2MB_gtt_entry(struct intel_vgpu *vgpu, unsigned long gfn,
				 dma_addr_t *dma_addr)
{
	if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
		return 0;

	return intel_gvt_dma_map_guest_page(vgpu, gfn,
					    I915_GTT_PAGE_SIZE_2M, dma_addr);
}

static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
	struct intel_vgpu_ppgtt_spt *spt, unsigned long index,
	struct intel_gvt_gtt_entry *ge)
{
	const struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
	dma_addr_t dma_addr = vgpu->gvt->gtt.scratch_mfn << PAGE_SHIFT;
	struct intel_gvt_gtt_entry se = *ge;
	unsigned long gfn;
	int ret;

	if (!pte_ops->test_present(ge))
		goto set_shadow_entry;

	gfn = pte_ops->get_pfn(ge);
	if (!intel_gvt_is_valid_gfn(vgpu, gfn, ge->type))
		goto set_shadow_entry;

	...


set_shadow_entry:
	pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
	ppgtt_set_shadow_entry(spt, &se, index);
	return 0;
}
  
Yan Zhao Jan. 6, 2023, 5:56 a.m. UTC | #5
On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> On Thu, Jan 05, 2023, Yan Zhao wrote:
> > On Tue, Jan 03, 2023 at 09:13:54PM +0000, Sean Christopherson wrote:
> > > On Wed, Dec 28, 2022, Yan Zhao wrote:
> > > > On Fri, Dec 23, 2022 at 12:57:15AM +0000, Sean Christopherson wrote:
> > > > > Honor KVM's max allowed page size when determining whether or not a 2MiB
> > > > > GTT shadow page can be created for the guest.  Querying KVM's max allowed
> > > > > size is somewhat odd as there's no strict requirement that KVM's memslots
> > > > > and VFIO's mappings are configured with the same gfn=>hva mapping, but
> > > > Without vIOMMU, VFIO's mapping is configured with the same as KVM's
> > > > memslots, i.e. with the same gfn==>HVA mapping
> > > 
> > > But that's controlled by userspace, correct?
> > 
> > Yes, controlled by QEMU.
> 
> ...
> 
> > > Strictly speaking, no.  E.g. if a 2MiB region is covered with multiple memslots
> > > and the memslots have different properties.
> > > 
> > > > If for some reason, KVM maps a 2MiB range in 4K sizes, KVMGT can still map
> > > > it in IOMMU size in 2MiB size as long as the PFNs are continous and the
> > > > whole range is all exposed to guest.
> > > 
> > > I agree that practically speaking this will hold true, but if KVMGT wants to honor
> > > KVM's memslots then checking that KVM allows a hugepage is correct.  Hrm, but on
> > > the flip side, KVMGT ignores read-only memslot flags, so KVMGT is already ignoring
> > > pieces of KVM's memslots.
> > KVMGT calls dma_map_page() with DMA_BIDIRECTIONAL after checking gvt_pin_guest_page().
> > Though for a read-only memslot, DMA_TO_DEVICE should be used instead
> > (see dma_info_to_prot()),
> > as gvt_pin_guest_page() checks (IOMMU_READ | IOMMU_WRITE) permission for each page,
> > it actually ensures that the pinned GFN is not in a read-only memslot.
> > So, it should be fine.
> > 
> > > 
> > > I have no objection to KVMGT defining its ABI such that KVMGT is allowed to create
> > > 2MiB so long as (a) the GFN is contiguous according to VFIO, and (b) that the entire
> > > 2MiB range is exposed to the guest.
> > > 
> > sorry. I may not put it clearly enough.
> > for a normal device pass-through via VFIO-PCI, VFIO maps IOMMU mappings in this way:
> > 
> > (a) fault in PFNs in a GFN range within the same memslot (VFIO saves dma_list, which is
> > the same as memslot list when vIOMMU is not on or not in shadow mode).
> > (b) map continuous PFNs into iommu driver (honour ro attribute and can > 2MiB as long as
> > PFNs are continuous).
> > (c) IOMMU driver decides to map in 2MiB or in 4KiB according to its setting.
> > 
> > For KVMGT, gvt_dma_map_page() first calls gvt_pin_guest_page() which
> > (a) calls vfio_pin_pages() to check each GFN is within allowed dma_list with
> > (IOMMU_READ | IOMMU_WRITE) permission and fault-in page. 
> > (b) checks PFNs are continuous in 2MiB,
> > 
> > Though checking kvm_page_track_max_mapping_level() is also fine, it makes DMA
> > mapping size unnecessarily smaller.
> 
> Yeah, I got all that.  What I'm trying to say, and why I asked about whether or
> not userspace controls the mappings, is that AFAIK there is nothing in the kernel
> that coordinates mappings between VFIO and KVM.  So, very technically, userspace
> could map a 2MiB range contiguous in VFIO but not in KVM, or RW in VFIO but RO in KVM.
> 
> I can't imagine there's a real use case for doing so, and arguably there's no
> requirement that KVMGT honor KVM's memslot.  But because KVMGT taps into KVM's
> page-tracking, KVMGT _does_ honor KVM's memslots to some extent because KVMGT
> needs to know whether or not a given GFN can be write-protected.
> 
> I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> and permissions, and that the only requirement for KVM memslots is that GTT page
> tables need to be visible in KVM's memslots.  But if that's the ABI, then
> intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> ("drm/i915/gvt: validate gfn before set shadow page entry").
> 
> In other words, pick either VFIO or KVM.  Checking that X is valid according to
> KVM and then mapping X through VFIO is confusing and makes assumptions about how
> userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> unaware readers because the code is technically flawed.
>
Agreed. 
Then after some further thought, I think maybe we can just remove
intel_gvt_is_valid_gfn() in KVMGT, because

(1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
ppgtt_populate_spt() are not for page track purpose, but to validate bogus
GFN.
(2) gvt_pin_guest_page() with gfn and size can do the validity checking,
which is called in intel_gvt_dma_map_guest_page(). So, we can move the
mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().


As below,

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 54b32ab843eb..5a85936df6d4 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -49,15 +49,6 @@
 static bool enable_out_of_sync = false;
 static int preallocated_oos_pages = 8192;

-static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn)
-{
-       if (!vgpu->attached)
-               return false;
-
-       /* FIXME: This doesn't properly handle guest entries larger than 4K. */
-       return kvm_page_track_is_valid_gfn(vgpu->vfio_device.kvm, gfn);
-}
-
 /*
  * validate a gm address and related range size,
  * translate it to host gm address
@@ -1340,16 +1331,12 @@ static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
                        ppgtt_generate_shadow_entry(&se, s, &ge);
                        ppgtt_set_shadow_entry(spt, &se, i);
                } else {
-                       gfn = ops->get_pfn(&ge);
-                       if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
+                       ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
+                       if (ret) {
                                ops->set_pfn(&se, gvt->gtt.scratch_mfn);
                                ppgtt_set_shadow_entry(spt, &se, i);
-                               continue;
-                       }
-
-                       ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
-                       if (ret)
                                goto fail;
+                       }
                }
        }
        return 0;
@@ -2336,14 +2325,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
                m.val64 = e.val64;
                m.type = e.type;

-               /* one PTE update may be issued in multiple writes and the
-                * first write may not construct a valid gfn
-                */
-               if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
-                       ops->set_pfn(&m, gvt->gtt.scratch_mfn);
-                       goto out;
-               }
-
                ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE,
                                                   &dma_addr);
                if (ret) {


> On a related topic, ppgtt_populate_shadow_entry() should check the validity of the
> gfn.  If I'm reading the code correctly, checking only in ppgtt_populate_spt() fails
> to handle the case where the guest creates a bogus mapping when writing an existing
> GTT PT.
Don't get it here. Could you elaborate more?

> 
> Combing all my trains of thought, what about this as an end state for this series?
> (completely untested at this point).  Get rid of the KVM mapping size checks,
> verify the validity of the entire range being mapped, and add a FIXME to complain
> about using KVM instead of VFIO to determine the validity of ranges.
> 
> static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn,
> 				   enum intel_gvt_gtt_type type)
> {
> 	unsigned long nr_pages;
> 
> 	if (!vgpu->attached)
> 		return false;
> 
> 	if (type == GTT_TYPE_PPGTT_PTE_64K_ENTRY)
> 		nr_pages = I915_GTT_PAGE_SIZE_64K >> PAGE_SHIFT;
> 	else if (type == GTT_TYPE_PPGTT_PTE_2M_ENTRY)
> 		nr_pages = I915_GTT_PAGE_SIZE_2M >> PAGE_SHIFT;
> 	else
> 		nr_pages = 1;
> 
> 	/*
> 	 * FIXME: Probe VFIO, not KVM.  VFIO is the source of truth for KVMGT
> 	 * mappings and permissions, KVM's involvement is purely to handle
> 	 * write-tracking of GTT page tables.
> 	 */
> 	return kvm_page_track_is_contiguous_gfn_range(vgpu->vfio_device.kvm,
> 						      gfn, nr_pages);
> }
> 
> static int try_map_2MB_gtt_entry(struct intel_vgpu *vgpu, unsigned long gfn,
> 				 dma_addr_t *dma_addr)
> {
> 	if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
> 		return 0;
> 
> 	return intel_gvt_dma_map_guest_page(vgpu, gfn,
> 					    I915_GTT_PAGE_SIZE_2M, dma_addr);
> }
> 
> static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
> 	struct intel_vgpu_ppgtt_spt *spt, unsigned long index,
> 	struct intel_gvt_gtt_entry *ge)
> {
> 	const struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
> 	dma_addr_t dma_addr = vgpu->gvt->gtt.scratch_mfn << PAGE_SHIFT;
> 	struct intel_gvt_gtt_entry se = *ge;
> 	unsigned long gfn;
> 	int ret;
> 
> 	if (!pte_ops->test_present(ge))
> 		goto set_shadow_entry;
> 
> 	gfn = pte_ops->get_pfn(ge);
> 	if (!intel_gvt_is_valid_gfn(vgpu, gfn, ge->type))
> 		goto set_shadow_entry;
As KVMGT only tracks PPGTT page table pages, this check here is not for page
track purpose, but to check bogus GFN.
So, Just leave the bogus GFN check to intel_gvt_dma_map_guest_page() through
VFIO is all right.

On the other hand, for the GFN validity for page track purpose, we can
leave it to kvm_write_track_add_gfn().

Do you think it's ok?


> 	...
> 
> 
> set_shadow_entry:
> 	pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
> 	ppgtt_set_shadow_entry(spt, &se, index);
> 	return 0;
> }
  
Sean Christopherson Jan. 6, 2023, 11:01 p.m. UTC | #6
On Fri, Jan 06, 2023, Yan Zhao wrote:
> On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> > On Thu, Jan 05, 2023, Yan Zhao wrote:
> > I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> > and permissions, and that the only requirement for KVM memslots is that GTT page
> > tables need to be visible in KVM's memslots.  But if that's the ABI, then
> > intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> > ("drm/i915/gvt: validate gfn before set shadow page entry").
> > 
> > In other words, pick either VFIO or KVM.  Checking that X is valid according to
> > KVM and then mapping X through VFIO is confusing and makes assumptions about how
> > userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> > and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> > unaware readers because the code is technically flawed.
> >
> Agreed. 
> Then after some further thought, I think maybe we can just remove
> intel_gvt_is_valid_gfn() in KVMGT, because
> 
> (1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
> ppgtt_populate_spt() are not for page track purpose, but to validate bogus
> GFN.
> (2) gvt_pin_guest_page() with gfn and size can do the validity checking,
> which is called in intel_gvt_dma_map_guest_page(). So, we can move the
> mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().

IIUC, that will re-introduce the problem commit cc753fbe1ac4 ("drm/i915/gvt: validate
gfn before set shadow page entry") solved by poking into KVM.  Lack of pre-validation
means that bogus GFNs will trigger error messages, e.g.

			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
				     &cur_iova, ret);

and

			gvt_vgpu_err("fail to populate guest ggtt entry\n");

One thought would be to turn those printks into tracepoints to eliminate unwanted
noise, and to prevent the guest from spamming the host kernel log by programming
garbage into the GTT (gvt_vgpu_err() isn't ratelimited).

> > On a related topic, ppgtt_populate_shadow_entry() should check the validity of the
> > gfn.  If I'm reading the code correctly, checking only in ppgtt_populate_spt() fails
> > to handle the case where the guest creates a bogus mapping when writing an existing
> > GTT PT.
> Don't get it here. Could you elaborate more?

AFAICT, KVMGT only pre-validates the GFN on the initial setup, not when the guest
modifies a write-tracked entry.  I believe this is a moot point if the pre-validation
is removed entirely.

> > 	gfn = pte_ops->get_pfn(ge);
> > 	if (!intel_gvt_is_valid_gfn(vgpu, gfn, ge->type))
> > 		goto set_shadow_entry;
> As KVMGT only tracks PPGTT page table pages, this check here is not for page
> track purpose, but to check bogus GFN.
> So, Just leave the bogus GFN check to intel_gvt_dma_map_guest_page() through
> VFIO is all right.
> 
> On the other hand, for the GFN validity for page track purpose, we can
> leave it to kvm_write_track_add_gfn().
> 
> Do you think it's ok?

Yep, the only hiccup is the gvt_vgpu_err() calls that are guest-triggerable, and
converting those to a tracepoint seems like the right answer.
  
Yan Zhao Jan. 9, 2023, 9:58 a.m. UTC | #7
On Fri, Jan 06, 2023 at 11:01:53PM +0000, Sean Christopherson wrote:
> On Fri, Jan 06, 2023, Yan Zhao wrote:
> > On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> > > On Thu, Jan 05, 2023, Yan Zhao wrote:
> > > I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> > > and permissions, and that the only requirement for KVM memslots is that GTT page
> > > tables need to be visible in KVM's memslots.  But if that's the ABI, then
> > > intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> > > ("drm/i915/gvt: validate gfn before set shadow page entry").
> > > 
> > > In other words, pick either VFIO or KVM.  Checking that X is valid according to
> > > KVM and then mapping X through VFIO is confusing and makes assumptions about how
> > > userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> > > and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> > > unaware readers because the code is technically flawed.
> > >
> > Agreed. 
> > Then after some further thought, I think maybe we can just remove
> > intel_gvt_is_valid_gfn() in KVMGT, because
> > 
> > (1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
> > ppgtt_populate_spt() are not for page track purpose, but to validate bogus
> > GFN.
> > (2) gvt_pin_guest_page() with gfn and size can do the validity checking,
> > which is called in intel_gvt_dma_map_guest_page(). So, we can move the
> > mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().
> 
> IIUC, that will re-introduce the problem commit cc753fbe1ac4 ("drm/i915/gvt: validate
> gfn before set shadow page entry") solved by poking into KVM.  Lack of pre-validation
> means that bogus GFNs will trigger error messages, e.g.
> 
> 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
> 				     &cur_iova, ret);
> 
> and
> 
> 			gvt_vgpu_err("fail to populate guest ggtt entry\n");

Thanks for pointing it out.
I checked this commit message and found below original intentions to introduce
pre-validation:
   "GVT may receive partial write on one guest PTE update. Validate gfn
    not to translate incomplete gfn. This avoids some unnecessary error
    messages incurred by the incomplete gfn translating. Also fix the
    bug that the whole PPGTT shadow page update is aborted on any invalid
    gfn entry"

(1) first intention -- unnecessary error message came from GGTT partial write.
    For guest GGTT writes, the guest calls writeq to an MMIO GPA, which is
    8 bytes in length, while QEMU splits the MMIO write into 2 4-byte writes.
    The splitted 2 writes can cause invalid GFN to be found.

    But this partial write issue has been fixed by the two follow-up commits:
        bc0686ff5fad drm/i915/gvt: support inconsecutive partial gtt entry write
        510fe10b6180 drm/i915/gvt: fix a bug of partially write ggtt enties

    so pre-validation to reduce noise is not necessary any more here.

(2) the second intention -- "the whole PPGTT shadow page update is aborted on any
    invalid gfn entry"
    As PPGTT resides in normal guest RAM and we only treat 8-byte writes
    as valid page table writes, any invalid GPA found is regarded as
    an error, either due to guest misbehavior/attack or bug in host
    shadow code. 
    So, direct abort looks good too. Like below:

@@ -1340,13 +1338,6 @@ static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
                        ppgtt_generate_shadow_entry(&se, s, &ge);
                        ppgtt_set_shadow_entry(spt, &se, i);
                } else {
-                       gfn = ops->get_pfn(&ge);
-                       if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
-                               ops->set_pfn(&se, gvt->gtt.scratch_mfn);
-                               ppgtt_set_shadow_entry(spt, &se, i);
-                               continue;
-                       }
-
                        ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
                        if (ret)
                                goto fail;

(I actually found that the original code will print "invalid entry type"
warning which indicates it's broken for a while due to lack of test in
this invalid gfn path)


> One thought would be to turn those printks into tracepoints to eliminate unwanted
> noise, and to prevent the guest from spamming the host kernel log by programming
> garbage into the GTT (gvt_vgpu_err() isn't ratelimited).
As those printks would not happen in normal conditions and printks may have
some advantages to discover the attack or bug, could we just convert
gvt_vgpu_err() to be ratelimited ?

Thanks
Yan

> 
> > > On a related topic, ppgtt_populate_shadow_entry() should check the validity of the
> > > gfn.  If I'm reading the code correctly, checking only in ppgtt_populate_spt() fails
> > > to handle the case where the guest creates a bogus mapping when writing an existing
> > > GTT PT.
> > Don't get it here. Could you elaborate more?
> 
> AFAICT, KVMGT only pre-validates the GFN on the initial setup, not when the guest
> modifies a write-tracked entry.  I believe this is a moot point if the pre-validation
> is removed entirely.
> 
> > > 	gfn = pte_ops->get_pfn(ge);
> > > 	if (!intel_gvt_is_valid_gfn(vgpu, gfn, ge->type))
> > > 		goto set_shadow_entry;
> > As KVMGT only tracks PPGTT page table pages, this check here is not for page
> > track purpose, but to check bogus GFN.
> > So, Just leave the bogus GFN check to intel_gvt_dma_map_guest_page() through
> > VFIO is all right.
> > 
> > On the other hand, for the GFN validity for page track purpose, we can
> > leave it to kvm_write_track_add_gfn().
> > 
> > Do you think it's ok?
> 
> Yep, the only hiccup is the gvt_vgpu_err() calls that are guest-triggerable, and
> converting those to a tracepoint seems like the right answer.
  
Sean Christopherson Jan. 11, 2023, 5:55 p.m. UTC | #8
On Mon, Jan 09, 2023, Yan Zhao wrote:
> On Fri, Jan 06, 2023 at 11:01:53PM +0000, Sean Christopherson wrote:
> > On Fri, Jan 06, 2023, Yan Zhao wrote:
> > > On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> > > > On Thu, Jan 05, 2023, Yan Zhao wrote:
> > > > I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> > > > and permissions, and that the only requirement for KVM memslots is that GTT page
> > > > tables need to be visible in KVM's memslots.  But if that's the ABI, then
> > > > intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> > > > ("drm/i915/gvt: validate gfn before set shadow page entry").
> > > > 
> > > > In other words, pick either VFIO or KVM.  Checking that X is valid according to
> > > > KVM and then mapping X through VFIO is confusing and makes assumptions about how
> > > > userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> > > > and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> > > > unaware readers because the code is technically flawed.
> > > >
> > > Agreed. 
> > > Then after some further thought, I think maybe we can just remove
> > > intel_gvt_is_valid_gfn() in KVMGT, because
> > > 
> > > (1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
> > > ppgtt_populate_spt() are not for page track purpose, but to validate bogus
> > > GFN.
> > > (2) gvt_pin_guest_page() with gfn and size can do the validity checking,
> > > which is called in intel_gvt_dma_map_guest_page(). So, we can move the
> > > mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().
> > 
> > IIUC, that will re-introduce the problem commit cc753fbe1ac4 ("drm/i915/gvt: validate
> > gfn before set shadow page entry") solved by poking into KVM.  Lack of pre-validation
> > means that bogus GFNs will trigger error messages, e.g.
> > 
> > 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
> > 				     &cur_iova, ret);
> > 
> > and
> > 
> > 			gvt_vgpu_err("fail to populate guest ggtt entry\n");
> 
> Thanks for pointing it out.
> I checked this commit message and found below original intentions to introduce
> pre-validation:

...

> (I actually found that the original code will print "invalid entry type"
> warning which indicates it's broken for a while due to lack of test in
> this invalid gfn path)
> 
> 
> > One thought would be to turn those printks into tracepoints to eliminate unwanted
> > noise, and to prevent the guest from spamming the host kernel log by programming
> > garbage into the GTT (gvt_vgpu_err() isn't ratelimited).
> As those printks would not happen in normal conditions and printks may have
> some advantages to discover the attack or bug, could we just convert
> gvt_vgpu_err() to be ratelimited ?

That's ultimately a decision that needs to be made by the GVT maintainers, as the
answer depends on the use case.  E.g. if most users of KVMGT run a single VM and
the guest user is also the host admin, then pr_err_ratelimited() is likely an
acceptable/preferable choice as there's a decent chance a human will see the errors
in the host kernel logs and be able to take action.

But if there's unlikely to be a human monitoring the host logs, and/or the guest
user is unrelated to the host admin, then a ratelimited printk() is less useful.
E.g. if there's no one monitoring the logs, then losing messages due to
ratelimiting provides a worse debug experience overall than having to manually
enable tracepoints.   And if there may be many tens of VMs (seems unlikely?), then
ratelimited printk() is even less useful because errors for a specific VM may be
lost, i.e. the printk() can't be relied upon in any way to detect issues.

FWIW, in KVM proper, use of printk() to capture guest "errors" is strongly discourage
for exactly these reasons.
  
Yan Zhao Jan. 12, 2023, 8:31 a.m. UTC | #9
> > > > Note, KVM may also restrict the mapping size for reasons that aren't
> > > > relevant to KVMGT, e.g. for KVM's iTLB multi-hit workaround or if the gfn
> > > Will iTLB multi-hit affect DMA?
> > 
> > I highly doubt it, I can't imagine an IOMMU would have a dedicated instruction
> > TLB :-)
> I can double check it with IOMMU hardware experts.
> But if DMA would tamper instruction TLB, it should have been reported
> as an issue with normal VFIO pass-through?

hi Sean,
This is the feedback:

- CPU Instruction TLB is only filled when CPU fetches an instruction.
- IOMMU uses IOTLB to cache IOVA translation.
  A remapping hardware may implement multiple IOTLBs, and some of these may
  be for special purposes, e.g., only for instruction fetches.
  There is no way for software to be aware that multiple
  translations for smaller pages have been used for a large page. If software
  modifies the paging structures so that the page size used for a 4-KByte range
  of input-addresses changes, the IOTLBs may subsequently contain multiple
  translations for the address range (one for each page size).
  A reference to a input-address in the address range may use any of these
  translations. Which translation is used may vary from one execution to
  another, and the choice may be implementation-specific.
- Theres no similar bug related to DMA requests for instruction fetch hitting
  multiple IOTLB entries reported in IOMMU side.
  The X bit in IOMMU paging structure is to be removed in future and is
  currently always unset.

Thanks
Yan
  
Zhenyu Wang Jan. 19, 2023, 2:58 a.m. UTC | #10
On 2023.01.11 17:55:04 +0000, Sean Christopherson wrote:
> On Mon, Jan 09, 2023, Yan Zhao wrote:
> > On Fri, Jan 06, 2023 at 11:01:53PM +0000, Sean Christopherson wrote:
> > > On Fri, Jan 06, 2023, Yan Zhao wrote:
> > > > On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> > > > > On Thu, Jan 05, 2023, Yan Zhao wrote:
> > > > > I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> > > > > and permissions, and that the only requirement for KVM memslots is that GTT page
> > > > > tables need to be visible in KVM's memslots.  But if that's the ABI, then
> > > > > intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> > > > > ("drm/i915/gvt: validate gfn before set shadow page entry").
> > > > > 
> > > > > In other words, pick either VFIO or KVM.  Checking that X is valid according to
> > > > > KVM and then mapping X through VFIO is confusing and makes assumptions about how
> > > > > userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> > > > > and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> > > > > unaware readers because the code is technically flawed.
> > > > >
> > > > Agreed. 
> > > > Then after some further thought, I think maybe we can just remove
> > > > intel_gvt_is_valid_gfn() in KVMGT, because
> > > > 
> > > > (1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
> > > > ppgtt_populate_spt() are not for page track purpose, but to validate bogus
> > > > GFN.
> > > > (2) gvt_pin_guest_page() with gfn and size can do the validity checking,
> > > > which is called in intel_gvt_dma_map_guest_page(). So, we can move the
> > > > mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().
> > > 
> > > IIUC, that will re-introduce the problem commit cc753fbe1ac4 ("drm/i915/gvt: validate
> > > gfn before set shadow page entry") solved by poking into KVM.  Lack of pre-validation
> > > means that bogus GFNs will trigger error messages, e.g.
> > > 
> > > 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
> > > 				     &cur_iova, ret);
> > > 
> > > and
> > > 
> > > 			gvt_vgpu_err("fail to populate guest ggtt entry\n");
> > 
> > Thanks for pointing it out.
> > I checked this commit message and found below original intentions to introduce
> > pre-validation:
> 
> ...
> 
> > (I actually found that the original code will print "invalid entry type"
> > warning which indicates it's broken for a while due to lack of test in
> > this invalid gfn path)
> > 
> > 
> > > One thought would be to turn those printks into tracepoints to eliminate unwanted
> > > noise, and to prevent the guest from spamming the host kernel log by programming
> > > garbage into the GTT (gvt_vgpu_err() isn't ratelimited).
> > As those printks would not happen in normal conditions and printks may have
> > some advantages to discover the attack or bug, could we just convert
> > gvt_vgpu_err() to be ratelimited ?
> 
> That's ultimately a decision that needs to be made by the GVT maintainers, as the
> answer depends on the use case.  E.g. if most users of KVMGT run a single VM and
> the guest user is also the host admin, then pr_err_ratelimited() is likely an
> acceptable/preferable choice as there's a decent chance a human will see the errors
> in the host kernel logs and be able to take action.
> 
> But if there's unlikely to be a human monitoring the host logs, and/or the guest
> user is unrelated to the host admin, then a ratelimited printk() is less useful.
> E.g. if there's no one monitoring the logs, then losing messages due to
> ratelimiting provides a worse debug experience overall than having to manually
> enable tracepoints.   And if there may be many tens of VMs (seems unlikely?), then
> ratelimited printk() is even less useful because errors for a specific VM may be
> lost, i.e. the printk() can't be relied upon in any way to detect issues.
> 
> FWIW, in KVM proper, use of printk() to capture guest "errors" is strongly discourage
> for exactly these reasons.

Current KVMGT usage is mostly in controlled mode, either user is own host admin,
or host admin would pre-configure specific limited number of VMs for KVMGT use.
I think printk on error should be fine, we don't need rate limit, and adding
extra trace monitor for admin might not be necessary. So I'm towards to keep to
use current error message.

thanks
  
Yan Zhao Jan. 19, 2023, 5:26 a.m. UTC | #11
On Thu, Jan 19, 2023 at 10:58:42AM +0800, Zhenyu Wang wrote:
> On 2023.01.11 17:55:04 +0000, Sean Christopherson wrote:
> > On Mon, Jan 09, 2023, Yan Zhao wrote:
> > > On Fri, Jan 06, 2023 at 11:01:53PM +0000, Sean Christopherson wrote:
> > > > On Fri, Jan 06, 2023, Yan Zhao wrote:
> > > > > On Thu, Jan 05, 2023 at 05:40:32PM +0000, Sean Christopherson wrote:
> > > > > > On Thu, Jan 05, 2023, Yan Zhao wrote:
> > > > > > I'm totally fine if KVMGT's ABI is that VFIO is the source of truth for mappings
> > > > > > and permissions, and that the only requirement for KVM memslots is that GTT page
> > > > > > tables need to be visible in KVM's memslots.  But if that's the ABI, then
> > > > > > intel_gvt_is_valid_gfn() should be probing VFIO, not KVM (commit cc753fbe1ac4
> > > > > > ("drm/i915/gvt: validate gfn before set shadow page entry").
> > > > > > 
> > > > > > In other words, pick either VFIO or KVM.  Checking that X is valid according to
> > > > > > KVM and then mapping X through VFIO is confusing and makes assumptions about how
> > > > > > userspace configures KVM and VFIO.  It works because QEMU always configures KVM
> > > > > > and VFIO as expected, but IMO it's unnecessarily fragile and again confusing for
> > > > > > unaware readers because the code is technically flawed.
> > > > > >
> > > > > Agreed. 
> > > > > Then after some further thought, I think maybe we can just remove
> > > > > intel_gvt_is_valid_gfn() in KVMGT, because
> > > > > 
> > > > > (1) both intel_gvt_is_valid_gfn() in emulate_ggtt_mmio_write() and
> > > > > ppgtt_populate_spt() are not for page track purpose, but to validate bogus
> > > > > GFN.
> > > > > (2) gvt_pin_guest_page() with gfn and size can do the validity checking,
> > > > > which is called in intel_gvt_dma_map_guest_page(). So, we can move the
> > > > > mapping of scratch page to the error path after intel_gvt_dma_map_guest_page().
> > > > 
> > > > IIUC, that will re-introduce the problem commit cc753fbe1ac4 ("drm/i915/gvt: validate
> > > > gfn before set shadow page entry") solved by poking into KVM.  Lack of pre-validation
> > > > means that bogus GFNs will trigger error messages, e.g.
> > > > 
> > > > 			gvt_vgpu_err("vfio_pin_pages failed for iova %pad, ret %d\n",
> > > > 				     &cur_iova, ret);
> > > > 
> > > > and
> > > > 
> > > > 			gvt_vgpu_err("fail to populate guest ggtt entry\n");
> > > 
> > > Thanks for pointing it out.
> > > I checked this commit message and found below original intentions to introduce
> > > pre-validation:
> > 
> > ...
> > 
> > > (I actually found that the original code will print "invalid entry type"
> > > warning which indicates it's broken for a while due to lack of test in
> > > this invalid gfn path)
> > > 
> > > 
> > > > One thought would be to turn those printks into tracepoints to eliminate unwanted
> > > > noise, and to prevent the guest from spamming the host kernel log by programming
> > > > garbage into the GTT (gvt_vgpu_err() isn't ratelimited).
> > > As those printks would not happen in normal conditions and printks may have
> > > some advantages to discover the attack or bug, could we just convert
> > > gvt_vgpu_err() to be ratelimited ?
> > 
> > That's ultimately a decision that needs to be made by the GVT maintainers, as the
> > answer depends on the use case.  E.g. if most users of KVMGT run a single VM and
> > the guest user is also the host admin, then pr_err_ratelimited() is likely an
> > acceptable/preferable choice as there's a decent chance a human will see the errors
> > in the host kernel logs and be able to take action.
> > 
> > But if there's unlikely to be a human monitoring the host logs, and/or the guest
> > user is unrelated to the host admin, then a ratelimited printk() is less useful.
> > E.g. if there's no one monitoring the logs, then losing messages due to
> > ratelimiting provides a worse debug experience overall than having to manually
> > enable tracepoints.   And if there may be many tens of VMs (seems unlikely?), then
> > ratelimited printk() is even less useful because errors for a specific VM may be
> > lost, i.e. the printk() can't be relied upon in any way to detect issues.
> > 
> > FWIW, in KVM proper, use of printk() to capture guest "errors" is strongly discourage
> > for exactly these reasons.
> 
> Current KVMGT usage is mostly in controlled mode, either user is own host admin,
> or host admin would pre-configure specific limited number of VMs for KVMGT use.
> I think printk on error should be fine, we don't need rate limit, and adding
> extra trace monitor for admin might not be necessary. So I'm towards to keep to
> use current error message.
> 

Thanks, Sean and Zhenyu.
So, could I just post the final fix as below?
And, Sean, would you like to include it in this series or should I send it out
first?

From dcc931011da3712333f61684ebb20765dbf2fb46 Mon Sep 17 00:00:00 2001
From: Yan Zhao <yan.y.zhao@intel.com>
Date: Thu, 19 Jan 2023 11:15:54 +0800
Subject: [PATCH] drm/i915/gvt: remove interface intel_gvt_is_valid_gfn

Currently intel_gvt_is_valid_gfn() is called in two places:
(1) shadowing guest GGTT entry
(2) shadowing guest PPGTT leaf entry,
which was introduced in commit cc753fbe1ac4
("drm/i915/gvt: validate gfn before set shadow page entry").

However, now it's not necessary to call this interface any more, because
a. GGTT partial write issue has been fixed by
   commit bc0686ff5fad
   ("drm/i915/gvt: support inconsecutive partial gtt entry write")
   commit 510fe10b6180
   ("drm/i915/gvt: fix a bug of partially write ggtt enties")
b. PPGTT resides in normal guest RAM and we only treat 8-byte writes
   as valid page table writes. Any invalid GPA found is regarded as
   an error, either due to guest misbehavior/attack or bug in host
   shadow code.
   So,rather than do GFN pre-checking and replace invalid GFNs with
   scratch GFN and continue silently, just remove the pre-checking and
   abort PPGTT shadowing on error detected.
c. GFN validity check is still performed in
   intel_gvt_dma_map_guest_page() --> gvt_pin_guest_page().
   It's more desirable to call VFIO interface to do both validity check
   and mapping.
   Calling intel_gvt_is_valid_gfn() to do GFN validity check from KVM side
   while later mapping the GFN through VFIO interface is unnecessarily
   fragile and confusing for unaware readers.

Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 drivers/gpu/drm/i915/gvt/gtt.c | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 445afecbe7ae..9b6c2ca1ee16 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -49,22 +49,6 @@
 static bool enable_out_of_sync = false;
 static int preallocated_oos_pages = 8192;

-static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn)
-{
-       struct kvm *kvm = vgpu->vfio_device.kvm;
-       int idx;
-       bool ret;
-
-       if (!vgpu->attached)
-               return false;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       ret = kvm_is_visible_gfn(kvm, gfn);
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return ret;
-}
-
 /*
  * validate a gm address and related range size,
  * translate it to host gm address

@@ -1345,13 +1329,6 @@ static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
                        ppgtt_generate_shadow_entry(&se, s, &ge);
                        ppgtt_set_shadow_entry(spt, &se, i);
                } else {
-                       gfn = ops->get_pfn(&ge);
-                       if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
-                               ops->set_pfn(&se, gvt->gtt.scratch_mfn);
-                               ppgtt_set_shadow_entry(spt, &se, i);
-                               continue;
-                       }
-
                        ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
                        if (ret)
                                goto fail;
@@ -2326,14 +2303,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
                m.val64 = e.val64;
                m.type = e.type;

-               /* one PTE update may be issued in multiple writes and the
-                * first write may not construct a valid gfn
-                */
-               if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
-                       ops->set_pfn(&m, gvt->gtt.scratch_mfn);
-                       goto out;
-               }
-
                ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE,
                                                   &dma_addr);
                if (ret) {
--
2.17.1
  
Sean Christopherson Feb. 23, 2023, 8:41 p.m. UTC | #12
Apologies for the super slow reply, I put this series on the backburner while I
caught up on other stuff and completely missed your questions.

On Thu, Jan 19, 2023, Yan Zhao wrote:
> On Thu, Jan 19, 2023 at 10:58:42AM +0800, Zhenyu Wang wrote:
> > Current KVMGT usage is mostly in controlled mode, either user is own host admin,
> > or host admin would pre-configure specific limited number of VMs for KVMGT use.
> > I think printk on error should be fine, we don't need rate limit, and adding
> > extra trace monitor for admin might not be necessary. So I'm towards to keep to
> > use current error message.
> > 
> 
> Thanks, Sean and Zhenyu.
> So, could I just post the final fix as below?

No objection here.

> And, Sean, would you like to include it in this series or should I send it out
> first?

I'd like to include it in this series as it's necessary (for some definitions of
necessary) to clean up KVM's APIs, and the main benefactor is KVM, i.e. getting
the patch merged sooner than later doesn't really benefit KVMGT itself.

Thanks much!
  
Yan Zhao Feb. 24, 2023, 5:09 a.m. UTC | #13
On Thu, Feb 23, 2023 at 12:41:28PM -0800, Sean Christopherson wrote:
> Apologies for the super slow reply, I put this series on the backburner while I
> caught up on other stuff and completely missed your questions.
>
Never mind :)

> On Thu, Jan 19, 2023, Yan Zhao wrote:
> > On Thu, Jan 19, 2023 at 10:58:42AM +0800, Zhenyu Wang wrote:
> > > Current KVMGT usage is mostly in controlled mode, either user is own host admin,
> > > or host admin would pre-configure specific limited number of VMs for KVMGT use.
> > > I think printk on error should be fine, we don't need rate limit, and adding
> > > extra trace monitor for admin might not be necessary. So I'm towards to keep to
> > > use current error message.
> > > 
> > 
> > Thanks, Sean and Zhenyu.
> > So, could I just post the final fix as below?
> 
> No objection here.
> 
> > And, Sean, would you like to include it in this series or should I send it out
> > first?
> 
> I'd like to include it in this series as it's necessary (for some definitions of
> necessary) to clean up KVM's APIs, and the main benefactor is KVM, i.e. getting
> the patch merged sooner than later doesn't really benefit KVMGT itself.
> 
> Thanks much!

Then please include it and I can help to test once you sending out next
version.

Thanks
Yan
  

Patch

diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index eb186bc57f6a..3f72c7a172fc 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -51,6 +51,8 @@  void kvm_page_track_cleanup(struct kvm *kvm);
 
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
 int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
+					       enum pg_level max_level);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
 int kvm_page_track_create_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f..69ea16c31859 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -300,3 +300,21 @@  void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
 			n->track_flush_slot(kvm, slot, n);
 	srcu_read_unlock(&head->track_srcu, idx);
 }
+
+enum pg_level kvm_page_track_max_mapping_level(struct kvm *kvm, gfn_t gfn,
+					       enum pg_level max_level)
+{
+	struct kvm_memory_slot *slot;
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+		max_level = PG_LEVEL_4K;
+	else
+		max_level = kvm_mmu_max_slot_mapping_level(slot, gfn, max_level);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return max_level;
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_max_mapping_level);
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index d0fca53a3563..6736d7bd94ea 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1178,14 +1178,22 @@  static int is_2MB_gtt_possible(struct intel_vgpu *vgpu,
 	struct intel_gvt_gtt_entry *entry)
 {
 	const struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
+	unsigned long gfn = ops->get_pfn(entry);
 	kvm_pfn_t pfn;
+	int max_level;
 
 	if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
 		return 0;
 
 	if (!vgpu->attached)
 		return -EINVAL;
-	pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry));
+
+	max_level = kvm_page_track_max_mapping_level(vgpu->vfio_device.kvm,
+						     gfn, PG_LEVEL_2M);
+	if (max_level < PG_LEVEL_2M)
+		return 0;
+
+	pfn = gfn_to_pfn(vgpu->vfio_device.kvm, gfn);
 	if (is_error_noslot_pfn(pfn))
 		return -EINVAL;