[v7,5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn
Commit Message
From: David Stevens <stevensd@chromium.org>
Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
memory into the guest that is backed by un-refcounted struct pages - for
example, higher order non-compound pages allocated by the amdgpu driver
via ttm_pool_alloc_page.
The bulk of this change is tracking the is_refcounted_page flag so that
non-refcounted pages don't trigger page_count() == 0 warnings. This is
done by storing the flag in an unused bit in the sptes.
Signed-off-by: David Stevens <stevensd@chromium.org>
---
arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
arch/x86/kvm/mmu/mmu_internal.h | 1 +
arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
arch/x86/kvm/mmu/spte.c | 4 ++-
arch/x86/kvm/mmu/spte.h | 12 ++++++++-
arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
6 files changed, 62 insertions(+), 30 deletions(-)
Comments
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> From: David Stevens <stevensd@chromium.org>
>
> Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> memory into the guest that is backed by un-refcounted struct pages - for
> example, higher order non-compound pages allocated by the amdgpu driver
> via ttm_pool_alloc_page.
I guess you mean the tail pages of the higher order non-compound pages?
And as to the head page, it is said to be set to one coincidentally[*],
and shall not be considered as refcounted. IIUC, refcount of this head
page will be increased and decreased soon in hva_to_pfn_remapped(), so
this may not be a problem(?). But treating this head page differently,
as a refcounted one (e.g., to set the A/D flags), is weird.
Or maybe I missed some context, e.g., can the head page be allocated to
guest at all?
>
> The bulk of this change is tracking the is_refcounted_page flag so that
> non-refcounted pages don't trigger page_count() == 0 warnings. This is
> done by storing the flag in an unused bit in the sptes.
Also, maybe we should mention this only works on x86-64.
>
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
> arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
> arch/x86/kvm/mmu/mmu_internal.h | 1 +
> arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
> arch/x86/kvm/mmu/spte.c | 4 ++-
> arch/x86/kvm/mmu/spte.h | 12 ++++++++-
> arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
> 6 files changed, 62 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e44ab512c3a1..b1607e314497 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
...
> @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> bool host_writable = !fault || fault->map_writable;
> bool prefetch = !fault || fault->prefetch;
> bool write_fault = fault && fault->write;
> + bool is_refcounted = !fault || fault->is_refcounted_page;
Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
practice?
...
>
> @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> */
> static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
> {
> - bool host_writable;
> + bool host_writable, is_refcounted;
> gpa_t first_pte_gpa;
> u64 *sptep, spte;
> struct kvm_memory_slot *slot;
> @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
> sptep = &sp->spt[i];
> spte = *sptep;
> host_writable = spte & shadow_host_writable_mask;
> + // TODO: is this correct?
> + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED;
> slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> make_spte(vcpu, sp, slot, pte_access, gfn,
> spte_to_pfn(spte), spte, true, false,
> - host_writable, &spte);
> + host_writable, is_refcounted, &spte);
Could we restrict that a non-refcounted page shall not be used as shadow page?
[*] https://lore.kernel.org/all/8caf3008-dcf3-985a-631e-e019b277c6f0@amd.com/
B.R.
Yu
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> From: David Stevens <stevensd@chromium.org>
>
> Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> memory into the guest that is backed by un-refcounted struct pages - for
> example, higher order non-compound pages allocated by the amdgpu driver
> via ttm_pool_alloc_page.
>
> The bulk of this change is tracking the is_refcounted_page flag so that
> non-refcounted pages don't trigger page_count() == 0 warnings. This is
> done by storing the flag in an unused bit in the sptes.
>
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
> arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
> arch/x86/kvm/mmu/mmu_internal.h | 1 +
> arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
> arch/x86/kvm/mmu/spte.c | 4 ++-
> arch/x86/kvm/mmu/spte.h | 12 ++++++++-
> arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
> 6 files changed, 62 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e44ab512c3a1..b1607e314497 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>
> if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
> flush = true;
> - kvm_set_pfn_accessed(spte_to_pfn(old_spte));
> + if (is_refcounted_page_pte(old_spte))
> + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
> }
>
> if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
> flush = true;
> - kvm_set_pfn_dirty(spte_to_pfn(old_spte));
> + if (is_refcounted_page_pte(old_spte))
> + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte)));
> }
>
> return flush;
> @@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
> * before they are reclaimed. Sanity check that, if the pfn is backed
> * by a refcounted page, the refcount is elevated.
> */
> - page = kvm_pfn_to_refcounted_page(pfn);
> - WARN_ON(page && !page_count(page));
> + if (is_refcounted_page_pte(old_spte)) {
> + page = kvm_pfn_to_refcounted_page(pfn);
> + WARN_ON(!page || !page_count(page));
> + }
>
> - if (is_accessed_spte(old_spte))
> - kvm_set_pfn_accessed(pfn);
> + if (is_refcounted_page_pte(old_spte)) {
> + if (is_accessed_spte(old_spte))
> + kvm_set_page_accessed(pfn_to_page(pfn));
>
> - if (is_dirty_spte(old_spte))
> - kvm_set_pfn_dirty(pfn);
> + if (is_dirty_spte(old_spte))
> + kvm_set_page_dirty(pfn_to_page(pfn));
> + }
>
> return old_spte;
> }
> @@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep)
> * Capture the dirty status of the page, so that it doesn't get
> * lost when the SPTE is marked for access tracking.
> */
> - if (is_writable_pte(spte))
> - kvm_set_pfn_dirty(spte_to_pfn(spte));
> + if (is_writable_pte(spte) && is_refcounted_page_pte(spte))
> + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte)));
>
> spte = mark_spte_for_access_track(spte);
> mmu_spte_update_no_track(sptep, spte);
> @@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
> {
> bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
> (unsigned long *)sptep);
> - if (was_writable && !spte_ad_enabled(*sptep))
> - kvm_set_pfn_dirty(spte_to_pfn(*sptep));
> + if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep))
> + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep)));
>
> return was_writable;
> }
> @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> bool host_writable = !fault || fault->map_writable;
> bool prefetch = !fault || fault->prefetch;
> bool write_fault = fault && fault->write;
> + bool is_refcounted = !fault || fault->is_refcounted_page;
>
> pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
> *sptep, write_fault, gfn);
> @@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> }
>
> wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
> - true, host_writable, &spte);
> + true, host_writable, is_refcounted, &spte);
>
> if (*sptep == spte) {
> ret = RET_PF_SPURIOUS;
> @@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> struct kvm_follow_pfn foll = {
> .slot = slot,
> .gfn = fault->gfn,
> - .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0),
> + .flags = fault->write ? FOLL_WRITE : 0,
> .allow_write_mapping = true,
> + .guarded_by_mmu_notifier = true,
> };
>
> /*
> @@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> fault->slot = NULL;
> fault->pfn = KVM_PFN_NOSLOT;
> fault->map_writable = false;
> + fault->is_refcounted_page = false;
> return RET_PF_CONTINUE;
> }
> /*
> @@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> success:
> fault->hva = foll.hva;
> fault->map_writable = foll.writable;
> + fault->is_refcounted_page = foll.is_refcounted_page;
> return RET_PF_CONTINUE;
> }
>
> @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>
> out_unlock:
> write_unlock(&vcpu->kvm->mmu_lock);
> - kvm_release_pfn_clean(fault->pfn);
> + if (fault->is_refcounted_page)
> + kvm_set_page_accessed(pfn_to_page(fault->pfn));
> return r;
> }
>
> @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>
> out_unlock:
> read_unlock(&vcpu->kvm->mmu_lock);
> - kvm_release_pfn_clean(fault->pfn);
Yet kvm_release_pfn() can still be triggered for kvm_vcpu_map'ed gfns.
What if the guest uses a non-refcounted page (e.g., as a vmcs12)? Although I
believe this is not gonna happen in the real world...
B.R.
Yu
> > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > */
> > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
> > {
> > - bool host_writable;
> > + bool host_writable, is_refcounted;
> > gpa_t first_pte_gpa;
> > u64 *sptep, spte;
> > struct kvm_memory_slot *slot;
> > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
> > sptep = &sp->spt[i];
> > spte = *sptep;
> > host_writable = spte & shadow_host_writable_mask;
> > + // TODO: is this correct?
> > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED;
> > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> > make_spte(vcpu, sp, slot, pte_access, gfn,
> > spte_to_pfn(spte), spte, true, false,
> > - host_writable, &spte);
> > + host_writable, is_refcounted, &spte);
>
> Could we restrict that a non-refcounted page shall not be used as shadow page?
Oh, sorry. It's not about shadow pages. It's about a guest page being
mapped as non-refcounted. Silly me...
B.R.
Yu
On Tue, Jul 04, 2023 at 04:50:50PM +0900,
David Stevens <stevensd@chromium.org> wrote:
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index cf2c6426a6fc..46c681dc45e6 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> const struct kvm_memory_slot *slot,
> unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
> u64 old_spte, bool prefetch, bool can_unsync,
> - bool host_writable, u64 *new_spte)
> + bool host_writable, bool is_refcounted, u64 *new_spte)
> {
> int level = sp->role.level;
> u64 spte = SPTE_MMU_PRESENT_MASK;
> @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>
> if (level > PG_LEVEL_4K)
> spte |= PT_PAGE_SIZE_MASK;
> + else if (is_refcounted)
> + spte |= SPTE_MMU_PAGE_REFCOUNTED;
Is REFCOUNTED for 4K pages only? What guarantees that a large page doesn't have
FOLL_GET? Or can we set the bit for large pages?
>
> if (shadow_memtype_mask)
> spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > From: David Stevens <stevensd@chromium.org>
> >
> > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > memory into the guest that is backed by un-refcounted struct pages - for
> > example, higher order non-compound pages allocated by the amdgpu driver
> > via ttm_pool_alloc_page.
>
> I guess you mean the tail pages of the higher order non-compound pages?
> And as to the head page, it is said to be set to one coincidentally[*],
> and shall not be considered as refcounted. IIUC, refcount of this head
> page will be increased and decreased soon in hva_to_pfn_remapped(), so
> this may not be a problem(?). But treating this head page differently,
> as a refcounted one (e.g., to set the A/D flags), is weird.
>
> Or maybe I missed some context, e.g., can the head page be allocated to
> guest at all?
Yes, this is to allow mapping the tail pages of higher order
non-compound pages - I should have been more precise in my wording.
The head pages can already be mapped into the guest.
Treating the head and tail pages consistently would require changing how KVM
behaves in a situation it supports today (rather than just adding
support for an unsupported situation). Currently, without this series,
KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the
guest. When that happens, KVM sets the A/D flags. I'm not sure whether
that's actually valid behavior, nor do I know whether anyone actually
cares about it. But it's what KVM does today, and I would shy away
from modifying that behavior without good reason.
> >
> > The bulk of this change is tracking the is_refcounted_page flag so that
> > non-refcounted pages don't trigger page_count() == 0 warnings. This is
> > done by storing the flag in an unused bit in the sptes.
>
> Also, maybe we should mention this only works on x86-64.
>
> >
> > Signed-off-by: David Stevens <stevensd@chromium.org>
> > ---
> > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
> > arch/x86/kvm/mmu/mmu_internal.h | 1 +
> > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
> > arch/x86/kvm/mmu/spte.c | 4 ++-
> > arch/x86/kvm/mmu/spte.h | 12 ++++++++-
> > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
> > 6 files changed, 62 insertions(+), 30 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index e44ab512c3a1..b1607e314497 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
>
> ...
>
> > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > bool host_writable = !fault || fault->map_writable;
> > bool prefetch = !fault || fault->prefetch;
> > bool write_fault = fault && fault->write;
> > + bool is_refcounted = !fault || fault->is_refcounted_page;
>
> Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
> practice?
Prefetching is still done via gfn_to_page_many_atomic, which sets
FOLL_GET. That's fixable, but it's not something this series currently
does.
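
(For reference, roughly what that prefetch path looks like — paraphrased from direct_pte_prefetch_many() in mmu.c as of this series, not code added by this patch. gfn_to_page_many_atomic() only returns pages it could take a reference on, and mmu_set_spte() is called with a NULL fault:)

        struct page *pages[PTE_PREFETCH_NUM];
        ...
        ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
        if (ret <= 0)
                return -1;

        for (i = 0; i < ret; i++, gfn++, start++) {
                /* fault == NULL here, so is_refcounted evaluates to true. */
                mmu_set_spte(vcpu, slot, start, access, gfn,
                             page_to_pfn(pages[i]), NULL);
                put_page(pages[i]);
        }
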
> ...
> >
> > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > */
> > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
> > {
> > - bool host_writable;
> > + bool host_writable, is_refcounted;
> > gpa_t first_pte_gpa;
> > u64 *sptep, spte;
> > struct kvm_memory_slot *slot;
> > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
> > sptep = &sp->spt[i];
> > spte = *sptep;
> > host_writable = spte & shadow_host_writable_mask;
> > + // TODO: is this correct?
> > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED;
> > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> > make_spte(vcpu, sp, slot, pte_access, gfn,
> > spte_to_pfn(spte), spte, true, false,
> > - host_writable, &spte);
> > + host_writable, is_refcounted, &spte);
>
> Could we restrict that a non-refcounted page shall not be used as shadow page?
I'm not very familiar with the shadow mmu, so my response might not
make sense. But do you mean not allowing non-refcounted pages as the
guest page tables shadowed by a kvm_mmu_page? It would probably be
possible to do that, and I doubt anyone would care about the
restriction. But as far as I can tell, the guest page table is only
accessed via kvm_vcpu_read_guest_atomic, which handles non-refcounted
pages just fine.
-David
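
(For context, that access is a plain gpa read through the hva — roughly what FNAME(sync_spte) does to fetch the guest PTE, paraphrased here from paging_tmpl.h; no struct page reference is taken, which is why refcounting doesn't matter for the shadowed guest page table:)

        pt_element_t gpte;
        gpa_t pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

        /* Copies through the userspace mapping; no page reference involved. */
        if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(gpte)))
                return 0;
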
On Thu, Jul 6, 2023 at 11:10 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:50PM +0900,
> David Stevens <stevensd@chromium.org> wrote:
>
> > diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> > index cf2c6426a6fc..46c681dc45e6 100644
> > --- a/arch/x86/kvm/mmu/spte.c
> > +++ b/arch/x86/kvm/mmu/spte.c
> > @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> > const struct kvm_memory_slot *slot,
> > unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
> > u64 old_spte, bool prefetch, bool can_unsync,
> > - bool host_writable, u64 *new_spte)
> > + bool host_writable, bool is_refcounted, u64 *new_spte)
> > {
> > int level = sp->role.level;
> > u64 spte = SPTE_MMU_PRESENT_MASK;
> > @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >
> > if (level > PG_LEVEL_4K)
> > spte |= PT_PAGE_SIZE_MASK;
> > + else if (is_refcounted)
> > + spte |= SPTE_MMU_PAGE_REFCOUNTED;
>
> Is REFCOUNTED for 4K pages only? What guarantees that a large page doesn't have
> FOLL_GET? Or can we set the bit for large pages?
Oh, you're right, it should apply to >4K pages as well. This was based
on stale thinking from earlier versions of this series.
-David
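
(The presumable follow-up, then, is to drop the level restriction and set the bit for any refcounted leaf SPTE — a sketch of what a later revision might do, not what v7 does:)

        if (level > PG_LEVEL_4K)
                spte |= PT_PAGE_SIZE_MASK;
        if (is_refcounted)
                spte |= SPTE_MMU_PAGE_REFCOUNTED;
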
On Thu, Jul 06, 2023 at 01:52:08PM +0900, David Stevens wrote:
> On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > > From: David Stevens <stevensd@chromium.org>
> > >
> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > > memory into the guest that is backed by un-refcounted struct pages - for
> > > example, higher order non-compound pages allocated by the amdgpu driver
> > > via ttm_pool_alloc_page.
> >
> > I guess you mean the tail pages of the higher order non-compound pages?
> > And as to the head page, it is said to be set to one coincidentally[*],
> > and shall not be considered as refcounted. IIUC, refcount of this head
> > page will be increased and decreased soon in hva_to_pfn_remapped(), so
> > this may not be a problem(?). But treating this head page differently,
> > as a refcounted one (e.g., to set the A/D flags), is weird.
> >
> > Or maybe I missed some context, e.g., can the head page be allocated to
> > guest at all?
>
> Yes, this is to allow mapping the tail pages of higher order
> non-compound pages - I should have been more precise in my wording.
> The head pages can already be mapped into the guest.
>
> Treating the head and tail pages consistently would require changing how KVM
> behaves in a situation it supports today (rather than just adding
> support for an unsupported situation). Currently, without this series,
> KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the
> guest. When that happens, KVM sets the A/D flags. I'm not sure whether
> that's actually valid behavior, nor do I know whether anyone actually
> cares about it. But it's what KVM does today, and I would shy away
> from modifying that behavior without good reason.
I know the A/D status of the refcounted, VM_PFNMAP|VM_IO backed pages
will be recorded, and I have no idea whether that is a necessary requirement
either.
But it feels awkward to see the head and the tail pages of non-compound
pages treated inconsistently. After all, the head page just happens
to have a refcount of 1; it is not a real refcounted page.
So I would suggest at least mentioning this different behavior in the commit
message. :)
> > >
> > > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > > */
> > > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
> > > {
> > > - bool host_writable;
> > > + bool host_writable, is_refcounted;
> > > gpa_t first_pte_gpa;
> > > u64 *sptep, spte;
> > > struct kvm_memory_slot *slot;
> > > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
> > > sptep = &sp->spt[i];
> > > spte = *sptep;
> > > host_writable = spte & shadow_host_writable_mask;
> > > + // TODO: is this correct?
> > > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED;
> > > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> > > make_spte(vcpu, sp, slot, pte_access, gfn,
> > > spte_to_pfn(spte), spte, true, false,
> > > - host_writable, &spte);
> > > + host_writable, is_refcounted, &spte);
> >
> > Could we restrict that a non-refcounted page shall not be used as shadow page?
>
> I'm not very familiar with the shadow mmu, so my response might not
> make sense. But do you mean not allowing non-refcounted pages as the
> guest page tables shadowed by a kvm_mmu_page? It would probably be
> possible to do that, and I doubt anyone would care about the
> restriction. But as far as I can tell, the guest page table is only
> accessed via kvm_vcpu_read_guest_atomic, which handles non-refcounted
> pages just fine.
Sorry, my brain just got baked... Pls just ignore this question :)
B.R.
Yu
On Thu, Jul 06, 2023 at 01:52:08PM +0900,
David Stevens <stevensd@chromium.org> wrote:
> On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > > From: David Stevens <stevensd@chromium.org>
> > >
> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > > memory into the guest that is backed by un-refcounted struct pages - for
> > > example, higher order non-compound pages allocated by the amdgpu driver
> > > via ttm_pool_alloc_page.
> >
> > I guess you mean the tail pages of the higher order non-compound pages?
> > And as to the head page, it is said to be set to one coincidentally[*],
> > and shall not be considered as refcounted. IIUC, refcount of this head
> > page will be increased and decreased soon in hva_to_pfn_remapped(), so
> > this may not be a problem(?). But treating this head page differently,
> > as a refcounted one (e.g., to set the A/D flags), is weird.
> >
> > Or maybe I missed some context, e.g., can the head page be allocated to
> > guest at all?
>
> Yes, this is to allow mapping the tail pages of higher order
> non-compound pages - I should have been more precise in my wording.
> The head pages can already be mapped into the guest.
>
> Treating the head and tail pages consistently would require changing how KVM
> behaves in a situation it supports today (rather than just adding
> support for an unsupported situation). Currently, without this series,
> KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the
> guest. When that happens, KVM sets the A/D flags. I'm not sure whether
> that's actually valid behavior, nor do I know whether anyone actually
> cares about it. But it's what KVM does today, and I would shy away
> from modifying that behavior without good reason.
>
> > >
> > > The bulk of this change is tracking the is_refcounted_page flag so that
> > > non-refcounted pages don't trigger page_count() == 0 warnings. This is
> > > done by storing the flag in an unused bit in the sptes.
> >
> > Also, maybe we should mention this only works on x86-64.
> >
> > >
> > > Signed-off-by: David Stevens <stevensd@chromium.org>
> > > ---
> > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
> > > arch/x86/kvm/mmu/mmu_internal.h | 1 +
> > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
> > > arch/x86/kvm/mmu/spte.c | 4 ++-
> > > arch/x86/kvm/mmu/spte.h | 12 ++++++++-
> > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
> > > 6 files changed, 62 insertions(+), 30 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index e44ab512c3a1..b1607e314497 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> >
> > ...
> >
> > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > > bool host_writable = !fault || fault->map_writable;
> > > bool prefetch = !fault || fault->prefetch;
> > > bool write_fault = fault && fault->write;
> > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> >
> > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
> > practice?
>
> Prefetching is still done via gfn_to_page_many_atomic, which sets
> FOLL_GET. That's fixable, but it's not something this series currently
> does.
So if we prefetch a page, the REFCOUNTED bit is cleared unconditionally by this
hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched
sptes. If I read the patch correctly, the REFCOUNTED bit in the SPTE should
represent whether the corresponding page is ref-countable or not, right?
Because direct_pte_prefetch_many() is for the legacy KVM MMU and FNAME(prefetch_pte)
is for shadow paging, we need to test with the legacy KVM MMU or shadow paging to hit
the issue, though.
Thanks,
On Fri, Jul 7, 2023 at 12:58 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
>
> On Thu, Jul 06, 2023 at 01:52:08PM +0900,
> David Stevens <stevensd@chromium.org> wrote:
>
> > On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> > >
> > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > > > From: David Stevens <stevensd@chromium.org>
> > > >
> > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > > > memory into the guest that is backed by un-refcounted struct pages - for
> > > > example, higher order non-compound pages allocated by the amdgpu driver
> > > > via ttm_pool_alloc_page.
> > >
> > > I guess you mean the tail pages of the higher order non-compound pages?
> > > And as to the head page, it is said to be set to one coincidentally[*],
> > > and shall not be considered as refcounted. IIUC, refcount of this head
> > > page will be increased and decreased soon in hva_to_pfn_remapped(), so
> > > this may not be a problem(?). But treating this head page differently,
> > > as a refcounted one (e.g., to set the A/D flags), is weird.
> > >
> > > Or maybe I missed some context, e.g., can the head page be allocated to
> > > guest at all?
> >
> > Yes, this is to allow mapping the tail pages of higher order
> > non-compound pages - I should have been more precise in my wording.
> > The head pages can already be mapped into the guest.
> >
> > Treating the head and tail pages consistently would require changing how KVM
> > behaves in a situation it supports today (rather than just adding
> > support for an unsupported situation). Currently, without this series,
> > KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the
> > guest. When that happens, KVM sets the A/D flags. I'm not sure whether
> > that's actually valid behavior, nor do I know whether anyone actually
> > cares about it. But it's what KVM does today, and I would shy away
> > from modifying that behavior without good reason.
> >
> > > >
> > > > The bulk of this change is tracking the is_refcounted_page flag so that
> > > > non-refcounted pages don't trigger page_count() == 0 warnings. This is
> > > > done by storing the flag in an unused bit in the sptes.
> > >
> > > Also, maybe we should mention this only works on x86-64.
> > >
> > > >
> > > > Signed-off-by: David Stevens <stevensd@chromium.org>
> > > > ---
> > > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------
> > > > arch/x86/kvm/mmu/mmu_internal.h | 1 +
> > > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++---
> > > > arch/x86/kvm/mmu/spte.c | 4 ++-
> > > > arch/x86/kvm/mmu/spte.h | 12 ++++++++-
> > > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++-------
> > > > 6 files changed, 62 insertions(+), 30 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index e44ab512c3a1..b1607e314497 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > >
> > > ...
> > >
> > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > > > bool host_writable = !fault || fault->map_writable;
> > > > bool prefetch = !fault || fault->prefetch;
> > > > bool write_fault = fault && fault->write;
> > > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> > >
> > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
> > > practice?
> >
> > Prefetching is still done via gfn_to_page_many_atomic, which sets
> > FOLL_GET. That's fixable, but it's not something this series currently
> > does.
>
> So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this
> hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched
> spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent
> whether the corresponding page is ref-countable or not, right?
>
> Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte)
> is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit
> the issue, though.
>
direct_pte_prefetch_many and prefetch_gpte both pass NULL for the
fault parameter, so is_refcounted will evaluate to true. So the spte's
refcounted bit will get set in that case.
-David
On Fri, Jul 07, 2023 at 10:35:02AM +0900,
David Stevens <stevensd@chromium.org> wrote:
> > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > > index e44ab512c3a1..b1607e314497 100644
> > > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > >
> > > > ...
> > > >
> > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > > > > bool host_writable = !fault || fault->map_writable;
> > > > > bool prefetch = !fault || fault->prefetch;
> > > > > bool write_fault = fault && fault->write;
> > > > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> > > >
> > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
> > > > practice?
> > >
> > > Prefetching is still done via gfn_to_page_many_atomic, which sets
> > > FOLL_GET. That's fixable, but it's not something this series currently
> > > does.
> >
> > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this
> > hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched
> > spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent
> > whether the corresponding page is ref-countable or not, right?
> >
> > Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte)
> > is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit
> > the issue, though.
> >
>
> direct_pte_prefetch_many and prefetch_gpte both pass NULL for the
> fault parameter, so is_refcounted will evaluate to true. So the spte's
> refcounted bit will get set in that case.
Oops, my bad. My point is "unconditionally". Is the bit always set for
non-refcountable pages? Or are non-refcountable pages not prefetched?
On Tue, Jul 11, 2023 at 1:34 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote:
>
> On Fri, Jul 07, 2023 at 10:35:02AM +0900,
> David Stevens <stevensd@chromium.org> wrote:
>
> > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > > > index e44ab512c3a1..b1607e314497 100644
> > > > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > >
> > > > > ...
> > > > >
> > > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
> > > > > > bool host_writable = !fault || fault->map_writable;
> > > > > > bool prefetch = !fault || fault->prefetch;
> > > > > > bool write_fault = fault && fault->write;
> > > > > > + bool is_refcounted = !fault || fault->is_refcounted_page;
> > > > >
> > > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in
> > > > > practice?
> > > >
> > > > Prefetching is still done via gfn_to_page_many_atomic, which sets
> > > > FOLL_GET. That's fixable, but it's not something this series currently
> > > > does.
> > >
> > > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this
> > > hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched
> > > spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent
> > > whether the corresponding page is ref-countable or not, right?
> > >
> > > Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte)
> > > is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit
> > > the issue, though.
> > >
> >
> > direct_pte_prefetch_many and prefetch_gpte both pass NULL for the
> > fault parameter, so is_refcounted will evaluate to true. So the spte's
> > refcounted bit will get set in that case.
>
> Oops, my bad. My point is "unconditionally". Is the bit always set for
> non-refcountable pages? Or are non-refcountable pages not prefetched?
The bit is never set for non-refcounted pages, and is always set for
refcounted pages. The current series never prefetches non-refcounted
pages, since it continues to use the gfn_to_page_many_atomic API.
-David
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>
> out_unlock:
> write_unlock(&vcpu->kvm->mmu_lock);
> - kvm_release_pfn_clean(fault->pfn);
> + if (fault->is_refcounted_page)
> + kvm_set_page_accessed(pfn_to_page(fault->pfn));
For a refcounted page, as now KVM puts its ref early in kvm_faultin_pfn(),
should this kvm_set_page_accessed() be placed before unlocking mmu_lock?
Otherwise, if the user unmaps a region (which triggers kvm_unmap_gfn_range()
with mmu_lock held for write) and releases the page, and if those two
steps happen after the page_count() check in kvm_set_page_accessed() but
before mark_page_accessed(), the latter may mark as accessed a page that has
already been released or that no longer belongs to the current process.
Is it true?
> return r;
> }
>
> @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>
> out_unlock:
> read_unlock(&vcpu->kvm->mmu_lock);
> - kvm_release_pfn_clean(fault->pfn);
> + if (fault->is_refcounted_page)
> + kvm_set_page_accessed(pfn_to_page(fault->pfn));
> return r;
> }
Ditto.
On Wed, Jul 19, 2023 at 3:35 PM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
> >
> > out_unlock:
> > write_unlock(&vcpu->kvm->mmu_lock);
> > - kvm_release_pfn_clean(fault->pfn);
> > + if (fault->is_refcounted_page)
> > + kvm_set_page_accessed(pfn_to_page(fault->pfn));
> For a refcounted page, as now KVM puts its ref early in kvm_faultin_pfn(),
> should this kvm_set_page_accessed() be placed before unlocking mmu_lock?
>
> Otherwise, if the user unmaps a region (which triggers kvm_unmap_gfn_range()
> with mmu_lock holding for write), and release the page, and if the two
> steps happen after checking page_count() in kvm_set_page_accessed() and
> before mark_page_accessed(), the latter function may mark accessed to a page
> that is released or does not belong to current process.
>
> Is it true?
Yes, good catch. During some testing last week, I actually found this
bug thanks to the WARN_ON the first patch in this series added to
kvm_is_ad_tracked_page. I'll fix it in the next revision, after Sean
gets a chance to comment on the series.
Thanks,
David
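
(A minimal sketch of the fix being agreed on here, assuming that simply marking the page accessed before dropping mmu_lock is sufficient; direct_page_fault() is shown, and the TDP MMU and FNAME(page_fault) paths would mirror it under their respective unlocks:)

out_unlock:
        /*
         * Mark the page accessed while mmu_lock is still held, so that a
         * racing kvm_unmap_gfn_range() (which takes mmu_lock for write)
         * cannot free the page between the page_count() check and
         * mark_page_accessed().
         */
        if (fault->is_refcounted_page)
                kvm_set_page_accessed(pfn_to_page(fault->pfn));
        write_unlock(&vcpu->kvm->mmu_lock);
        return r;
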
On Thu, Jul 06, 2023, David Stevens wrote:
> On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
> >
> > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote:
> > > From: David Stevens <stevensd@chromium.org>
> > >
> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map
> > > memory into the guest that is backed by un-refcounted struct pages - for
> > > example, higher order non-compound pages allocated by the amdgpu driver
> > > via ttm_pool_alloc_page.
> >
> > I guess you mean the tail pages of the higher order non-compound pages?
> > And as to the head page, it is said to be set to one coincidentally[*],
> > and shall not be considered as refcounted. IIUC, refcount of this head
> > page will be increased and decreased soon in hva_to_pfn_remapped(), so
> > this may not be a problem(?). But treating this head page differently,
> > as a refcounted one (e.g., to set the A/D flags), is weird.
> >
> > Or maybe I missed some context, e.g., can the head page be allocated to
> > guest at all?
>
> Yes, this is to allow mapping the tail pages of higher order
> non-compound pages - I should have been more precise in my wording.
> The head pages can already be mapped into the guest.
Recording for posterity (or to make an incorrect statement and get corrected),
because I recently had a conversation about the head page not actually being
refcounted. (I can't remember with whom I had the conversation, but I'm pretty
sure it wasn't an imaginary friend).
Even though whatever allocates the page doesn't explicitly refcount the head page,
__free_pages() will still do the right thing and keep the head page around
until its last reference is put. And my understanding is that even though it's
a "head" page, it's not a PG_head page, i.e. not a compound page, and so is treated
as an order-0 page when KVM invokes put_page().
void __free_pages(struct page *page, unsigned int order)
{
        /* get PageHead before we drop reference */
        int head = PageHead(page);

        if (put_page_testzero(page))          <=== will evaluate false if KVM holds a ref
                free_the_page(page, order);
        else if (!head)                       <=== will be false for non-compound pages
                while (order-- > 0)
                        free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
@@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
flush = true;
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ if (is_refcounted_page_pte(old_spte))
+ kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
}
if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
flush = true;
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ if (is_refcounted_page_pte(old_spte))
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte)));
}
return flush;
@@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
* before they are reclaimed. Sanity check that, if the pfn is backed
* by a refcounted page, the refcount is elevated.
*/
- page = kvm_pfn_to_refcounted_page(pfn);
- WARN_ON(page && !page_count(page));
+ if (is_refcounted_page_pte(old_spte)) {
+ page = kvm_pfn_to_refcounted_page(pfn);
+ WARN_ON(!page || !page_count(page));
+ }
- if (is_accessed_spte(old_spte))
- kvm_set_pfn_accessed(pfn);
+ if (is_refcounted_page_pte(old_spte)) {
+ if (is_accessed_spte(old_spte))
+ kvm_set_page_accessed(pfn_to_page(pfn));
- if (is_dirty_spte(old_spte))
- kvm_set_pfn_dirty(pfn);
+ if (is_dirty_spte(old_spte))
+ kvm_set_page_dirty(pfn_to_page(pfn));
+ }
return old_spte;
}
@@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep)
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
+ if (is_writable_pte(spte) && is_refcounted_page_pte(spte))
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte)));
spte = mark_spte_for_access_track(spte);
mmu_spte_update_no_track(sptep, spte);
@@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
- if (was_writable && !spte_ad_enabled(*sptep))
- kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+ if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep))
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep)));
return was_writable;
}
@@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
bool host_writable = !fault || fault->map_writable;
bool prefetch = !fault || fault->prefetch;
bool write_fault = fault && fault->write;
+ bool is_refcounted = !fault || fault->is_refcounted_page;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
- true, host_writable, &spte);
+ true, host_writable, is_refcounted, &spte);
if (*sptep == spte) {
ret = RET_PF_SPURIOUS;
@@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct kvm_follow_pfn foll = {
.slot = slot,
.gfn = fault->gfn,
- .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0),
+ .flags = fault->write ? FOLL_WRITE : 0,
.allow_write_mapping = true,
+ .guarded_by_mmu_notifier = true,
};
/*
@@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
fault->slot = NULL;
fault->pfn = KVM_PFN_NOSLOT;
fault->map_writable = false;
+ fault->is_refcounted_page = false;
return RET_PF_CONTINUE;
}
/*
@@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
success:
fault->hva = foll.hva;
fault->map_writable = foll.writable;
+ fault->is_refcounted_page = foll.is_refcounted_page;
return RET_PF_CONTINUE;
}
@@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
+ if (fault->is_refcounted_page)
+ kvm_set_page_accessed(pfn_to_page(fault->pfn));
return r;
}
@@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
out_unlock:
read_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
+ if (fault->is_refcounted_page)
+ kvm_set_page_accessed(pfn_to_page(fault->pfn));
return r;
}
#endif
@@ -240,6 +240,7 @@ struct kvm_page_fault {
kvm_pfn_t pfn;
hva_t hva;
bool map_writable;
+ bool is_refcounted_page;
/*
* Indicates the guest is trying to write a gfn that contains one or
@@ -829,7 +829,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(fault->pfn);
+ if (fault->is_refcounted_page)
+ kvm_set_page_accessed(pfn_to_page(fault->pfn));
return r;
}
@@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
*/
static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- bool host_writable;
+ bool host_writable, is_refcounted;
gpa_t first_pte_gpa;
u64 *sptep, spte;
struct kvm_memory_slot *slot;
@@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
sptep = &sp->spt[i];
spte = *sptep;
host_writable = spte & shadow_host_writable_mask;
+ // TODO: is this correct?
+ is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
make_spte(vcpu, sp, slot, pte_access, gfn,
spte_to_pfn(spte), spte, true, false,
- host_writable, &spte);
+ host_writable, is_refcounted, &spte);
return mmu_spte_update(sptep, spte);
}
@@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
- bool host_writable, u64 *new_spte)
+ bool host_writable, bool is_refcounted, u64 *new_spte)
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
@@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
+ else if (is_refcounted)
+ spte |= SPTE_MMU_PAGE_REFCOUNTED;
if (shadow_memtype_mask)
spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
@@ -95,6 +95,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
/* Defined only to keep the above static asserts readable. */
#undef SHADOW_ACC_TRACK_SAVED_MASK
+/*
+ * Indicates that the SPTE refers to a page with a valid refcount.
+ */
+#define SPTE_MMU_PAGE_REFCOUNTED BIT_ULL(59)
+
/*
* Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
* the memslots generation and is derived as follows:
@@ -332,6 +337,11 @@ static inline bool is_dirty_spte(u64 spte)
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
+static inline bool is_refcounted_page_pte(u64 spte)
+{
+ return spte & SPTE_MMU_PAGE_REFCOUNTED;
+}
+
static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
int level)
{
@@ -462,7 +472,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
- bool host_writable, u64 *new_spte);
+ bool host_writable, bool is_refcounted, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
union kvm_mmu_page_role role, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
@@ -474,6 +474,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+ bool is_refcounted = is_refcounted_page_pte(old_spte);
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
@@ -538,9 +539,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (is_leaf != was_leaf)
kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
- if (was_leaf && is_dirty_spte(old_spte) &&
+ if (was_leaf && is_dirty_spte(old_spte) && is_refcounted &&
(!is_present || !is_dirty_spte(new_spte) || pfn_changed))
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte)));
/*
* Recursively handle child PTs if the change removed a subtree from
@@ -552,9 +553,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
- if (was_leaf && is_accessed_spte(old_spte) &&
+ if (was_leaf && is_accessed_spte(old_spte) && is_refcounted &&
(!is_present || !is_accessed_spte(new_spte) || pfn_changed))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
}
/*
@@ -988,8 +989,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
- fault->pfn, iter->old_spte, fault->prefetch, true,
- fault->map_writable, &new_spte);
+ fault->pfn, iter->old_spte, fault->prefetch, true,
+ fault->map_writable, fault->is_refcounted_page,
+ &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
@@ -1205,8 +1207,9 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(iter->old_spte))
- kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
+ if (is_writable_pte(iter->old_spte) &&
+ is_refcounted_page_pte(iter->old_spte))
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(iter->old_spte)));
new_spte = mark_spte_for_access_track(iter->old_spte);
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
@@ -1626,7 +1629,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
iter.old_spte,
iter.old_spte & ~dbit);
- kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
+ if (is_refcounted_page_pte(iter.old_spte))
+ kvm_set_page_dirty(pfn_to_page(spte_to_pfn(iter.old_spte)));
}
rcu_read_unlock();