@@ -106,9 +106,11 @@ KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL(link_private_spt)
KVM_X86_OP_OPTIONAL(free_private_spt)
KVM_X86_OP_OPTIONAL(split_private_spt)
+KVM_X86_OP_OPTIONAL(merge_private_spt)
KVM_X86_OP_OPTIONAL(set_private_spte)
KVM_X86_OP_OPTIONAL(remove_private_spte)
KVM_X86_OP_OPTIONAL(zap_private_spte)
+KVM_X86_OP_OPTIONAL(unzap_private_spte)
KVM_X86_OP(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
KVM_X86_OP(get_l2_tsc_multiplier)
@@ -147,6 +147,7 @@
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
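+/* Mask to round a gfn down to the head gfn of a level-x huge page. */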
+#define KVM_HPAGE_GFN_MASK(x) (~((1UL << KVM_HPAGE_GFN_SHIFT(x)) - 1))
#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
@@ -1768,11 +1769,14 @@ struct kvm_x86_ops {
void *private_spt);
int (*split_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
void *private_spt);
+ int (*merge_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
+ void *private_spt);
int (*set_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn);
int (*remove_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn);
int (*zap_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level);
+ int (*unzap_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level);
bool (*has_wbinvd_exit)(void);
@@ -71,6 +71,14 @@ tdp_ptep_t spte_to_child_pt(u64 spte, int level)
return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
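+/* Common body of try_step_down() and tdp_iter_step_down(). */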
+static void step_down(struct tdp_iter *iter, tdp_ptep_t child_pt)
+{
+ iter->level--;
+ iter->pt_path[iter->level - 1] = child_pt;
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+}
+
/*
* Steps down one level in the paging structure towards the goal GFN. Returns
* true if the iterator was able to step down a level, false otherwise.
@@ -92,14 +100,28 @@ static bool try_step_down(struct tdp_iter *iter)
if (!child_pt)
return false;
- iter->level--;
- iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
- tdp_iter_refresh_sptep(iter);
-
+ step_down(iter, child_pt);
return true;
}
+/* Steps down for a frozen spte. Don't re-read the sptep because it is frozen. */
+void tdp_iter_step_down(struct tdp_iter *iter, tdp_ptep_t child_pt)
+{
+ WARN_ON_ONCE(!child_pt);
+ WARN_ON_ONCE(iter->yielded);
+ WARN_ON_ONCE(iter->level == iter->min_level);
+
+ step_down(iter, child_pt);
+}
+
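+/*
+ * Steps to the next entry at the current level. The caller must ensure that
+ * the iterator is not already at the last entry of the page table.
+ */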
+void tdp_iter_step_side(struct tdp_iter *iter)
+{
+ iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+ iter->next_last_level_gfn = iter->gfn;
+ iter->sptep++;
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+}
+
/*
* Steps to the next entry in the current page table, at the current page table
* level. The next entry could point to a page backing guest memory or another
@@ -117,10 +139,7 @@ static bool try_step_side(struct tdp_iter *iter)
(SPTE_ENT_PER_PAGE - 1))
return false;
- iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
- iter->next_last_level_gfn = iter->gfn;
- iter->sptep++;
- iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+ tdp_iter_step_side(iter);
return true;
}
@@ -134,6 +134,8 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_side(struct tdp_iter *iter);
+void tdp_iter_step_down(struct tdp_iter *iter, tdp_ptep_t child_pt);
static inline union kvm_mmu_page_role tdp_iter_child_role(struct tdp_iter *iter)
{
@@ -1205,6 +1205,180 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm, bool skip_private)
}
}
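+/*
+ * Advances the child index and steps the iterator sideways, taking care not
+ * to read past the last entry of the page table.
+ */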
+static int tdp_mmu_iter_step_side(int i, struct tdp_iter *iter)
+{
+ i++;
+
+ /*
+ * If i == SPTE_ENT_PER_PAGE, calling tdp_iter_step_side() would read
+ * the entry beyond the last one in the page table.
+ */
+ if (i < SPTE_ENT_PER_PAGE)
+ tdp_iter_step_side(iter);
+
+ return i;
+}
+
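+/*
+ * Merges small private pages into a large page: freeze the parent spte,
+ * populate all child sptes, zap and merge the Secure-EPT entries, then
+ * install the large spte.
+ */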
+static int tdp_mmu_merge_private_spt(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter, u64 new_spte)
+{
+ u64 *sptep = rcu_dereference(iter->sptep);
+ u64 old_spte = iter->old_spte;
+ struct kvm_mmu_page *child_sp;
+ struct kvm *kvm = vcpu->kvm;
+ struct tdp_iter child_iter;
+ int level = iter->level;
+ gfn_t gfn = iter->gfn;
+ tdp_ptep_t child_pt;
+ u64 child_spte;
+ int ret = 0;
+ int i;
+
+ /*
+ * TDX KVM supports only 2MB large pages. Merging 2MB pages into a
+ * 1GB page is not supported yet.
+ */
+ WARN_ON_ONCE(fault->goal_level != PG_LEVEL_2M);
+ WARN_ON_ONCE(iter->level != PG_LEVEL_2M);
+ WARN_ON_ONCE(!is_large_pte(new_spte));
+
+ /* Freeze the spte to prevent other threads from working on it. */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, REMOVED_SPTE))
+ return -EBUSY;
+
+ /*
+ * Step down to the child spte. Because tdp_iter_next() assumes the
+ * parent spte isn't frozen, do it manually.
+ */
+ child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+ child_sp = sptep_to_sp(child_pt);
+ WARN_ON_ONCE(child_sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(!kvm_mmu_page_role_is_private(child_sp->role));
+
+ /* Don't modify iter as the caller will use it after this function returns. */
+ child_iter = *iter;
+ /* Adjust the target gfn to the head gfn of the large page. */
+ child_iter.next_last_level_gfn &= -KVM_PAGES_PER_HPAGE(level);
+ tdp_iter_step_down(&child_iter, child_pt);
+
+ /*
+ * All child pages must be populated before they can be merged into a
+ * large page. Populate all child sptes.
+ */
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i = tdp_mmu_iter_step_side(i, &child_iter)) {
+ int tmp;
+
+ WARN_ON_ONCE(child_iter.level != PG_LEVEL_4K);
+
+ if (is_shadow_present_pte(child_iter.old_spte)) {
+ /* TODO: relocate page for huge page. */
+ if (WARN_ON_ONCE(spte_to_pfn(child_iter.old_spte) !=
+ spte_to_pfn(new_spte) + i)) {
+ if (!ret)
+ ret = -EAGAIN;
+ continue;
+ }
+ /*
+ * This case can happen when SEPT_VE_DISABLE=true and
+ * the page state is pending. Just resume the vcpu and
+ * expect another vcpu to accept this page.
+ */
+ if (child_iter.gfn == fault->gfn) {
+ if (!ret)
+ ret = -EAGAIN;
+ }
+ continue;
+ }
+
+ child_spte = make_huge_page_split_spte(kvm, new_spte, child_sp->role, i);
+ /*
+ * Because another thread may have started operating on this spte
+ * before the parent spte was frozen, use the atomic version to
+ * prevent a race.
+ */
+ tmp = tdp_mmu_set_spte_atomic(vcpu->kvm, &child_iter, child_spte);
+ if (tmp == -EBUSY || tmp == -EAGAIN) {
+ /*
+ * There was a race. Keep populating the remaining 4K
+ * sptes so that fault->gfn gets resolved, guaranteeing
+ * forward progress.
+ */
+ if (!ret)
+ ret = tmp;
+ } else if (tmp) {
+ ret = tmp;
+ goto out;
+ }
+ }
+ if (ret)
+ goto out;
+
+ /* Prevent the Secure-EPT entry from being used. */
+ ret = static_call(kvm_x86_zap_private_spte)(kvm, gfn, level);
+ if (ret)
+ goto out;
+ kvm_flush_remote_tlbs_range(kvm, gfn & KVM_HPAGE_GFN_MASK(level),
+ KVM_PAGES_PER_HPAGE(level));
+
+ /* Merge pages into a large page. */
+ ret = static_call(kvm_x86_merge_private_spt)(kvm, gfn, level,
+ kvm_mmu_private_spt(child_sp));
+ /*
+ * Failed to merge the pages because some are accepted and some are
+ * pending. Since the child pages were mapped above, let the vcpu run.
+ */
+ if (ret) {
+ if (static_call(kvm_x86_unzap_private_spte)(kvm, gfn, level))
+ old_spte = SHADOW_NONPRESENT_VALUE |
+ (spte_to_pfn(old_spte) << PAGE_SHIFT) |
+ PT_PAGE_SIZE_MASK;
+ goto out;
+ }
+
+ /* Update stats manually as we don't use tdp_mmu_set_spte{, _atomic}(). */
+ kvm_update_page_stats(kvm, level - 1, -SPTE_ENT_PER_PAGE);
+ kvm_update_page_stats(kvm, level, 1);
+
+ /* Unfreeze spte. */
+ iter->old_spte = new_spte;
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+
+ /*
+ * Free the unused child sp. The Secure-EPT page was already freed at
+ * the TDX level by kvm_x86_merge_private_spt().
+ */
+ tdp_unaccount_mmu_page(kvm, child_sp);
+ tdp_mmu_free_sp(child_sp);
+ return 0;
+
+out:
+ iter->old_spte = old_spte;
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ return ret;
+}
+
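+/*
+ * Sets the spte for the target level, merging existing smaller private
+ * mappings into a large page when possible.
+ */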
+static int __tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter, u64 new_spte)
+{
+ /*
+ * The private page is mapped with smaller pages. For example, the
+ * child pages were converted from shared to private, and the range
+ * can now be mapped as a large page. Try to merge the small pages
+ * into a large page.
+ */
+ if (fault->slot &&
+ kvm_gfn_shared_mask(vcpu->kvm) &&
+ iter->level > PG_LEVEL_4K &&
+ kvm_is_private_gpa(vcpu->kvm, fault->addr) &&
+ is_shadow_present_pte(iter->old_spte) &&
+ !is_large_pte(iter->old_spte))
+ return tdp_mmu_merge_private_spt(vcpu, fault, iter, new_spte);
+
+ return tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte);
+}
+
/*
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
@@ -1246,7 +1420,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (__tdp_mmu_map_handle_target_level(vcpu, fault, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))