@@ -204,7 +204,9 @@ must also set up an MSR filter to process writes to MSR_KVM_MIGRATION_CONTROL.
The hypercall lets a guest request memory permissions to be removed for itself,
identified with set of physical page ranges (GFNs). The HEKI_ATTR_MEM_NOWRITE
-memory page range attribute forbids related modification to the guest.
+memory page range attribute forbids related modification to the guest. The
+HEKI_ATTR_MEM_EXEC attribute allows execution of the specified pages while
+removing execute permission from all other pages.
Returns 0 on success or a KVM error code otherwise.
@@ -3313,7 +3313,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
{
if (fault->exec)
- return is_executable_pte(spte);
+ return is_executable_pte(spte, !fault->user);
if (fault->write)
return is_writable_pte(spte);
@@ -5602,6 +5602,39 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
+ /* Skips real page faults if not needed. */
+ if ((error_code & PFERR_FETCH_MASK) &&
+ !kvm_heki_is_exec_allowed(vcpu, cr2_or_gpa)) {
+ /*
+ * TODO: To avoid kvm_heki_is_exec_allowed() call, check
+ * enable_mbec and EPT_VIOLATION_KERNEL_INSTR, see
+ * handle_ept_violation().
+ */
+ struct x86_exception fault = {
+ .vector = PF_VECTOR,
+ .error_code_valid = true,
+ .error_code = error_code,
+ .nested_page_fault = false,
+ /*
+ * TODO: This kind of kernel page fault needs to be handled by
+ * the guest, which is not currently the case, making it try
+ * again and again.
+ *
+ * You may want to test with cr2_or_gva to see the page
+ * fault caught by the guest kernel (thinking it is a
+ * user space fault).
+ */
+ .address = static_call(kvm_x86_fault_gva)(vcpu),
+ .async_page_fault = false,
+ };
+
+ pr_warn_ratelimited(
+ "heki-kvm: Creating fetch #PF at 0x%016llx\n",
+ fault.address);
+ kvm_inject_page_fault(vcpu, &fault);
+ return RET_PF_INVALID;
+ }
+
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
@@ -342,7 +342,8 @@ TRACE_EVENT(
__field(u8, level)
/* These depend on page entry type, so compute them now. */
__field(bool, r)
- __field(bool, x)
+ __field(bool, kx)
+ __field(bool, ux)
__field(signed char, u)
),
@@ -352,15 +353,17 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
+ __entry->kx = is_executable_pte(__entry->spte, true);
+ __entry->ux = is_executable_pte(__entry->spte, false);
__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
),
- TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
__entry->gfn, __entry->spte,
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
- __entry->x ? "x" : "-",
+ __entry->kx ? "X" : "-",
+ __entry->ux ? "x" : "-",
__entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
__entry->level, __entry->sptep
)
@@ -184,10 +184,25 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access &= ~ACC_EXEC_MASK;
}
- if (pte_access & ACC_EXEC_MASK)
+ if (pte_access & ACC_EXEC_MASK) {
spte |= shadow_x_mask;
- else
+#ifdef CONFIG_HEKI
+ /*
+ * FIXME: Race condition (at boot) if no
+ * lockdep_assert_held_write(vcpu->kvm->mmu_lock);
+ */
+ if (READ_ONCE(vcpu->kvm->heki_kernel_exec_locked)) {
+ if (!heki_exec_is_allowed(vcpu->kvm, gfn))
+ spte &= ~VMX_EPT_EXECUTABLE_MASK;
+ else
+ pr_warn("heki-kvm: Allowing kernel execution "
+ "for GFN 0x%llx\n",
+ gfn);
+ }
+#endif /* CONFIG_HEKI */
+ } else {
spte |= shadow_nx_mask;
+ }
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
@@ -3,7 +3,10 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
+#include <asm/vmx.h>
+
#include "mmu_internal.h"
+#include "../vmx/vmx.h"
/*
* A MMU present SPTE is backed by actual memory and may or may not be present
@@ -307,9 +310,17 @@ static inline bool is_last_spte(u64 pte, int level)
return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
+/*
+ * Return true if @spte grants execute permission.
+ *
+ * With MBEC (mode-based execute control) enabled, the EPT entry tracks
+ * kernel-mode and user-mode execute permission in separate bits, so the
+ * caller selects which mode it is asking about via @for_kernel_mode.
+ * Without MBEC, the single execute bit answers for both modes.
+ *
+ * NOTE(review): VMX_EPT_USER_EXECUTABLE_MASK / VMX_EPT_EXECUTABLE_MASK are
+ * VMX-specific, yet spte.h is common MMU code — presumably enable_mbec can
+ * never be set on SVM or with shadow paging; confirm.
+ */
-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool for_kernel_mode)
{
-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+	u64 x_mask = shadow_x_mask;
+
+	if (enable_mbec) {
+		if (for_kernel_mode)
+			x_mask &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+		else
+			x_mask &= ~VMX_EPT_EXECUTABLE_MASK;
+	}
+	return (spte & (x_mask | shadow_nx_mask)) == x_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
@@ -7,7 +7,10 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include "../x86.h"
+
#include <asm/cmpxchg.h>
+#include <asm/vmx.h>
#include <trace/events/kvm.h>
static bool __read_mostly tdp_mmu_enabled = true;
@@ -1021,6 +1024,76 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
}
}
+#ifdef CONFIG_HEKI
+
+/*
+ * Walk every leaf SPTE of every TDP MMU root and clear kernel-mode execute
+ * permission on pages outside the guest-registered executable ranges
+ * (see heki_exec_is_allowed()).  Also latches heki_kernel_exec_locked so
+ * that make_spte() applies the same policy to SPTEs created later.
+ *
+ * Requires the TDP MMU and MBEC; no-op (with a warning) otherwise.
+ *
+ * TODO: Handle TLB flush?
+ */
+void kvm_heki_fix_all_ept_exec_perm(struct kvm *const kvm)
+{
+	int i;
+	struct kvm_mmu_page *root;
+	const gfn_t start = 0;
+	const gfn_t end = tdp_mmu_max_gfn_exclusive();
+
+	if (WARN_ON_ONCE(!is_tdp_mmu_enabled(kvm)))
+		return;
+
+	if (WARN_ON_ONCE(!enable_mbec))
+		return;
+
+	write_lock(&kvm->mmu_lock);
+
+	/*
+	 * Because heki_kernel_exec_locked is only set with this code, it
+	 * cannot be unlocked.  This is protected against race condition
+	 * thanks to mmu_lock.
+	 */
+	WRITE_ONCE(kvm->heki_kernel_exec_locked, true);
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		for_each_tdp_mmu_root(kvm, root, i) {
+			struct tdp_iter iter;
+
+			WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+			/*
+			 * TODO: Make sure
+			 * !is_shadow_present_pte()/SPTE_MMU_PRESENT_MASK are
+			 * well handled when they are present.
+			 */
+
+			rcu_read_lock();
+			tdp_root_for_each_leaf_pte(iter, root, start, end) {
+				u64 new_spte;
+
+				/*
+				 * Per-GFN messages are pr_debug(): one line
+				 * per mapped page would flood the log, and
+				 * user-visible strings must not be split
+				 * across lines (kernel coding style).
+				 */
+				if (heki_exec_is_allowed(kvm, iter.gfn)) {
+					pr_debug("heki-kvm: Allowing kernel execution for GFN 0x%llx\n",
+						 iter.gfn);
+					continue;
+				}
+				pr_debug("heki-kvm: Denying kernel execution for GFN 0x%llx\n",
+					 iter.gfn);
+
+retry:
+				/*
+				 * VMX_EPT_EXECUTABLE_MASK is VMX-specific;
+				 * safe here because enable_mbec was checked
+				 * above, so this never runs on SVM.
+				 */
+				new_spte = iter.old_spte &
+					   ~VMX_EPT_EXECUTABLE_MASK;
+				if (new_spte == iter.old_spte)
+					continue;
+
+				/* Raced with a concurrent update: retry. */
+				if (tdp_mmu_set_spte_atomic(kvm, &iter,
+							    new_spte))
+					goto retry;
+			}
+			rcu_read_unlock();
+		}
+	}
+	write_unlock(&kvm->mmu_lock);
+	pr_warn("heki-kvm: Locked executable kernel memory\n");
+}
+
+#endif /* CONFIG_HEKI */
+
/*
* Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
* zap" completes.
@@ -24,6 +24,10 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
+#ifdef CONFIG_HEKI
+void kvm_heki_fix_all_ept_exec_perm(struct kvm *const kvm);
+#endif /* CONFIG_HEKI */
+
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
@@ -20,6 +20,7 @@
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
+#include "mmu/tdp_mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
@@ -31,6 +32,7 @@
#include "lapic.h"
#include "xen.h"
#include "smm.h"
+#include "vmx/capabilities.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -9705,6 +9707,45 @@ heki_page_track_prewrite(struct kvm_vcpu *const vcpu, const gpa_t gpa,
return true;
}
+/*
+ * Return true if @gfn falls inside one of the guest-registered executable
+ * ranges, or if execution has not been locked down yet.
+ */
+bool heki_exec_is_allowed(const struct kvm *const kvm, const gfn_t gfn)
+{
+	unsigned int i;
+
+	if (!READ_ONCE(kvm->heki_kernel_exec_locked))
+		return true;
+
+	/*
+	 * heki_gfn_exec_last is initialized with (HEKI_GFN_MAX + 1) and
+	 * counts down as slots are reserved from the top of heki_gfn_exec:
+	 * a counter value C <= HEKI_GFN_MAX means slots
+	 * [C - 1, HEKI_GFN_MAX - 1] are reserved, and 0 means the whole
+	 * array is in use (see heki_lock_mem_page_ranges()).
+	 */
+	i = atomic_read(&kvm->heki_gfn_exec_last);
+	if (i > HEKI_GFN_MAX)
+		return false;	/* No executable range registered. */
+
+	/*
+	 * Fixed: the previous loop walked *down* from the counter and so
+	 * only ever inspected the most recently registered range before
+	 * breaking on an unused slot; walk up to HEKI_GFN_MAX - 1 so every
+	 * reserved slot is checked.
+	 */
+	for (i = i ? i - 1 : 0; i < HEKI_GFN_MAX; i++) {
+		/* Ignores slots reserved but not fully written yet. */
+		if (kvm->heki_gfn_exec[i].end == 0)
+			continue;
+
+		if (gfn >= kvm->heki_gfn_exec[i].start &&
+		    gfn <= kvm->heki_gfn_exec[i].end) {
+			/* TODO: Opportunistically shrink heki_gfn_exec. */
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * GPA-based wrapper around heki_exec_is_allowed() for callers (page-fault
+ * path) that have a guest physical address rather than a GFN.
+ */
+bool kvm_heki_is_exec_allowed(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	return heki_exec_is_allowed(vcpu->kvm, gpa_to_gfn(gpa));
+}
+
static int kvm_heki_init_vm(struct kvm *const kvm)
{
struct kvm_page_track_notifier_node *const node =
@@ -9733,6 +9774,7 @@ static int heki_lock_mem_page_ranges(struct kvm *const kvm, gpa_t mem_ranges,
int err;
size_t i, ranges_num;
struct heki_pa_range *ranges;
+ bool has_exec_restriction = false;
if (mem_ranges_size > HEKI_PA_RANGE_MAX_SIZE)
return -KVM_E2BIG;
@@ -9752,7 +9794,8 @@ static int heki_lock_mem_page_ranges(struct kvm *const kvm, gpa_t mem_ranges,
ranges_num = mem_ranges_size / sizeof(struct heki_pa_range);
for (i = 0; i < ranges_num; i++) {
- const u64 attributes_mask = HEKI_ATTR_MEM_NOWRITE;
+ const u64 attributes_mask = HEKI_ATTR_MEM_NOWRITE |
+ HEKI_ATTR_MEM_EXEC;
const gfn_t gfn_start = ranges[i].gfn_start;
const gfn_t gfn_end = ranges[i].gfn_end;
const u64 attributes = ranges[i].attributes;
@@ -9799,11 +9842,52 @@ static int heki_lock_mem_page_ranges(struct kvm *const kvm, gpa_t mem_ranges,
kvm, gfn, KVM_PAGE_TRACK_PREWRITE));
}
- pr_warn("heki-kvm: Locking GFN 0x%llx-0x%llx with %s\n",
+ /*
+ * Allow-list for execute permission,
+ * see kvm_heki_fix_all_ept_exec_perm().
+ */
+ if (attributes & HEKI_ATTR_MEM_EXEC) {
+ size_t gfn_i;
+
+ if (!enable_mbec) {
+ /*
+ * Guests can check for MBEC support to avoid
+ * such error by not using HEKI_ATTR_MEM_EXEC.
+ */
+ err = -KVM_EOPNOTSUPP;
+ pr_warn("heki-kvm: HEKI_ATTR_MEM_EXEC "
+ "depends on MBEC, which is disabled.");
+ /*
+ * We should continue partially applying
+ * restrictions, but it is useful for this RFC
+ * to exit early in case of missing MBEC
+ * support.
+ */
+ goto out_free_ranges;
+ }
+
+ has_exec_restriction = true;
+ gfn_i = atomic_dec_if_positive(
+ &kvm->heki_gfn_exec_last);
+ if (gfn_i == 0) {
+ err = -KVM_E2BIG;
+ goto out_free_ranges;
+ }
+
+ gfn_i--;
+ kvm->heki_gfn_exec[gfn_i].start = gfn_start;
+ kvm->heki_gfn_exec[gfn_i].end = gfn_end;
+ }
+
+ pr_warn("heki-kvm: Locking GFN 0x%llx-0x%llx with %s%s\n",
gfn_start, gfn_end,
- (attributes & HEKI_ATTR_MEM_NOWRITE) ? "[nowrite]" : "");
+ (attributes & HEKI_ATTR_MEM_NOWRITE) ? "[nowrite]" : "",
+ (attributes & HEKI_ATTR_MEM_EXEC) ? "[exec]" : "");
}
+ if (has_exec_restriction)
+ kvm_heki_fix_all_ept_exec_perm(kvm);
+
out_free_ranges:
kfree(ranges);
return err;
@@ -282,6 +282,8 @@ int heki_check_cr(const struct kvm *kvm, unsigned long cr, unsigned long val);
bool kvm_heki_is_exec_allowed(struct kvm_vcpu *vcpu, gpa_t gpa);
+bool heki_exec_is_allowed(const struct kvm *const kvm, const gfn_t gfn);
+
#else /* CONFIG_HEKI */
static inline int heki_check_cr(const struct kvm *const kvm,
@@ -290,6 +292,11 @@ static inline int heki_check_cr(const struct kvm *const kvm,
return 0;
}
+/* Without CONFIG_HEKI, guest execution is never restricted. */
+static inline bool kvm_heki_is_exec_allowed(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	return true;
+}
#endif /* CONFIG_HEKI */
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
@@ -812,9 +812,13 @@ struct kvm {
#define HEKI_GFN_MAX 16
atomic_t heki_gfn_no_write_num;
struct heki_gfn_range heki_gfn_no_write[HEKI_GFN_MAX];
+ atomic_t heki_gfn_exec_last;
+ struct heki_gfn_range heki_gfn_exec[HEKI_GFN_MAX];
atomic_long_t heki_pinned_cr0;
atomic_long_t heki_pinned_cr4;
+
+ bool heki_kernel_exec_locked;
#endif /* CONFIG_HEKI */
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
@@ -1232,6 +1232,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
#ifdef CONFIG_HEKI
atomic_set(&kvm->heki_gfn_no_write_num, HEKI_GFN_MAX + 1);
+ atomic_set(&kvm->heki_gfn_exec_last, HEKI_GFN_MAX + 1);
#endif /* CONFIG_HEKI */
preempt_notifier_inc();