@@ -1252,6 +1252,441 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
}
}
+#include <linux/random.h> /* get_random_u32() */
+
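+// The test breakpoints on several symbols in the guest kernel. They are not
+// exported, so a loader script passes their addresses in as module
+// parameters; each comment below shows the readelf command that extracts the
+// address from the guest's vmlinux (assuming the guest boots without KASLR,
+// so link-time addresses match runtime ones). The commit_asm_exc_* symbols
+// are added by the entry-code changes under test and are not present in a
+// stock kernel.
+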
+// readelf -W -s vmlinux | awk '/ asm_exc_double_fault$/ { print $2}'
+static unsigned long asm_exc_df_handler;
+module_param(asm_exc_df_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ asm_exc_nmi$/ { print $2}'
+static unsigned long asm_exc_nmi_handler;
+module_param(asm_exc_nmi_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ asm_exc_debug$/ { print $2}'
+static unsigned long asm_exc_db_handler;
+module_param(asm_exc_db_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ asm_exc_machine_check$/ { print $2}'
+static unsigned long asm_exc_mce_handler;
+module_param(asm_exc_mce_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ exc_nmi$/ { print $2}'
+static unsigned long exc_nmi_handler;
+module_param(exc_nmi_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ exc_debug$/ { print $2}'
+static unsigned long exc_db_kernel_handler;
+module_param(exc_db_kernel_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ noist_exc_debug$/ { print $2}'
+static unsigned long exc_db_user_handler;
+module_param(exc_db_user_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ exc_machine_check$/ { print $2}'
+static unsigned long exc_mce_kernel_handler;
+module_param(exc_mce_kernel_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ noist_exc_machine_check$/ { print $2}'
+static unsigned long exc_mce_user_handler;
+module_param(exc_mce_user_handler, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ commit_asm_exc_nmi$/ { print $2}'
+static unsigned long asm_exc_nmi_commit_ip;
+module_param(asm_exc_nmi_commit_ip, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ commit_asm_exc_debug$/ { print $2}'
+static unsigned long asm_exc_db_commit_ip;
+module_param(asm_exc_db_commit_ip, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ commit_asm_exc_machine_check$/ { print $2}'
+static unsigned long asm_exc_mce_commit_ip;
+module_param(asm_exc_mce_commit_ip, ulong, S_IRUGO);
+
+// readelf -W -s vmlinux | awk '/ native_irq_return_iret$/ { print $2}'
+static unsigned long native_irq_return_iret_ip;
+module_param(native_irq_return_iret_ip, ulong, S_IRUGO);
+
+#define MAX_NESTING 50
+#define IST_START_RATE 512 /* 1-in-512 chance per vm-exit of queuing an IST exception when none is pending */
+#define IST_NESTED_RATE 256 /* 1-in-256 chance per single-stepped instruction of nesting another IST exception */
+
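+/*
+ * Bookkeeping for the emulated IST nesting: pending_vector[] and
+ * ist_interrupted_rip[] form a stack of in-flight IST exceptions indexed by
+ * ist_depth. ist_atomic_depth counts exceptions queued while a handler is
+ * still in its atomic (pre-commit) stage, and the *_blocking flags mark
+ * vectors whose IST stack still holds an uncommitted frame.
+ */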
+static int instruction_count;
+static int ist_depth, ist_atomic_depth;
+static int ist_nmi_blocking, ist_db_blocking, ist_mce_blocking;
+static u8 pending_vector[MAX_NESTING];
+static unsigned long ist_interrupted_rip[MAX_NESTING];
+static unsigned long commit_ip;
+
+#define TEST_STOPPED 0
+#define TEST_REQUESTED 1
+#define TEST_ONGOING 2
+static int ist_emul_test = TEST_STOPPED;
+// Set ist_emul_test to 1 (TEST_REQUESTED) when loading the module to enable
+// the test, from a script that also sets all of the symbol addresses above.
+module_param(ist_emul_test, int, S_IRUGO);
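+//
+// Illustrative loader script (hypothetical paths and shell helper; assumes a
+// nokaslr guest vmlinux built with the entry-code changes under test):
+//
+//   V=/path/to/guest/vmlinux
+//   a() { readelf -W -s "$V" | awk "/ $1\$/ { print \"0x\" \$2 }"; }
+//   modprobe kvm_intel ist_emul_test=1 \
+//       asm_exc_df_handler=$(a asm_exc_double_fault) \
+//       asm_exc_nmi_handler=$(a asm_exc_nmi) \
+//       asm_exc_db_handler=$(a asm_exc_debug) \
+//       asm_exc_mce_handler=$(a asm_exc_machine_check) \
+//       exc_nmi_handler=$(a exc_nmi) \
+//       exc_db_kernel_handler=$(a exc_debug) \
+//       exc_db_user_handler=$(a noist_exc_debug) \
+//       exc_mce_kernel_handler=$(a exc_machine_check) \
+//       exc_mce_user_handler=$(a noist_exc_machine_check) \
+//       asm_exc_nmi_commit_ip=$(a commit_asm_exc_nmi) \
+//       asm_exc_db_commit_ip=$(a commit_asm_exc_debug) \
+//       asm_exc_mce_commit_ip=$(a commit_asm_exc_machine_check) \
+//       native_irq_return_iret_ip=$(a native_irq_return_iret)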
+
+static bool test_too_early_nesting_bug;
+module_param(test_too_early_nesting_bug, bool, S_IRUGO);
+
+extern int ist_emul_test_singlestep;
+
+static bool ist_emul_injecting;
+static bool ist_emul_returning;
+
+static void print_pending_vectors(void)
+{
+ int i;
+
+ pr_cont("current vectors:");
+ for (i = 0; i < ist_depth; i++) {
+ switch (pending_vector[i]) {
+ case MC_VECTOR: pr_cont(" #MC"); break;
+ case DB_VECTOR: pr_cont(" #DB"); break;
+		case NMI_VECTOR: pr_cont(" NMI"); break;
+ default: WARN_ON(1);
+ }
+ }
+ pr_cont(";\n");
+}
+
+#define DR7_EN(i, type, len) (((((type) | (len)) & 0xf) << (DR_CONTROL_SHIFT + (i) * DR_CONTROL_SIZE)) | \
+			      (DR_GLOBAL_ENABLE << ((i) * DR_ENABLE_SIZE)))
+
+#define DR7_X(i) DR7_EN(i, X86_BREAKPOINT_EXECUTE, X86_BREAKPOINT_LEN_X)
+
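+/*
+ * Arm guest hardware breakpoints on the four IST handler entry points so
+ * that delivery of the injected exception causes an immediate #DB vm-exit
+ * (TF alone cannot catch it, since TF is cleared on exception delivery).
+ */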
+static void watch_injecting(struct kvm_vcpu *vcpu)
+{
+ kvm_set_dr(vcpu, 0, asm_exc_nmi_handler);
+ kvm_set_dr(vcpu, 1, asm_exc_db_handler);
+ kvm_set_dr(vcpu, 2, asm_exc_mce_handler);
+ kvm_set_dr(vcpu, 3, asm_exc_df_handler);
+ kvm_set_dr(vcpu, 6, DR6_RESERVED);
+ kvm_set_dr(vcpu, 7, DR7_X(0) | DR7_X(1) | DR7_X(2) | DR7_X(3));
+}
+
+static void finish_watching_injecting(struct kvm_vcpu *vcpu)
+{
+ kvm_set_dr(vcpu, 0, 0);
+ kvm_set_dr(vcpu, 1, 0);
+ kvm_set_dr(vcpu, 2, 0);
+ kvm_set_dr(vcpu, 3, 0);
+ kvm_set_dr(vcpu, 6, DR6_RESERVED);
+ kvm_set_dr(vcpu, 7, 0);
+}
+
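+/*
+ * Watch for the handler finishing: DR0 on the RIP the handler will return
+ * to, DR1 on the #DF handler to catch the double fault that a premature
+ * nesting would provoke.
+ */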
+static void watch_handler_done(struct kvm_vcpu *vcpu)
+{
+ kvm_set_dr(vcpu, 0, ist_interrupted_rip[ist_depth-1]);
+ kvm_set_dr(vcpu, 1, asm_exc_df_handler);
+ //kvm_set_dr(vcpu, 2, exc_nmi_handler);
+ //kvm_set_dr(vcpu, 3, native_irq_return_iret_ip);
+ kvm_set_dr(vcpu, 6, DR6_RESERVED);
+ kvm_set_dr(vcpu, 7, DR7_X(0) | DR7_X(1));
+ //kvm_set_dr(vcpu, 7, DR7_X(0) | DR7_X(1) | DR7_X(2) | DR7_X(3));
+}
+
+static void ist_emul_test_start_singlestep(struct kvm_vcpu *vcpu)
+{
+ ist_emul_test_singlestep = 1;
+}
+
+static void ist_emul_test_stop_singlestep(struct kvm_vcpu *vcpu)
+{
+ ist_emul_test_singlestep = 0;
+ watch_handler_done(vcpu);
+ finish_watching_injecting(vcpu);
+}
+
+static void ist_emul_test_stop(struct kvm_vcpu *vcpu)
+{
+ ist_emul_test = TEST_STOPPED;
+ ist_emul_test_stop_singlestep(vcpu);
+}
+
+static bool is_entering_ip(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip = kvm_rip_read(vcpu);
+
+	return rip == asm_exc_mce_handler || rip == asm_exc_db_handler ||
+	       rip == asm_exc_nmi_handler;
+}
+
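+/*
+ * Detect a blown test round: reaching the #DF handler (expected when
+ * test_too_early_nesting_bug provoked a premature nesting) or running for
+ * more than 1000 single-stepped instructions stops the test.
+ */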
+static void emul_ist_bug_in_singlestep(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ if (WARN_ON_ONCE(!ist_depth)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ if (rip == asm_exc_df_handler || instruction_count > 1000) {
+ ist_emul_test_stop(vcpu);
+ return;
+ }
+}
+
+/* Debug helper: read one word of guest memory, or a poison value on failure. */
+static __maybe_unused unsigned long read_one(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+	unsigned long ret;
+	struct x86_exception e;
+
+	if (kvm_read_guest_virt(vcpu, addr, &ret, sizeof(ret), &e) != X86EMUL_CONTINUE)
+		return 0xf000f123f456f789;
+	return ret;
+}
+
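+/*
+ * Pop the return address that the asm stub pushed for its call into the C
+ * handler and jump straight back, so the C handler body is never executed.
+ */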
+static void emul_ist_kctx_handler_return(struct kvm_vcpu *vcpu)
+{
+ unsigned long rsp = kvm_rsp_read(vcpu);
+ unsigned long ret_rip;
+ struct x86_exception e;
+
+	if (kvm_read_guest_virt(vcpu, rsp, &ret_rip, sizeof(ret_rip), &e) != X86EMUL_CONTINUE) {
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return;
+	}
+
+	/* It is a faked #MC/#DB/NMI: make the guest continue as if the handler
+	 * in kernel context were a NOP. */
+	kvm_rsp_write(vcpu, rsp + 8);
+	kvm_rip_write(vcpu, ret_rip);
+}
+
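+/*
+ * When the guest reaches the C handler for the vector on top of the stack,
+ * verify it is the expected one and skip its body: the injected exception is
+ * fake, so the handler must behave as a NOP.
+ */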
+static void emul_ist_kctx_handler_done(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip = kvm_rip_read(vcpu);
+ int vector = -1;
+
+ if (rip == exc_nmi_handler)
+ vector = NMI_VECTOR;
+ if (rip == exc_db_kernel_handler || rip == exc_db_user_handler)
+ vector = DB_VECTOR;
+ if (rip == exc_mce_kernel_handler || rip == exc_mce_user_handler)
+ vector = MC_VECTOR;
+
+ if (vector > 0) {
+ if (WARN_ON(vector != pending_vector[ist_depth-1]))
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_info("Enter the handler in kernel context; ");
+ print_pending_vectors();
+ emul_ist_kctx_handler_return(vcpu);
+ }
+}
+
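+/*
+ * Once the asm stub passes its commit point, the exception frame has been
+ * moved off the IST stack (per the atomic-IST-entry design under test), so
+ * delivering and nesting IST exceptions is safe again.
+ */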
+static void emul_ist_emul_enable_delivering(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ if (rip == commit_ip) {
+ ist_atomic_depth = 0;
+ ist_nmi_blocking = 0;
+ ist_db_blocking = 0;
+ ist_mce_blocking = 0;
+ }
+}
+
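+/*
+ * Optionally (1/1024 of the time) ignore a vector's blocked state so that a
+ * premature nesting is injected on purpose; the resulting breakage is
+ * expected to land in the #DF handler, which stops the test.
+ */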
+static bool test_nesting_bug(void)
+{
+ return test_too_early_nesting_bug && ((get_random_u32() % 1024) == 0);
+}
+
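+/*
+ * Record where the new exception is expected to return to. If it is queued
+ * while another IST entry is still in its atomic (pre-commit) stage, the
+ * atomic-IST-entry design under test makes it resume at the commit point
+ * rather than at the interrupted instruction.
+ */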
+static void __ist_emul_test_queue_one(struct kvm_vcpu *vcpu, int vector)
+{
+ if (!ist_atomic_depth)
+ ist_interrupted_rip[ist_depth] = kvm_rip_read(vcpu);
+ else
+ ist_interrupted_rip[ist_depth] = commit_ip;
+ pending_vector[ist_depth] = vector;
+ ist_depth++;
+ ist_atomic_depth++;
+ instruction_count = 0;
+ pr_info("queue an IST exception, depth:%d, atomic_depth:%d; ", ist_depth, ist_atomic_depth);
+ print_pending_vectors();
+
+	// Per the Intel SDM, X86_EFLAGS_TF is cleared when the exception is
+	// delivered. Use hardware breakpoints on the handler entry points to
+	// force an immediate exit, and set X86_EFLAGS_TF after one hits.
+ ist_emul_injecting = true;
+ watch_injecting(vcpu);
+ pr_info("interrupt ip: %lx return ip: %lx\n", kvm_rip_read(vcpu), ist_interrupted_rip[ist_depth-1]);
+}
+
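+/*
+ * Randomly pick #MC, #DB or NMI and queue it, unless that vector is still
+ * blocked (or test_too_early_nesting_bug deliberately ignores the block).
+ * commit_ip tracks the commit point of the most recently queued vector.
+ */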
+static void ist_emul_test_queue_one(struct kvm_vcpu *vcpu)
+{
+ u32 which = get_random_u32() % 3;
+
+ if (ist_depth >= MAX_NESTING)
+ return;
+
+ if (which <= 0 && (!ist_mce_blocking || test_nesting_bug())) {
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ ist_mce_blocking = 1;
+ __ist_emul_test_queue_one(vcpu, MC_VECTOR);
+ commit_ip = asm_exc_mce_commit_ip;
+ return;
+ }
+
+ if (which <= 1 && (!ist_db_blocking || test_nesting_bug())) {
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ ist_db_blocking = 1;
+ __ist_emul_test_queue_one(vcpu, DB_VECTOR);
+ commit_ip = asm_exc_db_commit_ip;
+ return;
+ }
+
+ if (which <= 2 && (!ist_nmi_blocking || test_nesting_bug())) {
+ vmx_set_nmi_mask(vcpu, false);
+ kvm_inject_nmi(vcpu);
+ ist_nmi_blocking = 1;
+		__ist_emul_test_queue_one(vcpu, NMI_VECTOR);
+ commit_ip = asm_exc_nmi_commit_ip;
+ return;
+ }
+}
+
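+/* Per-instruction work while single-stepping an emulated IST exception. */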
+static void __handle_ist_emul_test_singlestep(struct kvm_vcpu *vcpu)
+{
+ instruction_count++;
+ //pr_info("singlestep ip: %lx\n", kvm_rip_read(vcpu));
+ emul_ist_emul_enable_delivering(vcpu);
+ emul_ist_kctx_handler_done(vcpu);
+ emul_ist_bug_in_singlestep(vcpu);
+
+	/* Try to nest another IST exception while the current one is being handled. */
+ if (!ist_emul_test_singlestep)
+ return;
+
+	// Don't nest when RF is set: the return RIP needs to be watched with
+	// an instruction breakpoint, which RF would suppress. A #DB during a
+	// repeated string instruction gets RF set (see sync_regs()).
+ if (vmx_get_rflags(vcpu) & X86_EFLAGS_RF)
+ return;
+
+ if (get_random_u32() % IST_NESTED_RATE)
+ return;
+
+ ist_emul_test_queue_one(vcpu);
+}
+
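+/*
+ * Process a #DB vm-exit while the test is single-stepping the guest.
+ * Returns true if the #DB belonged to the test and must not be reflected
+ * into the guest.
+ */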
+static bool handle_ist_emul_test_singlestep(struct kvm_vcpu *vcpu, unsigned long dr6)
+{
+ unsigned long rip;
+
+ if (!ist_emul_test_singlestep)
+ return false;
+
+ //kvm_get_dr(vcpu, 6, &dr6);
+ //pr_info("singlestep rip %lx dr6 %lx\n", kvm_rip_read(vcpu), dr6);
+ //kvm_set_dr(vcpu, 6, DR6_RESERVED);
+ if (WARN_ON(!ist_emul_injecting && !ist_emul_returning && is_entering_ip(vcpu))) {
+ ist_emul_test_stop(vcpu);
+ return true;
+ }
+ if (ist_emul_injecting) {
+ //pr_info("xxx ip: %lx\n", kvm_rip_read(vcpu));
+ if (WARN_ON(!is_entering_ip(vcpu))) {
+ ist_emul_test_stop(vcpu);
+ return true;
+ }
+ ist_emul_injecting = false;
+ finish_watching_injecting(vcpu);
+		/* Debug aids, disabled by default. */
+		//pr_info("rip in ret %lx\n", read_one(vcpu, kvm_rsp_read(vcpu)));
+		//watch_handler_done(vcpu);
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | X86_EFLAGS_TF);
+ }
+ if (ist_emul_returning) {
+ rip = kvm_rip_read(vcpu);
+ if (rip != ist_interrupted_rip[ist_depth-1]) {
+ pr_info("rip %lx expect rip %lx\n", rip, ist_interrupted_rip[ist_depth-1]);
+ WARN_ON(1);
+ ist_emul_test_stop(vcpu);
+ return true;
+ }
+ ist_depth--;
+ pr_info("IST exception returned, depth:%d; ", ist_depth);
+ print_pending_vectors();
+
+		// Print instruction_count to help tune IST_NESTED_RATE, which
+		// controls the average ist_depth for the test.
+		//pr_info("instruction_count %d\n", instruction_count);
+ instruction_count = 0;
+ ist_emul_returning = false;
+
+ if (!ist_depth) {
+ ist_emul_test_stop_singlestep(vcpu);
+ return true;
+ }
+ }
+
+ __handle_ist_emul_test_singlestep(vcpu);
+
+ rip = kvm_rip_read(vcpu);
+ if (!ist_emul_injecting && rip == native_irq_return_iret_ip) {
+ pr_info("IST exception is going to return\n");
+ //watch_handler_done(vcpu);
+ ist_emul_returning = true;
+ }
+ return true;
+}
+
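+/*
+ * Called on every vm-exit. Starts a test round (with 1/IST_START_RATE
+ * probability) once the guest has finished booting and nothing else is
+ * using the debug registers or X86_EFLAGS_TF, and no event is pending.
+ */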
+static void handle_ist_emul_test(struct kvm_vcpu *vcpu)
+{
+ //if (ist_emul_test_singlestep)
+ // pr_info("vmexit ip: %lx\n", kvm_rip_read(vcpu));
+
+ if (ist_emul_test == TEST_STOPPED)
+ return;
+
+ if (ist_emul_test == TEST_REQUESTED) {
+		/* Only start the test after the guest has booted; use CPL > 0
+		 * (guest userspace running) as the signal. */
+ if (vmx_get_cpl(vcpu))
+ ist_emul_test = TEST_ONGOING;
+ else
+ return;
+ }
+
+	/* Don't start the test if it is already running or if someone else is
+	 * using X86_EFLAGS_TF. RF must also be clear, since the return RIP
+	 * needs to be watched with an instruction breakpoint. */
+ if (vmx_get_rflags(vcpu) & (X86_EFLAGS_TF | X86_EFLAGS_RF))
+ return;
+
+ if (ist_emul_test_singlestep) {
+ //kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))
+ return;
+
+ if (vcpu->arch.nmi_injected || vcpu->arch.interrupt.injected ||
+ vcpu->arch.exception.pending || vcpu->arch.exception.injected)
+ return;
+
+ pr_info_once("%lx %lx %lx %lx %lx %lx\n", vcpu->arch.eff_db[0], vcpu->arch.eff_db[1],
+ vcpu->arch.eff_db[2], vcpu->arch.eff_db[3], vcpu->arch.dr6, vcpu->arch.dr7);
+ if (vcpu->arch.eff_db[0] || vcpu->arch.eff_db[1] ||
+ vcpu->arch.eff_db[2] || vcpu->arch.eff_db[3] ||
+ ((vcpu->arch.dr6 | DR6_RESERVED) != DR6_RESERVED) ||
+ ((vcpu->arch.dr7 | DR7_FIXED_1) != DR7_FIXED_1))
+ return;
+
+ if (WARN_ON_ONCE(ist_depth)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+	/* XXX: ideally the test would not start while the guest is handling a
+	 * real #MC/#DB/NMI, but there is no cheap way to detect that; skip the
+	 * check since this is only a test. */
+
+ if (get_random_u32() % IST_START_RATE)
+ return;
+
+ ist_emul_test_start_singlestep(vcpu);
+ ist_emul_test_queue_one(vcpu);
+}
+
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5258,6 +5693,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
switch (ex_no) {
case DB_VECTOR:
dr6 = vmx_get_exit_qual(vcpu);
+ if (handle_ist_emul_test_singlestep(vcpu, dr6))
+ return 1;
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
/*
@@ -6579,6 +7016,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+ if (ret > 0)
+ handle_ist_emul_test(vcpu);
/*
* Exit to user space when bus lock detected to inform that there is
* a bus lock in guest.
@@ -7290,7 +7729,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* vmentry fails as it then expects bit 14 (BS) in pending debug
* exceptions being set, but that's not correct for the guest debugging
* case. */
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ if (ist_emul_test_singlestep || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))
vmx_set_interrupt_shadow(vcpu, 0);
kvm_load_guest_xsave_state(vcpu);
@@ -814,6 +814,7 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
atomic_inc(&vcpu->arch.nmi_queued);
kvm_make_request(KVM_REQ_NMI, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
@@ -12810,12 +12811,15 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
}
EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+int ist_emul_test_singlestep;
+EXPORT_SYMBOL_GPL(ist_emul_test_singlestep);
+
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
rflags = static_call(kvm_x86_get_rflags)(vcpu);
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ if (ist_emul_test_singlestep || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))
rflags &= ~X86_EFLAGS_TF;
return rflags;
}
@@ -12823,8 +12827,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
- kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ if (ist_emul_test_singlestep || ((vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)))
rflags |= X86_EFLAGS_TF;
static_call(kvm_x86_set_rflags)(vcpu, rflags);
}