@@ -13,6 +13,8 @@
#include <linux/module.h>
+#include <asm/gsseg.h>
+#include <asm/io_bitmap.h>
#include <asm/pvm_para.h>
#include "cpuid.h"
@@ -26,6 +28,211 @@ static bool __read_mostly is_intel;
static unsigned long host_idt_base;
+static inline void __save_gs_base(struct vcpu_pvm *pvm)
+{
+ /*
+  * The switcher does a real hardware SWAPGS, so the guest GS base
+  * lives in the hardware MSR_KERNEL_GS_BASE.
+  */
+ rdmsrl(MSR_KERNEL_GS_BASE, pvm->segments[VCPU_SREG_GS].base);
+}
+
+static inline void __load_gs_base(struct vcpu_pvm *pvm)
+{
+ /*
+  * The switcher does a real hardware SWAPGS, so the guest GS base
+  * lives in the hardware MSR_KERNEL_GS_BASE.
+  */
+ wrmsrl(MSR_KERNEL_GS_BASE, pvm->segments[VCPU_SREG_GS].base);
+}
+
+static inline void __save_fs_base(struct vcpu_pvm *pvm)
+{
+ rdmsrl(MSR_FS_BASE, pvm->segments[VCPU_SREG_FS].base);
+}
+
+static inline void __load_fs_base(struct vcpu_pvm *pvm)
+{
+ wrmsrl(MSR_FS_BASE, pvm->segments[VCPU_SREG_FS].base);
+}
+
+/*
+ * Test whether DS, ES, FS and GS need to be reloaded.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT.
+ *
+ * We therefore need to write new values to the segment registers
+ * on every host-guest state switch unless both the new and old
+ * values are zero.
+ */
+static inline bool need_reload_sel(u16 sel1, u16 sel2)
+{
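+ /* A nonzero OR means at least one of the selectors is nonzero. */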
+ return unlikely(sel1 | sel2);
+}
+
+/*
+ * Save the host DS/ES/FS/GS selectors, FS base, and inactive GS base,
+ * then load the guest DS/ES/FS/GS selectors, FS base, and GS base.
+ *
+ * Note: while guest state is loaded and the CPU is executing in the
+ * hypervisor, the guest GS base is kept in the hardware
+ * MSR_KERNEL_GS_BASE; when guest state is NOT loaded, that MSR instead
+ * holds the host's inactive GS base.
+ */
+static void segments_save_host_and_switch_to_guest(struct vcpu_pvm *pvm)
+{
+ u16 pvm_ds_sel, pvm_es_sel, pvm_fs_sel, pvm_gs_sel;
+
+ /* Save host segments */
+ savesegment(ds, pvm->host_ds_sel);
+ savesegment(es, pvm->host_es_sel);
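+ /*
+  * current_save_fsgs() snapshots the host FS/GS selectors and bases
+  * into current->thread, from which the switch-back path restores them.
+  */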
+ current_save_fsgs();
+
+ /* Load guest segments */
+ pvm_ds_sel = pvm->segments[VCPU_SREG_DS].selector;
+ pvm_es_sel = pvm->segments[VCPU_SREG_ES].selector;
+ pvm_fs_sel = pvm->segments[VCPU_SREG_FS].selector;
+ pvm_gs_sel = pvm->segments[VCPU_SREG_GS].selector;
+
+ if (need_reload_sel(pvm_ds_sel, pvm->host_ds_sel))
+ loadsegment(ds, pvm_ds_sel);
+ if (need_reload_sel(pvm_es_sel, pvm->host_es_sel))
+ loadsegment(es, pvm_es_sel);
+ if (need_reload_sel(pvm_fs_sel, current->thread.fsindex))
+ loadsegment(fs, pvm_fs_sel);
+ if (need_reload_sel(pvm_gs_sel, current->thread.gsindex))
+ load_gs_index(pvm_gs_sel);
+
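+ /*
+  * The selector loads above may have changed the FS/GS bases, so the
+  * guest bases must be written afterwards via the MSRs.
+  */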
+ __load_gs_base(pvm);
+ __load_fs_base(pvm);
+}
+
+/*
+ * Save the guest DS/ES/FS/GS selectors, FS base, and GS base, then
+ * load the host DS/ES/FS/GS selectors, FS base, and inactive GS base.
+ */
+static void segments_save_guest_and_switch_to_host(struct vcpu_pvm *pvm)
+{
+ u16 pvm_ds_sel, pvm_es_sel, pvm_fs_sel, pvm_gs_sel;
+
+ /* Save guest segments */
+ savesegment(ds, pvm_ds_sel);
+ savesegment(es, pvm_es_sel);
+ savesegment(fs, pvm_fs_sel);
+ savesegment(gs, pvm_gs_sel);
+ pvm->segments[VCPU_SREG_DS].selector = pvm_ds_sel;
+ pvm->segments[VCPU_SREG_ES].selector = pvm_es_sel;
+ pvm->segments[VCPU_SREG_FS].selector = pvm_fs_sel;
+ pvm->segments[VCPU_SREG_GS].selector = pvm_gs_sel;
+
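+ /*
+  * While guest state is loaded, the guest GS base lives in the
+  * hardware MSR_KERNEL_GS_BASE, so it is saved from there.
+  */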
+ __save_fs_base(pvm);
+ __save_gs_base(pvm);
+
+ /* Load host segments */
+ if (need_reload_sel(pvm_ds_sel, pvm->host_ds_sel))
+ loadsegment(ds, pvm->host_ds_sel);
+ if (need_reload_sel(pvm_es_sel, pvm->host_es_sel))
+ loadsegment(es, pvm->host_es_sel);
+ if (need_reload_sel(pvm_fs_sel, current->thread.fsindex))
+ loadsegment(fs, current->thread.fsindex);
+ if (need_reload_sel(pvm_gs_sel, current->thread.gsindex))
+ load_gs_index(current->thread.gsindex);
+
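+ /*
+  * Restore the host FS base and inactive GS base that were saved by
+  * current_save_fsgs() on the way in.
+  */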
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ wrmsrl(MSR_FS_BASE, current->thread.fsbase);
+}
+
+static void pvm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ if (pvm->loaded_cpu_state)
+ return;
+
+ pvm->loaded_cpu_state = 1;
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ /*
+  * PVM doesn't load the guest I/O bitmap into hardware. Invalidate the
+  * I/O bitmap if the current task is using one. This prevents any
+  * possible leakage of an active I/O bitmap to the guest and forces
+  * I/O instructions in the guest to be trapped and emulated.
+  *
+  * The I/O bitmap will be restored when the current task exits to
+  * user mode in arch_exit_to_user_mode_prepare().
+  */
+ if (test_thread_flag(TIF_IO_BITMAP))
+ native_tss_invalidate_io_bitmap();
+#endif
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+  * PVM doesn't support LDT; clear it so the guest cannot reference
+  * host LDT descriptors. It is reloaded on the switch back to host.
+  */
+ if (unlikely(current->mm->context.ldt))
+ clear_LDT();
+#endif
+
+ segments_save_host_and_switch_to_guest(pvm);
+
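+ /*
+  * The slot numbers match the registration order in
+  * pvm_setup_user_return_msrs(): 0 = MSR_LSTAR, 1 = MSR_TSC_AUX,
+  * 2 = MSR_IA32_SYSENTER_CS (Intel) or MSR_CSTAR (AMD).
+  */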
+ kvm_set_user_return_msr(0, (u64)entry_SYSCALL_64_switcher, -1ull);
+ kvm_set_user_return_msr(1, pvm->msr_tsc_aux, -1ull);
+ if (ia32_enabled()) {
+ if (is_intel)
+ kvm_set_user_return_msr(2, GDT_ENTRY_INVALID_SEG, -1ull);
+ else
+ kvm_set_user_return_msr(2, (u64)entry_SYSCALL32_ignore, -1ull);
+ }
+}
+
+static void pvm_prepare_switch_to_host(struct vcpu_pvm *pvm)
+{
+ if (!pvm->loaded_cpu_state)
+ return;
+
+ ++pvm->vcpu.stat.host_state_reload;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if (unlikely(current->mm->context.ldt))
+ kvm_load_ldt(GDT_ENTRY_LDT * 8);
+#endif
+
+ segments_save_guest_and_switch_to_host(pvm);
+ pvm->loaded_cpu_state = 0;
+}
+
+/*
+ * Put all hardware state back to the host values, except for the user
+ * return MSRs, which are restored by the user-return notifier on the
+ * next return to userspace.
+ */
+static void pvm_switch_to_host(struct vcpu_pvm *pvm)
+{
+ preempt_disable();
+ pvm_prepare_switch_to_host(pvm);
+ preempt_enable();
+}
+
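+/* The vCPU whose state is currently loaded on this physical CPU, if any. */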
+DEFINE_PER_CPU(struct vcpu_pvm *, active_pvm_vcpu);
+
+/*
+ * Switches to the specified vcpu, until a matching vcpu_put(); assumes
+ * the vcpu mutex is already taken.
+ */
+static void pvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ if (__this_cpu_read(active_pvm_vcpu) == pvm && vcpu->cpu == cpu)
+ return;
+
+ __this_cpu_write(active_pvm_vcpu, pvm);
+
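+ /*
+  * Flush the branch predictors when switching to a new vCPU on this
+  * pCPU so that one guest cannot train branches against another.
+  */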
+ indirect_branch_prediction_barrier();
+}
+
+static void pvm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_pvm *pvm = to_pvm(vcpu);
+
+ pvm_prepare_switch_to_host(pvm);
+}
+
+static void pvm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
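+ /* Nothing to do here yet. */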
+}
+
static void pvm_setup_mce(struct kvm_vcpu *vcpu)
{
}
@@ -100,6 +307,8 @@ static void pvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
struct vcpu_pvm *pvm = to_pvm(vcpu);
int i;
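+ /* Put any loaded guest state back into the vCPU before resetting it. */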
+ pvm_switch_to_host(pvm);
+
kvm_gpc_deactivate(&pvm->pvcs_gpc);
if (!init_event)
@@ -183,6 +392,24 @@ static int pvm_check_processor_compat(void)
return 0;
}
+/*
+ * While a PVM guest is running, the hardware MSR_LSTAR is set to the
+ * entry point provided by the host entry code (switcher), and the
+ * hypervisor may also change the hardware MSR_TSC_AUX to emulate the
+ * guest MSR_TSC_AUX. Register them as user-return MSRs so that the
+ * host values are restored before returning to userspace.
+ */
+static __init void pvm_setup_user_return_msrs(void)
+{
+ kvm_add_user_return_msr(MSR_LSTAR);
+ kvm_add_user_return_msr(MSR_TSC_AUX);
+ if (ia32_enabled()) {
+ if (is_intel)
+ kvm_add_user_return_msr(MSR_IA32_SYSENTER_CS);
+ else
+ kvm_add_user_return_msr(MSR_CSTAR);
+ }
+}
+
static __init void pvm_set_cpu_caps(void)
{
if (boot_cpu_has(X86_FEATURE_NX))
@@ -253,6 +480,8 @@ static __init int hardware_setup(void)
store_idt(&dt);
host_idt_base = dt.address;
+ pvm_setup_user_return_msrs();
+
pvm_set_cpu_caps();
kvm_configure_mmu(false, 0, 0, 0);
@@ -287,8 +516,14 @@ static struct kvm_x86_ops pvm_x86_ops __initdata = {
.vcpu_free = pvm_vcpu_free,
.vcpu_reset = pvm_vcpu_reset,
+ .prepare_switch_to_guest = pvm_prepare_switch_to_guest,
+ .vcpu_load = pvm_vcpu_load,
+ .vcpu_put = pvm_vcpu_put,
+
.vcpu_after_set_cpuid = pvm_vcpu_after_set_cpuid,
+ .sched_in = pvm_sched_in,
+
.nested_ops = &pvm_nested_ops,
.setup_mce = pvm_setup_mce,
@@ -30,13 +30,18 @@ struct vcpu_pvm {
unsigned long switch_flags;
+ u16 host_ds_sel, host_es_sel;
+
u32 hw_cs, hw_ss;
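+ /* Nonzero when host state is saved and guest state is loaded. */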
+ int loaded_cpu_state;
int int_shadow;
bool nmi_mask;
struct gfn_to_pfn_cache pvcs_gpc;
+ /* Emulated x86 MSRs */
+ u64 msr_tsc_aux;
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included