summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/emulate.c191
-rw-r--r--arch/x86/kvm/hyperv.c20
-rw-r--r--arch/x86/kvm/lapic.c77
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c68
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/pmu.c4
-rw-r--r--arch/x86/kvm/svm.c111
-rw-r--r--arch/x86/kvm/trace.h4
-rw-r--r--arch/x86/kvm/vmx/nested.c130
-rw-r--r--arch/x86/kvm/vmx/vmenter.S12
-rw-r--r--arch/x86/kvm/vmx/vmx.c61
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c157
-rw-r--r--arch/x86/kvm/x86.h4
16 files changed, 554 insertions, 298 deletions
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c338984c850d..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,24 +2331,18 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
-#define GET_SMSTATE(type, smbase, offset) \
- ({ \
- type __val; \
- int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
- sizeof(__val)); \
- if (r != X86EMUL_CONTINUE) \
- return X86EMUL_UNHANDLEABLE; \
- __val; \
- })
-
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
{
desc->g = (flags >> 23) & 1;
@@ -2361,27 +2355,30 @@ static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
desc->type = (flags >> 8) & 15;
}
-static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
+ int n)
{
struct desc_struct desc;
int offset;
u16 selector;
- selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+ selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4);
if (n < 3)
offset = 0x7f84 + n * 12;
else
offset = 0x7f2c + (n - 3) * 12;
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
return X86EMUL_CONTINUE;
}
-static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+#ifdef CONFIG_X86_64
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
+ int n)
{
struct desc_struct desc;
int offset;
@@ -2390,15 +2387,16 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
offset = 0x7e00 + n * 16;
- selector = GET_SMSTATE(u16, smbase, offset);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
- base3 = GET_SMSTATE(u32, smbase, offset + 12);
+ selector = GET_SMSTATE(u16, smstate, offset);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
+ base3 = GET_SMSTATE(u32, smstate, offset + 12);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2445,7 +2443,8 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
-static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const char *smstate)
{
struct desc_struct desc;
struct desc_ptr dt;
@@ -2453,53 +2452,55 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
u32 val, cr0, cr3, cr4;
int i;
- cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
- cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
- ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
- ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+ cr0 = GET_SMSTATE(u32, smstate, 0x7ffc);
+ cr3 = GET_SMSTATE(u32, smstate, 0x7ff8);
+ ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
for (i = 0; i < 8; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
- val = GET_SMSTATE(u32, smbase, 0x7fcc);
+ val = GET_SMSTATE(u32, smstate, 0x7fcc);
ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
- val = GET_SMSTATE(u32, smbase, 0x7fc8);
+ val = GET_SMSTATE(u32, smstate, 0x7fc8);
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
- selector = GET_SMSTATE(u32, smbase, 0x7fc4);
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
+ selector = GET_SMSTATE(u32, smstate, 0x7fc4);
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
- selector = GET_SMSTATE(u32, smbase, 0x7fc0);
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
+ selector = GET_SMSTATE(u32, smstate, 0x7fc0);
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
- dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
- dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
+ dt.address = GET_SMSTATE(u32, smstate, 0x7f74);
+ dt.size = GET_SMSTATE(u32, smstate, 0x7f70);
ctxt->ops->set_gdt(ctxt, &dt);
- dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
- dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
+ dt.address = GET_SMSTATE(u32, smstate, 0x7f58);
+ dt.size = GET_SMSTATE(u32, smstate, 0x7f54);
ctxt->ops->set_idt(ctxt, &dt);
for (i = 0; i < 6; i++) {
- int r = rsm_load_seg_32(ctxt, smbase, i);
+ int r = rsm_load_seg_32(ctxt, smstate, i);
if (r != X86EMUL_CONTINUE)
return r;
}
- cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+ cr4 = GET_SMSTATE(u32, smstate, 0x7f14);
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8));
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
-static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const char *smstate)
{
struct desc_struct desc;
struct desc_ptr dt;
@@ -2509,43 +2510,43 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
int i, r;
for (i = 0; i < 16; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
- ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
- ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
+ ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
- val = GET_SMSTATE(u32, smbase, 0x7f68);
+ val = GET_SMSTATE(u32, smstate, 0x7f68);
ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
- val = GET_SMSTATE(u32, smbase, 0x7f60);
+ val = GET_SMSTATE(u32, smstate, 0x7f60);
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
- cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
- cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
- cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
- val = GET_SMSTATE(u64, smbase, 0x7ed0);
+ cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
+ cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
+ cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
+ val = GET_SMSTATE(u64, smstate, 0x7ed0);
ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
- selector = GET_SMSTATE(u32, smbase, 0x7e90);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
- base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
+ selector = GET_SMSTATE(u32, smstate, 0x7e90);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94));
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98));
+ base3 = GET_SMSTATE(u32, smstate, 0x7e9c);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
- dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
- dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
+ dt.size = GET_SMSTATE(u32, smstate, 0x7e84);
+ dt.address = GET_SMSTATE(u64, smstate, 0x7e88);
ctxt->ops->set_idt(ctxt, &dt);
- selector = GET_SMSTATE(u32, smbase, 0x7e70);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
- set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
- base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
+ selector = GET_SMSTATE(u32, smstate, 0x7e70);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74));
+ set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78));
+ base3 = GET_SMSTATE(u32, smstate, 0x7e7c);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
- dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
- dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ dt.size = GET_SMSTATE(u32, smstate, 0x7e64);
+ dt.address = GET_SMSTATE(u64, smstate, 0x7e68);
ctxt->ops->set_gdt(ctxt, &dt);
r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
@@ -2553,37 +2554,49 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
return r;
for (i = 0; i < 6; i++) {
- r = rsm_load_seg_64(ctxt, smbase, i);
+ r = rsm_load_seg_64(ctxt, smstate, i);
if (r != X86EMUL_CONTINUE)
return r;
}
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
unsigned long cr0, cr4, efer;
+ char buf[512];
u64 smbase;
int ret;
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
+ smbase = ctxt->ops->get_smbase(ctxt);
+
+ ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf));
+ if (ret != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+ ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
+
/*
* Get back to real mode, to prepare a safe state in which to load
* CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
* supports long mode.
*/
- cr4 = ctxt->ops->get_cr(ctxt, 4);
if (emulator_has_longmode(ctxt)) {
struct desc_struct cs_desc;
/* Zero CR4.PCIDE before CR0.PG. */
- if (cr4 & X86_CR4_PCIDE) {
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PCIDE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- cr4 &= ~X86_CR4_PCIDE;
- }
/* A 32-bit code segment is required to clear EFER.LMA. */
memset(&cs_desc, 0, sizeof(cs_desc));
@@ -2597,39 +2610,39 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
- /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
-
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ if (emulator_has_longmode(ctxt)) {
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
- smbase = ctxt->ops->get_smbase(ctxt);
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ }
/*
* Give pre_leave_smm() a chance to make ISA-specific changes to the
* vCPU state (e.g. enter guest mode) before loading state from the SMM
* state-save area.
*/
- if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
- ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ ret = rsm_load_state_64(ctxt, buf);
else
- ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+#endif
+ ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
/* FIXME: should triple fault */
return X86EMUL_UNHANDLEABLE;
}
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
- ctxt->ops->set_nmi_mask(ctxt, false);
+ ctxt->ops->post_leave_smm(ctxt);
- ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
- ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 27c43525a05f..cc24b3a32c44 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -526,7 +526,9 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
new_config.enable = 0;
stimer->config.as_uint64 = new_config.as_uint64;
- stimer_mark_pending(stimer, false);
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -542,7 +544,10 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
stimer->config.enable = 0;
else if (stimer->config.auto_enable)
stimer->config.enable = 1;
- stimer_mark_pending(stimer, false);
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
return 0;
}
@@ -1366,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
- all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
} else {
if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
sizeof(flush_ex))))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 991fdf7fc17f..bd13fdddbdc4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -70,7 +70,6 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
-static bool lapic_timer_advance_adjust_done = false;
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -138,6 +137,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
if (offset <= max_apic_id) {
u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
*cluster = &map->phys_map[offset];
*mask = dest_id & (0xffff >> (16 - cluster_size));
} else {
@@ -901,7 +901,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
if (irq->dest_id > map->max_apic_id) {
*bitmap = 0;
} else {
- *dst = &map->phys_map[irq->dest_id];
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
*bitmap = 1;
}
return true;
@@ -1480,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
return false;
}
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
void wait_lapic_expire(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 guest_tsc, tsc_deadline, ns;
- if (!lapic_in_kernel(vcpu))
- return;
-
if (apic->lapic_timer.expired_tscdeadline == 0)
return;
@@ -1499,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
- /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
if (guest_tsc < tsc_deadline)
- __delay(min(tsc_deadline - guest_tsc,
- nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
- if (!lapic_timer_advance_adjust_done) {
+ if (!apic->lapic_timer.timer_advance_adjust_done) {
/* too early */
if (guest_tsc < tsc_deadline) {
ns = (tsc_deadline - guest_tsc) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns -= min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns -= min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} else {
/* too late */
ns = (guest_tsc - tsc_deadline) * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
- lapic_timer_advance_ns += min((unsigned int)ns,
- lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+ timer_advance_ns += min((u32)ns,
+ timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
}
if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
- lapic_timer_advance_adjust_done = true;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ if (unlikely(timer_advance_ns > 5000)) {
+ timer_advance_ns = 0;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
}
static void start_sw_tscdeadline(struct kvm_lapic *apic)
{
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1540,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
now = ktime_get();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
} else
apic_timer_expired(apic);
@@ -2253,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
{
struct kvm_lapic *apic;
@@ -2277,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED);
apic->lapic_timer.timer.function = apic_timer_fn;
+ if (timer_advance_ns == -1) {
+ apic->lapic_timer.timer_advance_ns = 1000;
+ apic->lapic_timer.timer_advance_adjust_done = false;
+ } else {
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+ apic->lapic_timer.timer_advance_adjust_done = true;
+ }
+
/*
* APIC is created enabled. This will prevent kvm_lapic_set_base from
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff6ef9c3d760..d6d049ba3045 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,8 +31,10 @@ struct kvm_timer {
u32 timer_mode_mask;
u64 tscdeadline;
u64 expired_tscdeadline;
+ u32 timer_advance_ns;
atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use;
+ bool timer_advance_adjust_done;
};
struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7837ab001d80..d9c7b45d231f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {
static const union kvm_mmu_page_role mmu_base_role_mask = {
.cr0_wp = 1,
- .cr4_pae = 1,
+ .gpte_is_8_bytes = 1,
.nxe = 1,
.smep_andnot_wp = 1,
.smap_andnot_wp = 1,
@@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
for_each_valid_sp(_kvm, _sp, _gfn) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+ return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (sp->role.cr4_pae != !!is_pae(vcpu)
- || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+ vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -2232,7 +2238,7 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
struct list_head *invalid_list,
bool remote_flush)
{
- if (!remote_flush && !list_empty(invalid_list))
+ if (!remote_flush && list_empty(invalid_list))
return false;
if (!list_empty(invalid_list))
@@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
if (role.direct)
- role.cr4_pae = 0;
+ role.gpte_is_8_bytes = true;
role.access = access;
if (!vcpu->arch.mmu->direct_map
&& vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -2757,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
@@ -4775,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
union kvm_mmu_extended_role ext = {0};
ext.cr0_pg = !!is_paging(vcpu);
+ ext.cr4_pae = !!is_pae(vcpu);
ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
ext.cr4_pse = !!is_pse(vcpu);
@@ -4794,7 +4801,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
role.base.access = ACC_ALL;
role.base.nxe = !!is_nx(vcpu);
- role.base.cr4_pae = !!is_pae(vcpu);
role.base.cr0_wp = is_write_protection(vcpu);
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
@@ -4815,6 +4821,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
role.base.direct = true;
+ role.base.gpte_is_8_bytes = true;
return role;
}
@@ -4879,6 +4886,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
role.base.direct = !is_paging(vcpu);
+ role.base.gpte_is_8_bytes = !!is_pae(vcpu);
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
@@ -4918,18 +4926,26 @@ static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly)
{
- union kvm_mmu_role role;
+ union kvm_mmu_role role = {0};
- /* Base role is inherited from root_mmu */
- role.base.word = vcpu->arch.root_mmu.mmu_role.base.word;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* SMM flag is inherited from root_mmu */
+ role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = PT64_ROOT_4LEVEL;
+ role.base.gpte_is_8_bytes = true;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
+ /*
+ * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+ * SMAP variation to denote shadow EPT entries.
+ */
+ role.base.cr0_wp = true;
+ role.base.smap_andnot_wp = true;
+
+ role.ext = kvm_calc_mmu_role_ext(vcpu);
role.ext.execonly = execonly;
return role;
@@ -5179,7 +5195,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.cr4_pae ? 8 : 4;
+ pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5203,7 +5219,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.cr4_pae) {
+ if (!sp->role.gpte_is_8_bytes) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5393,10 +5409,12 @@ emulate:
* This can happen if a guest gets a page-fault on data access but the HW
* table walker is not able to read the instruction page (e.g instruction
* page is not present in memory). In those cases we simply restart the
- * guest.
+ * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
*/
- if (unlikely(insn && !insn_len))
- return 1;
+ if (unlikely(insn && !insn_len)) {
+ if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+ return 1;
+ }
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
@@ -5509,7 +5527,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm,
+ start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_lock(&kvm->mmu_lock);
@@ -5517,7 +5537,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+ end_gfn - start_gfn + 1);
flush = false;
}
@@ -6011,10 +6032,10 @@ out:
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
@@ -6027,8 +6048,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
}
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bbdc60f2fae8..54c2a377795b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
return kvm->arch.n_max_mmu_pages -
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9f6c855a0043..dd30dccd2ad5 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -29,10 +29,10 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
__entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
+ role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 58ead7db71a3..e39741997893 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -281,9 +281,13 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
bool fast_mode = idx & (1u << 31);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
u64 ctr_val;
+ if (!pmu->version)
+ return 1;
+
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b5b128a0a051..406b558abfef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -262,6 +262,7 @@ struct amd_svm_iommu_ir {
};
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
@@ -2692,6 +2693,7 @@ static int npf_interception(struct vcpu_svm *svm)
static int db_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
@@ -2702,6 +2704,8 @@ static int db_interception(struct vcpu_svm *svm)
if (svm->nmi_singlestep) {
disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
if (svm->vcpu.guest_debug &
@@ -4517,14 +4521,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
struct kvm_lapic *apic = svm->vcpu.arch.apic;
/*
- * Update ICR high and low, then emulate sending IPI,
- * which is handled when writing APIC_ICR.
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
- kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
@@ -4596,7 +4611,7 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
if (entry)
- WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK);
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
@@ -5621,6 +5636,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
+ kvm_load_guest_xcr0(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -5766,6 +5782,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
+ kvm_put_guest_xcr0(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -6215,32 +6232,24 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *nested_vmcb;
struct page *page;
- struct {
- u64 guest;
- u64 vmcb;
- } svm_state_save;
- int ret;
+ u64 guest;
+ u64 vmcb;
- ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
- sizeof(svm_state_save));
- if (ret)
- return ret;
+ guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+ vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (svm_state_save.guest) {
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
- if (nested_vmcb)
- enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
- else
- ret = 1;
- vcpu->arch.hflags |= HF_SMM_MASK;
+ if (guest) {
+ nested_vmcb = nested_svm_map(svm, vmcb, &page);
+ if (!nested_vmcb)
+ return 1;
+ enter_svm_guest_mode(svm, vmcb, nested_vmcb, page);
}
- return ret;
+ return 0;
}
static int enable_smi_window(struct kvm_vcpu *vcpu)
@@ -6422,11 +6431,11 @@ e_free:
return ret;
}
-static int get_num_contig_pages(int idx, struct page **inpages,
- unsigned long npages)
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
{
unsigned long paddr, next_paddr;
- int i = idx + 1, pages = 1;
+ unsigned long i = idx + 1, pages = 1;
/* find the number of contiguous pages starting from idx */
paddr = __sme_page_pa(inpages[idx]);
@@ -6445,12 +6454,12 @@ static int get_num_contig_pages(int idx, struct page **inpages,
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data *data;
struct page **inpages;
- int i, ret, pages;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6799,7 +6808,8 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
unsigned long n;
- int ret, size;
+ unsigned int size;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6807,6 +6817,11 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
return -EFAULT;
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
vaddr = debug.src_uaddr;
size = debug.len;
vaddr_end = vaddr + size;
@@ -6857,8 +6872,8 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
dst_vaddr,
len, &argp->error);
- sev_unpin_memory(kvm, src_p, 1);
- sev_unpin_memory(kvm, dst_p, 1);
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
if (ret)
goto err;
@@ -7098,6 +7113,36 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
}
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ bool is_user, smap;
+
+ is_user = svm_get_cpl(vcpu) == 3;
+ smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+ /*
+ * Detect and workaround Errata 1096 Fam_17h_00_0Fh
+ *
+ * In non SEV guest, hypervisor will be able to read the guest
+ * memory to decode the instruction pointer when insn_len is zero
+ * so we return true to indicate that decoding is possible.
+ *
+ * But in the SEV guest, the guest memory is encrypted with the
+ * guest specific key and hypervisor will not be able to decode the
+ * instruction pointer so we will not able to workaround it. Lets
+ * print the error and request to kill the guest.
+ */
+ if (is_user && smap) {
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ return false;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7231,6 +7276,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.nested_enable_evmcs = nested_enable_evmcs,
.nested_get_evmcs_version = nested_get_evmcs_version,
+
+ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6432d08c7de7..4d47a2631d1f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -438,13 +438,13 @@ TRACE_EVENT(kvm_apic_ipi,
);
TRACE_EVENT(kvm_apic_accept_irq,
- TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
TP_ARGS(apicid, dm, tm, vec),
TP_STRUCT__entry(
__field( __u32, apicid )
__field( __u16, dm )
- __field( __u8, tm )
+ __field( __u16, tm )
__field( __u8, vec )
),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f24a2c225070..0c601d079cd2 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -500,6 +500,17 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
/*
* Merge L0's and L1's MSR bitmap, return false to indicate that
* we do not use the hardware.
@@ -541,39 +552,44 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return false;
msr_bitmap_l1 = (unsigned long *)kmap(page);
- if (nested_cpu_has_apic_reg_virt(vmcs12)) {
- /*
- * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
- * just lets the processor take the value from the virtual-APIC page;
- * take those 256 bits directly from the L1 bitmap.
- */
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = msr_bitmap_l1[word];
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- } else {
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap_l0[word] = ~0;
- msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
- }
- }
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_TASKPRI),
- MSR_TYPE_W);
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively disable them below.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
- if (nested_cpu_has_vid(vmcs12)) {
- nested_vmx_disable_intercept_for_msr(
- msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_EOI),
- MSR_TYPE_W);
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
- X2APIC_MSR(APIC_SELF_IPI),
- MSR_TYPE_W);
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
}
if (spec_ctrl)
@@ -2585,6 +2601,11 @@ static int nested_check_host_control_regs(struct kvm_vcpu *vcpu,
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return -EINVAL;
+
+ if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
+ is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
* IA32_EFER MSR must be 0 in the field for that register. In addition,
@@ -2852,20 +2873,27 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
/*
* If translation failed, VM entry will fail because
* prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
- * Failing the vm entry is _not_ what the processor
- * does but it's basically the only possibility we
- * have. We could still enter the guest if CR8 load
- * exits are enabled, CR8 store exits are enabled, and
- * virtualize APIC access is disabled; in this case
- * the processor would never use the TPR shadow and we
- * could simply clear the bit from the execution
- * control. But such a configuration is useless, so
- * let's keep the code simple.
*/
if (!is_error_page(page)) {
vmx->nested.virtual_apic_page = page;
hpa = page_to_phys(vmx->nested.virtual_apic_page);
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
+ nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
+ !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * The processor will never use the TPR shadow, simply
+ * clear the bit from the execution control. Such a
+ * configuration is useless, but it happens in tests.
+ * For any other configuration, failing the vm entry is
+ * _not_ what the processor does but it's basically the
+ * only possibility we have.
+ */
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_TPR_SHADOW);
+ } else {
+ printk("bad virtual-APIC page address\n");
+ dump_vmcs();
}
}
@@ -3768,8 +3796,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
nested_ept_uninit_mmu_context(vcpu);
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
+ * points to shadow pages! Fortunately we only get here after a WARN_ON
+ * if EPT is disabled, so a VMabort is perfectly fine.
+ */
+ if (enable_ept) {
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ } else {
+ nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
+ }
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -5385,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return ret;
/* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
return 0;
if (kvm_state->vmx.vmcs_pa != -1ull) {
@@ -5429,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmcs12->vmcs_link_pointer != -1ull) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
- if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+ if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
return -EINVAL;
if (copy_from_user(shadow_vmcs12,
@@ -5717,6 +5755,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
int i;
+ /*
+ * Without EPT it is not possible to restore L1's CR3 and PDPTR on
+ * VMfail, because they are not available in vmcs01. Just always
+ * use hardware checks.
+ */
+ if (!enable_ept)
+ nested_early_check = 1;
+
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 7b272738c576..d4cb1945b2e3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
+#include <asm/nospec-branch.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter)
* referred to by VMCS.HOST_RIP.
*/
ENTRY(vmx_vmexit)
+#ifdef CONFIG_RETPOLINE
+ ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
+ /* Preserve guest's RAX, it's used to stuff the RSB. */
+ push %_ASM_AX
+
+ /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+
+ pop %_ASM_AX
+.Lvmexit_skip_rsb:
+#endif
ret
ENDPROC(vmx_vmexit)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c73375e01ab8..0c955bb286ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1683,12 +1683,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
- return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
- break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -1895,11 +1889,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -4088,8 +4077,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -5616,7 +5603,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
-static void dump_vmcs(void)
+void dump_vmcs(void)
{
u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
@@ -6423,6 +6410,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ kvm_load_guest_xcr0(vcpu);
+
if (static_cpu_has(X86_FEATURE_PKU) &&
kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
vcpu->arch.pkru != vmx->host_pkru)
@@ -6473,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
-
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
current_evmcs->hv_clean_fields |=
@@ -6519,6 +6505,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vmx->host_pkru);
}
+ kvm_put_guest_xcr0(vcpu);
+
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
@@ -6865,6 +6853,30 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
}
}
+static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return false;
+
+ eax.full = entry->eax;
+ return (eax.split.version_id > 0);
+}
+
+static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool pmu_enabled = guest_cpuid_has_pmu(vcpu);
+
+ if (pmu_enabled)
+ vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING;
+ else
+ vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING;
+}
+
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6953,6 +6965,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
nested_vmx_entry_exit_ctls_update(vcpu);
+ nested_vmx_procbased_ctls_update(vcpu);
}
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@ -7016,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
if (kvm_mwait_in_guest(vcpu->kvm))
return -EOPNOTSUPP;
@@ -7024,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
- lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
if (delta_tsc > lapic_timer_advance_cycles)
delta_tsc -= lapic_timer_advance_cycles;
@@ -7382,7 +7397,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7393,9 +7408,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
}
if (vmx->nested.smm.guest_mode) {
- vcpu->arch.hflags &= ~HF_SMM_MASK;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
- vcpu->arch.hflags |= HF_SMM_MASK;
if (ret)
return ret;
@@ -7409,6 +7422,11 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
@@ -7711,6 +7729,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.set_nested_state = NULL,
.get_vmcs12_pages = NULL,
.nested_enable_evmcs = NULL,
+ .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1554cb45b393..f879529906b4 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -190,7 +190,6 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
- u64 arch_capabilities;
u64 spec_ctrl;
u32 vm_entry_controls_shadow;
@@ -518,4 +517,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
}
+void dump_vmcs(void);
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 65e4559eef2f..b5edc8e3ce1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
@@ -800,7 +804,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
!vcpu->guest_xcr0_loaded) {
@@ -810,8 +814,9 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
vcpu->guest_xcr0_loaded = 1;
}
}
+EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
-static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_xcr0_loaded) {
if (vcpu->arch.xcr0 != host_xcr0)
@@ -819,6 +824,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
vcpu->guest_xcr0_loaded = 0;
}
}
+EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
@@ -1125,7 +1131,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1158,6 +1164,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -2443,6 +2450,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated)
vcpu->arch.microcode_version = data;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
@@ -2747,6 +2759,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_UCODE_REV:
msr_info->data = vcpu->arch.microcode_version;
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
case MSR_IA32_TSC:
msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
break;
@@ -3081,7 +3099,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NESTED_STATE:
r = kvm_x86_ops->get_nested_state ?
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+ kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
break;
default:
break;
@@ -3516,7 +3534,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
@@ -3576,12 +3594,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- u32 hflags = vcpu->arch.hflags;
- if (events->smi.smm)
- hflags |= HF_SMM_MASK;
- else
- hflags &= ~HF_SMM_MASK;
- kvm_set_hflags(vcpu, hflags);
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ if (events->smi.smm)
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ kvm_smm_changed(vcpu);
+ }
vcpu->arch.smi_pending = events->smi.pending;
@@ -4258,7 +4277,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
@@ -4272,7 +4291,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
@@ -5946,12 +5965,18 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
{
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+ emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
}
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+ const char *smstate)
{
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+}
+
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_smm_changed(emul_to_vcpu(ctxt));
}
static const struct x86_emulate_ops emulate_ops = {
@@ -5994,6 +6019,7 @@ static const struct x86_emulate_ops emulate_ops = {
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
.pre_leave_smm = emulator_pre_leave_smm,
+ .post_leave_smm = emulator_post_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6235,16 +6261,6 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
-{
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
-
- vcpu->arch.hflags = emul_flags;
-
- if (changed & HF_SMM_MASK)
- kvm_smm_changed(vcpu);
-}
-
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -6523,15 +6539,45 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
size, port, &val, 1);
- /* do not return to emulator after return from userspace */
- vcpu->arch.pio.count = 0;
- return ret;
+ if (ret)
+ return ret;
+
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
+ return 0;
}
static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
@@ -6541,6 +6587,11 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
/* For size less than 4 we merge, else we zero extend */
val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
: 0;
@@ -6553,7 +6604,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
vcpu->arch.pio.port, &val, 1);
kvm_register_write(vcpu, VCPU_REGS_RAX, val);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
@@ -6572,6 +6623,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -6579,16 +6631,13 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
{
- int ret = kvm_skip_emulated_instruction(vcpu);
+ int ret;
- /*
- * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
if (in)
- return kvm_fast_pio_in(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_in(vcpu, size, port);
else
- return kvm_fast_pio_out(vcpu, size, port) && ret;
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_fast_pio);
@@ -7413,9 +7462,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7465,10 +7514,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7479,9 +7526,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7539,8 +7588,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -7837,15 +7888,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- kvm_load_guest_xcr0(vcpu);
-
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_x86_ops->request_immediate_exit(vcpu);
}
trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_timer_advance_ns)
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
@@ -7891,8 +7941,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_put_guest_xcr0(vcpu);
-
kvm_before_interrupt(vcpu);
kvm_x86_ops->handle_external_intr(vcpu);
kvm_after_interrupt(vcpu);
@@ -8733,6 +8781,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
@@ -9034,7 +9083,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
} else
@@ -9429,13 +9478,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- int nr_mmu_pages = 0;
-
if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 28406aa1136d..534d3f28bb01 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
-extern unsigned int lapic_timer_advance_ns;
-
extern bool enable_vmware_backdoor;
extern struct static_key kvm_no_apic_vcpu;
@@ -347,4 +345,6 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
__this_cpu_write(current_vcpu, NULL);
}
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
#endif