summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c475
1 files changed, 307 insertions, 168 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f76efad248ab..c6ef2940119b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -198,7 +198,8 @@ struct loaded_vmcs {
struct vmcs *vmcs;
struct vmcs *shadow_vmcs;
int cpu;
- int launched;
+ bool launched;
+ bool nmi_known_unmasked;
struct list_head loaded_vmcss_on_cpu_link;
};
@@ -415,13 +416,10 @@ struct nested_vmx {
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
- /* The host-usable pointer to the above */
- struct page *current_vmcs12_page;
- struct vmcs12 *current_vmcs12;
/*
* Cache of the guest's VMCS, existing outside of guest memory.
* Loaded from guest memory during VMPTRLD. Flushed to guest
- * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ * memory during VMCLEAR and VMPTRLD.
*/
struct vmcs12 *cached_vmcs12;
/*
@@ -562,7 +560,6 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
u8 fail;
- bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -639,8 +636,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- bool guest_pkru_valid;
- u32 guest_pkru;
u32 host_pkru;
/*
@@ -927,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2326,6 +2325,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
__vmx_load_host_state(to_vmx(vcpu));
}
+static bool emulation_required(struct kvm_vcpu *vcpu)
+{
+ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
+
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
/*
@@ -2363,6 +2367,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
+ unsigned long old_rflags = vmx_get_rflags(vcpu);
+
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
@@ -2370,11 +2376,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
-}
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->guest_pkru;
+ if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
+ to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -2418,32 +2422,84 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
*/
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
- if (!(vmcs12->exception_bitmap & (1u << nr)))
- return 0;
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ nested_vmx_inject_exception_vmexit(vcpu,
+ vcpu->arch.apf.nested_apf_token);
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+ return 1;
+ }
+ } else {
+ unsigned long exit_qual = 0;
+ if (nr == DB_VECTOR)
+ exit_qual = vcpu->arch.dr6;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 1;
+ }
+ }
+
+ return 0;
}
-static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
+static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ bool reinject = vcpu->arch.exception.reinject;
+ u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (!reinject && is_guest_mode(vcpu) &&
- nested_vmx_check_exception(vcpu, nr))
+ nested_vmx_check_exception(vcpu))
return;
if (has_error_code) {
@@ -2644,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_handled() will not pass related exits to L1.
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
* These rules have exceptions below.
*/
@@ -3764,6 +3820,25 @@ static void free_kvm_area(void)
}
}
+enum vmcs_field_type {
+ VMCS_FIELD_TYPE_U16 = 0,
+ VMCS_FIELD_TYPE_U64 = 1,
+ VMCS_FIELD_TYPE_U32 = 2,
+ VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_type(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_TYPE_U32;
+ return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
static void init_vmcs_shadow_fields(void)
{
int i, j;
@@ -3789,14 +3864,22 @@ static void init_vmcs_shadow_fields(void)
/* shadowed fields guest access without vmexit */
for (i = 0; i < max_shadow_read_write_fields; i++) {
- clear_bit(shadow_read_write_fields[i],
- vmx_vmwrite_bitmap);
- clear_bit(shadow_read_write_fields[i],
- vmx_vmread_bitmap);
+ unsigned long field = shadow_read_write_fields[i];
+
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+ if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
+ clear_bit(field + 1, vmx_vmwrite_bitmap);
+ clear_bit(field + 1, vmx_vmread_bitmap);
+ }
+ }
+ for (i = 0; i < max_shadow_read_only_fields; i++) {
+ unsigned long field = shadow_read_only_fields[i];
+
+ clear_bit(field, vmx_vmread_bitmap);
+ if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
+ clear_bit(field + 1, vmx_vmread_bitmap);
}
- for (i = 0; i < max_shadow_read_only_fields; i++)
- clear_bit(shadow_read_only_fields[i],
- vmx_vmread_bitmap);
}
static __init int alloc_kvm_area(void)
@@ -3817,11 +3900,6 @@ static __init int alloc_kvm_area(void)
return 0;
}
-static bool emulation_required(struct kvm_vcpu *vcpu)
-{
- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
-}
-
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
struct kvm_segment *save)
{
@@ -4634,6 +4712,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
return true;
}
+static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn;
@@ -4905,6 +4988,28 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
}
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4912,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
void *vapic_page;
u16 status;
- if (vmx->nested.pi_desc &&
- vmx->nested.pi_pending) {
- vmx->nested.pi_pending = false;
- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
-
- max_irr = find_last_bit(
- (unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
- if (max_irr == 256)
- return;
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
vapic_page = kmap(vmx->nested.virtual_apic_page);
__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
kunmap(vmx->nested.virtual_apic_page);
@@ -4935,11 +5037,16 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ bool nested)
{
#ifdef CONFIG_SMP
+ int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
if (vcpu->mode == IN_GUEST_MODE) {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4957,8 +5064,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
- POSTED_INTR_VECTOR);
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return true;
}
#endif
@@ -4973,7 +5079,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/* the PIR and ON have been set by L1. */
- kvm_vcpu_trigger_posted_interrupt(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, true);
/*
* If a posted intr is not recognized by hardware,
* we will accomplish it in the next vmentry.
@@ -5007,7 +5113,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return;
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
}
@@ -5465,10 +5571,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!is_guest_mode(vcpu)) {
- ++vcpu->stat.nmi_injections;
- vmx->nmi_known_unmasked = false;
- }
+ ++vcpu->stat.nmi_injections;
+ vmx->loaded_vmcs->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
@@ -5482,16 +5586,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- if (to_vmx(vcpu)->nmi_known_unmasked)
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool masked;
+
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
return false;
- return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ return masked;
}
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->nmi_known_unmasked = !masked;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
if (masked)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -5664,14 +5773,11 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_page_fault(intr_info)) {
- /* EPT won't cause page fault directly */
- BUG_ON(enable_ept);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
- trace_kvm_page_fault(cr2, error_code);
-
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
+ /* EPT won't cause page fault directly */
+ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
+ true);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -7082,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
if (vmx->nested.current_vmptr == -1ull)
return;
- /* current_vmptr and current_vmcs12 are always set/reset together */
- if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
- return;
-
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Flush VMCS12 to guest memory */
- memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
- VMCS12_SIZE);
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -7123,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.vmxon = false;
free_vpid(vmx->nested.vpid02);
- nested_release_vmcs12(vmx);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
if (vmx->nested.msr_bitmap) {
free_page((unsigned long)vmx->nested.msr_bitmap);
vmx->nested.msr_bitmap = NULL;
}
if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
free_vmcs(vmx->vmcs01.shadow_vmcs);
vmx->vmcs01.shadow_vmcs = NULL;
@@ -7214,25 +7320,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
return nested_vmx_run(vcpu, false);
}
-enum vmcs_field_type {
- VMCS_FIELD_TYPE_U16 = 0,
- VMCS_FIELD_TYPE_U64 = 1,
- VMCS_FIELD_TYPE_U32 = 2,
- VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
-};
-
-static inline int vmcs_field_type(unsigned long field)
-{
- if (0x1 & field) /* the *_HIGH fields are all 32 bit */
- return VMCS_FIELD_TYPE_U32;
- return (field >> 13) & 0x3 ;
-}
-
-static inline int vmcs_field_readonly(unsigned long field)
-{
- return (((field >> 10) & 0x3) == 1);
-}
-
/*
* Read a vmcs12 field. Since these can have varying lengths and we return
* one type, we chose the biggest type (u64) and zero-extend the return value
@@ -7546,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmcs12 = new_vmcs12;
- vmx->nested.current_vmcs12_page = page;
/*
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12,
- vmx->nested.current_vmcs12, VMCS12_SIZE);
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ nested_release_page_clean(page);
+
set_current_vmptr(vmx, vmptr);
}
@@ -7986,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* should handle it ourselves in L0 (and then continue L2). Only call this
* when in is_guest_mode (L2).
*/
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 exit_reason = vmx->exit_reason;
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
@@ -8000,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+ * may write to these pages via their host physical address while
+ * L2 is running, bypassing any address-translation-based dirty
+ * tracking (e.g. EPT write protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
if (vmx->nested.nested_run_pending)
return false;
@@ -8014,7 +8112,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
- return enable_ept;
+ return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
@@ -8136,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
}
}
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8382,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
- nested_vmx_vmexit(vcpu, exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
- }
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
dump_vmcs();
@@ -8418,9 +8535,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vcpu->run->internal.ndata++;
+ vcpu->run->internal.data[3] =
+ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ }
return 0;
}
@@ -8611,17 +8734,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
- u32 exit_intr_info;
+ u32 exit_intr_info = 0;
+ u16 basic_exit_reason = (u16)vmx->exit_reason;
- if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
- || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
+ if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
+ || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
return;
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- exit_intr_info = vmx->exit_intr_info;
+ if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ vmx->exit_intr_info = exit_intr_info;
+
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(exit_intr_info))
+ vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
/* Handle machine checks before interrupts are enabled */
- if (is_machine_check(exit_intr_info))
+ if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
+ is_machine_check(exit_intr_info))
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
@@ -8700,7 +8830,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
- if (vmx->nmi_known_unmasked)
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
return;
/*
* Can't use vmx->exit_intr_info since we're not sure what
@@ -8724,7 +8854,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
else
- vmx->nmi_known_unmasked =
+ vmx->loaded_vmcs->nmi_known_unmasked =
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
& GUEST_INTR_STATE_NMI);
}
@@ -8883,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (vmx->guest_pkru_valid)
- __write_pkru(vmx->guest_pkru);
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -9032,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
- if (boot_cpu_has(X86_FEATURE_OSPKE)) {
- vmx->guest_pkru = __read_pkru();
- if (vmx->guest_pkru != vmx->host_pkru) {
- vmx->guest_pkru_valid = true;
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
- } else
- vmx->guest_pkru_valid = false;
}
/*
@@ -9177,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
@@ -9463,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- else
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
kvm_inject_page_fault(vcpu, fault);
+ }
}
static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -9589,23 +9721,26 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
+static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
+ !page_address_valid(vcpu, vmcs12->io_bitmap_b))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- int maxphyaddr;
- u64 addr;
-
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return 0;
- if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
- WARN_ON(1);
- return -EINVAL;
- }
- maxphyaddr = cpuid_maxphyaddr(vcpu);
-
- if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
- ((addr + PAGE_SIZE) >> maxphyaddr))
+ if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
return -EINVAL;
return 0;
@@ -9993,6 +10128,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
} else {
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
@@ -10017,13 +10154,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
- /*
- * Note that we use L0's vector here and in
- * vmx_deliver_nested_posted_interrupt.
- */
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
- vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
} else {
exec_control &= ~PIN_BASED_POSTED_INTR;
}
@@ -10047,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
- *
- * A problem with this approach (when !enable_ept) is that L1 may be
- * injected with more page faults than it asked for. This could have
- * caused problems, but in practice existing hypervisors don't care.
- * To fix this, we will need to emulate the PFEC checking (on the L1
- * page tables), using walk_addr(), when injecting PFs to L1.
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10293,6 +10420,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -10429,8 +10559,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
return 1;
}
- vmcs12->launch_state = 1;
-
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
@@ -10448,6 +10576,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
u32 exit_qual;
int ret;
@@ -10472,6 +10601,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* for misconfigurations which will anyway be caught by the processor
* when using the merged vmcs02.
*/
+ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ goto out;
+ }
+
if (vmcs12->launch_state == launch) {
nested_vmx_failValid(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
@@ -10792,18 +10927,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
-
vmcs12->vm_exit_intr_info = exit_intr_info;
- if ((vmcs12->vm_exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
vmcs12->idt_vectoring_info_field = 0;
vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ vmcs12->launch_state = 1;
+
/* vm_entry_intr_info_field is cleared on exit. Emulate this
* instead of reading the real value. */
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
@@ -10884,7 +11016,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
*/
vmx_flush_tlb(vcpu);
}
-
+ /* Restore posted intr vector. */
+ if (nested_cpu_has_posted_intr(vmcs12))
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -10990,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
- && nested_exit_intr_ack_set(vcpu)) {
+ /*
+ * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+ * the VM-exit interrupt information (valid interrupt) is always set to
+ * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+ * kvm_cpu_has_interrupt(). See the commit message for details.
+ */
+ if (nested_exit_intr_ack_set(vcpu) &&
+ exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -11534,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .get_pkru = vmx_get_pkru,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,