diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 475 |
1 files changed, 307 insertions, 168 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f76efad248ab..c6ef2940119b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,7 +198,8 @@ struct loaded_vmcs { struct vmcs *vmcs; struct vmcs *shadow_vmcs; int cpu; - int launched; + bool launched; + bool nmi_known_unmasked; struct list_head loaded_vmcss_on_cpu_link; }; @@ -415,13 +416,10 @@ struct nested_vmx { /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; /* * Cache of the guest's VMCS, existing outside of guest memory. * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMXOFF, VMCLEAR, VMPTRLD. + * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; /* @@ -562,7 +560,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -639,8 +636,6 @@ struct vcpu_vmx { u64 current_tsc_ratio; - bool guest_pkru_valid; - u32 guest_pkru; u32 host_pkru; /* @@ -927,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -2326,6 +2325,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) __vmx_load_host_state(to_vmx(vcpu)); } +static bool emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2363,6 +2367,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = vmx_get_rflags(vcpu); + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { @@ -2370,11 +2376,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); -} -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; + if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -2418,32 +2422,84 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, + unsigned long exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (vcpu->arch.exception.has_error_code) { + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(nr)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; - if (!(vmcs12->exception_bitmap & (1u << nr))) - return 0; + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + nested_vmx_inject_exception_vmexit(vcpu, + vcpu->arch.apf.nested_apf_token); + return 1; + } + /* + * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception. + * The fix is to add the ancillary datum (CR2 or DR6) to structs + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 + * can be written only when inject_pending_event runs. This should be + * conditional on a new capability---if the capability is disabled, + * kvm_multiple_exception would write the ancillary information to + * CR2 or DR6, for backwards ABI-compatibility. + */ + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2); + return 1; + } + } else { + unsigned long exit_qual = 0; + if (nr == DB_VECTOR) + exit_qual = vcpu->arch.dr6; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; + if (vmcs12->exception_bitmap & (1u << nr)) { + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + return 1; + } + } + + return 0; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; if (!reinject && is_guest_mode(vcpu) && - nested_vmx_check_exception(vcpu, nr)) + nested_vmx_check_exception(vcpu)) return; if (has_error_code) { @@ -2644,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_handled() will not pass related exits to L1. + * nested_vmx_exit_reflected() will not pass related exits to L1. * These rules have exceptions below. */ @@ -3764,6 +3820,25 @@ static void free_kvm_area(void) } } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + static void init_vmcs_shadow_fields(void) { int i, j; @@ -3789,14 +3864,22 @@ static void init_vmcs_shadow_fields(void) /* shadowed fields guest access without vmexit */ for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], - vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], - vmx_vmread_bitmap); + unsigned long field = shadow_read_write_fields[i]; + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { + clear_bit(field + 1, vmx_vmwrite_bitmap); + clear_bit(field + 1, vmx_vmread_bitmap); + } + } + for (i = 0; i < max_shadow_read_only_fields; i++) { + unsigned long field = shadow_read_only_fields[i]; + + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) + clear_bit(field + 1, vmx_vmread_bitmap); } - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], - vmx_vmread_bitmap); } static __init int alloc_kvm_area(void) @@ -3817,11 +3900,6 @@ static __init int alloc_kvm_area(void) return 0; } -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -4634,6 +4712,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -4905,6 +4988,28 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4912,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) void *vapic_page; u16 status; - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; - if (max_irr == 256) - return; + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4935,11 +5037,16 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } + + nested_mark_vmcs12_pages_dirty(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + bool nested) { #ifdef CONFIG_SMP + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; + if (vcpu->mode == IN_GUEST_MODE) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4957,8 +5064,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) */ WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; } #endif @@ -4973,7 +5079,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. @@ -5007,7 +5113,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); } @@ -5465,10 +5571,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_guest_mode(vcpu)) { - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; - } + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) @@ -5482,16 +5586,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nmi_known_unmasked) + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (vmx->loaded_vmcs->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->nmi_known_unmasked = !masked; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -5664,14 +5773,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + /* EPT won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, + true); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -7082,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { if (vmx->nested.current_vmptr == -1ull) return; - /* current_vmptr and current_vmcs12 are always set/reset together */ - if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) - return; - if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ - memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, - VMCS12_SIZE); + kvm_vcpu_write_guest_page(&vmx->vcpu, + vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; } /* @@ -7123,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); - nested_release_vmcs12(vmx); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; if (vmx->nested.msr_bitmap) { free_page((unsigned long)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); vmx->vmcs01.shadow_vmcs = NULL; @@ -7214,25 +7320,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - /* * Read a vmcs12 field. Since these can have varying lengths and we return * one type, we chose the biggest type (u64) and zero-extend the return value @@ -7546,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_release_vmcs12(vmx); - vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; /* * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, - vmx->nested.current_vmcs12, VMCS12_SIZE); + memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); + kunmap(page); + nested_release_page_clean(page); + set_current_vmptr(vmx, vmptr); } @@ -7986,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). */ -static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) +static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) { u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_reason = vmx->exit_reason; trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, vmcs_readl(EXIT_QUALIFICATION), @@ -8000,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); + /* + * The host physical addresses of some pages of guest memory + * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU + * may write to these pages via their host physical address while + * L2 is running, bypassing any address-translation-based dirty + * tracking (e.g. EPT write protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + if (vmx->nested.nested_run_pending) return false; @@ -8014,7 +8112,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) - return enable_ept; + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; @@ -8136,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) } } +static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * At this point, the exit interruption info in exit_intr_info + * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT + * we need to query the in-kernel LAPIC. + */ + WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { *info1 = vmcs_readl(EXIT_QUALIFICATION); @@ -8382,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu, exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; - } + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@ -8418,9 +8535,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } return 0; } @@ -8611,17 +8734,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; + + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -8700,7 +8830,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->nmi_known_unmasked) + if (vmx->loaded_vmcs->nmi_known_unmasked) return; /* * Can't use vmx->exit_intr_info since we're not sure what @@ -8724,7 +8854,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmx->nmi_known_unmasked = + vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } @@ -8883,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - __write_pkru(vmx->guest_pkru); + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -9032,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * back on host, so it is safe to read guest PKRU from current * XSAVE. */ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; } /* @@ -9177,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; @@ -9463,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - else + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) { + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); + } else { kvm_inject_page_fault(vcpu, fault); + } } static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, @@ -9589,23 +9721,26 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - u64 addr; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { - WARN_ON(1); - return -EINVAL; - } - maxphyaddr = cpuid_maxphyaddr(vcpu); - - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || - ((addr + PAGE_SIZE) >> maxphyaddr)) + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) return -EINVAL; return 0; @@ -9993,6 +10128,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } @@ -10017,13 +10154,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. - */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10047,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. - * - * A problem with this approach (when !enable_ept) is that L1 may be - * injected with more page faults than it asked for. This could have - * caused problems, but in practice existing hypervisors don't care. - * To fix this, we will need to emulate the PFEC checking (on the L1 - * page tables), using walk_addr(), when injecting PFs to L1. */ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, enable_ept ? vmcs12->page_fault_error_code_mask : 0); @@ -10293,6 +10420,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10429,8 +10559,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return 1; } - vmcs12->launch_state = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -10448,6 +10576,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); u32 exit_qual; int ret; @@ -10472,6 +10601,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) { + nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + goto out; + } + if (vmcs12->launch_state == launch) { nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS @@ -10792,18 +10927,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_exit_reason = exit_reason; vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - if ((vmcs12->vm_exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; @@ -10884,7 +11016,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu); } - + /* Restore posted intr vector. */ + if (nested_cpu_has_posted_intr(vmcs12)) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@ -10990,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - && nested_exit_intr_ack_set(vcpu)) { + /* + * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of + * the VM-exit interrupt information (valid interrupt) is always set to + * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need + * kvm_cpu_has_interrupt(). See the commit message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); vmcs12->vm_exit_intr_info = irq | @@ -11534,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, |