Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 714
1 file changed, 282 insertions, 432 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 22411f4aff53..81ed596e4454 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -68,10 +68,13 @@
 #include "vmcs12.h"
 #include "vmx.h"
 #include "x86.h"
+#include "x86_ops.h"
 #include "smm.h"
 #include "vmx_onhyperv.h"
+#include "posted_intr.h"
 
 MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
 MODULE_LICENSE("GPL");
 
 #ifdef MODULE
@@ -257,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 		return 0;
 	}
 
-	if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+	if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 		return 0;
 	}
@@ -402,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
 	 * and VM-Exit.
 	 */
 	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
-				(host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+				(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
 				!boot_cpu_has_bug(X86_BUG_MDS) &&
 				!boot_cpu_has_bug(X86_BUG_TAA);
 
@@ -522,16 +525,10 @@ static const struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };
 
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
-	vmx->segment_cache.bitmask = 0;
-}
-
 static unsigned long host_idt_base;
 
 #if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
 static bool __read_mostly enlightened_vmcs = true;
 module_param(enlightened_vmcs, bool, 0444);
 
@@ -581,9 +578,8 @@ static __init void hv_init_evmcs(void)
 	}
 
 	if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
-		vmx_x86_ops.enable_l2_tlb_flush
+		vt_x86_ops.enable_l2_tlb_flush
 			= hv_enable_l2_tlb_flush;
-
 	} else {
 		enlightened_vmcs = false;
 	}
@@ -755,7 +751,7 @@ fault:
 	return -EIO;
 }
 
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
 	struct loaded_vmcs *v;
@@ -874,6 +870,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 	/*
+	 * #VE isn't used for VMX.  To test against unexpected changes
+	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
+	 */
+	if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+		eb |= 1u << VE_VECTOR;
+	/*
 	 * Guest access to VMware backdoor ports could legitimately
 	 * trigger #GP because of TSS I/O permission bitmap.
 	 * We intercept those #GP and allow access to them anyway
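The kvm_host.* conversions above (and the kvm_host.efer and kvm_host.maxphyaddr uses later in this diff) replace individual host_* globals with one shared cache of host register values. A minimal sketch of that pattern, assuming the struct matches KVM's kvm_host_values, recalled from memory and trimmed to the fields this file touches:

struct kvm_host_values_sketch {
	u8 maxphyaddr;		/* MAXPHYADDR reported by CPUID */
	u64 efer;		/* host IA32_EFER, read once at init */
	u64 arch_capabilities;	/* host IA32_ARCH_CAPABILITIES */
};

/* filled once during module init and read-only afterwards, e.g.:
 *	rdmsrl(MSR_EFER, kvm_host.efer);
 */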
@@ -1118,12 +1120,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
	 * atomically, since it's faster than switching it manually.
	 */
 	if (cpu_has_load_ia32_efer() ||
-	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
 		if (!(guest_efer & EFER_LMA))
 			guest_efer &= ~EFER_LME;
-		if (guest_efer != host_efer)
+		if (guest_efer != kvm_host.efer)
 			add_atomic_switch_msr(vmx, MSR_EFER,
-					      guest_efer, host_efer, false);
+					      guest_efer, kvm_host.efer, false);
 		else
 			clear_atomic_switch_msr(vmx, MSR_EFER);
 		return false;
@@ -1136,7 +1138,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
 		clear_atomic_switch_msr(vmx, MSR_EFER);
 
 		guest_efer &= ~ignore_bits;
-		guest_efer |= host_efer & ignore_bits;
+		guest_efer |= kvm_host.efer & ignore_bits;
 
 		vmx->guest_uret_msrs[i].data = guest_efer;
 		vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1406,6 +1408,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 }
 #endif
 
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned int old = vmx->ple_window;
+
+	vmx->ple_window = __grow_ple_window(old, ple_window,
+					    ple_window_grow,
+					    ple_window_max);
+
+	if (vmx->ple_window != old) {
+		vmx->ple_window_dirty = true;
+		trace_kvm_ple_window_update(vcpu->vcpu_id,
+					    vmx->ple_window, old);
+	}
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned int old = vmx->ple_window;
+
+	vmx->ple_window = __shrink_ple_window(old, ple_window,
+					      ple_window_shrink,
+					      ple_window);
+
+	if (vmx->ple_window != old) {
+		vmx->ple_window_dirty = true;
+		trace_kvm_ple_window_update(vcpu->vcpu_id,
+					    vmx->ple_window, old);
+	}
+}
+
 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 			struct loaded_vmcs *buddy)
 {
@@ -1477,10 +1511,13 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+		shrink_ple_window(vcpu);
+
 	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
 	vmx_vcpu_pi_load(vcpu, cpu);
@@ -1488,7 +1525,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	vmx_vcpu_pi_put(vcpu);
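grow_ple_window() and shrink_ple_window() move up next to vmx_vcpu_load(), which now shrinks the PLE window when a vCPU is reloaded after being scheduled out (replacing the vmx_sched_in() hook that is dropped later in this diff). A stand-alone sketch of the arithmetic behind the shared __grow_ple_window()/__shrink_ple_window() helpers, as recalled from memory: a modifier below the base scales the window, a larger one steps it linearly, and 0 disables the adjustment:

static unsigned int grow_window(unsigned int val, unsigned int base,
				unsigned int modifier, unsigned int max)
{
	u64 ret = val;

	if (modifier < 1)
		return base;		/* growing disabled */
	if (modifier < base)
		ret *= modifier;	/* e.g. 4096 -> 8192 with modifier 2 */
	else
		ret += modifier;
	return min(ret, (u64)max);
}

static unsigned int shrink_window(unsigned int val, unsigned int base,
				  unsigned int modifier, unsigned int floor)
{
	if (modifier < 1)
		return base;		/* shrinking disabled */
	if (modifier < base)
		val /= modifier;
	else
		val -= modifier;
	return max(val, floor);
}

PAUSE-loop exits grow the window so a spinning vCPU yields less often; reloads shrink it back so a freshly scheduled vCPU yields quickly.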
@@ -1547,7 +1584,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 		vmx->emulation_required = vmx_emulation_required(vcpu);
 }
 
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
 {
 	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
 }
@@ -1653,8 +1690,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
 	return 0;
 }
 
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
-					 void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+				  void *insn, int insn_len)
 {
 	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1738,7 +1775,7 @@ rip_updated:
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1769,7 +1806,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
 	}
 }
 
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	vmx_update_emulated_instruction(vcpu);
 	return skip_emulated_instruction(vcpu);
@@ -1788,7 +1825,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 }
 
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
 {
 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1909,12 +1946,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
 	return kvm_caps.default_tsc_scaling_ratio;
 }
 
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
 {
 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
 }
 
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
 {
 	vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
 }
@@ -1957,15 +1994,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
 	return !(msr->data & ~valid_bits);
 }
 
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
 {
-	switch (msr->index) {
+	switch (msr) {
 	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
 		if (!nested)
 			return 1;
-		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+		return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
 	default:
-		return KVM_MSR_RET_INVALID;
+		return KVM_MSR_RET_UNSUPPORTED;
 	}
 }
 
@@ -1974,7 +2011,7 @@
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmx_uret_msr *msr;
@@ -2155,7 +2192,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmx_uret_msr *msr;
@@ -2458,7 +2495,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	return ret;
 }
 
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
 	unsigned long guest_owned_bits;
 
@@ -2520,17 +2557,15 @@ static bool cpu_has_sgx(void)
 */
 static bool cpu_has_perf_global_ctrl_bug(void)
 {
-	if (boot_cpu_data.x86 == 0x6) {
-		switch (boot_cpu_data.x86_model) {
-		case INTEL_FAM6_NEHALEM_EP:	/* AAK155 */
-		case INTEL_FAM6_NEHALEM:	/* AAP115 */
-		case INTEL_FAM6_WESTMERE:	/* AAT100 */
-		case INTEL_FAM6_WESTMERE_EP:	/* BC86,AAY89,BD102 */
-		case INTEL_FAM6_NEHALEM_EX:	/* BA97 */
-			return true;
-		default:
-			break;
-		}
+	switch (boot_cpu_data.x86_vfm) {
+	case INTEL_NEHALEM_EP:	/* AAK155 */
+	case INTEL_NEHALEM:	/* AAP115 */
+	case INTEL_WESTMERE:	/* AAT100 */
+	case INTEL_WESTMERE_EP:	/* BC86,AAY89,BD102 */
+	case INTEL_NEHALEM_EX:	/* BA97 */
+		return true;
+	default:
+		break;
 	}
 
 	return false;
@@ -2566,13 +2601,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			     struct vmx_capability *vmx_cap)
 {
-	u32 vmx_msr_low, vmx_msr_high;
 	u32 _pin_based_exec_control = 0;
 	u32 _cpu_based_exec_control = 0;
 	u32 _cpu_based_2nd_exec_control = 0;
 	u64 _cpu_based_3rd_exec_control = 0;
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
+	u64 basic_msr;
 	u64 misc_msr;
 	int i;
 
@@ -2606,6 +2641,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
				&_cpu_based_2nd_exec_control))
 			return -EIO;
 	}
+	if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
 #ifndef CONFIG_X86_64
 	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2630,6 +2668,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 			return -EIO;
 
 		vmx_cap->ept = 0;
+		_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 	}
 	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
	    vmx_cap->vpid) {
@@ -2691,29 +2730,29 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
 		_vmexit_control &= ~x_ctrl;
 	}
 
-	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+	rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
 
 	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
-	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+	if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
 		return -EIO;
 
 #ifdef CONFIG_X86_64
-	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
-	if (vmx_msr_high & (1u<<16))
+	/*
+	 * KVM expects to be able to shove all legal physical addresses into
+	 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+	 * 0 for processors that support Intel 64 architecture".
+	 */
+	if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
 		return -EIO;
 #endif
 
	/* Require Write-Back (WB) memory type for VMCS accesses. */
-	if (((vmx_msr_high >> 18) & 15) != 6)
+	if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
 		return -EIO;
 
 	rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
 
-	vmcs_conf->size = vmx_msr_high & 0x1fff;
-	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
-	vmcs_conf->revision_id = vmx_msr_low;
-
+	vmcs_conf->basic = basic_msr;
 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
 	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
 	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
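setup_vmcs_config() now reads IA32_VMX_BASIC as one 64-bit value, stores it whole in vmcs_config.basic, and replaces the open-coded shifts against the high dword with vmx_basic_vmcs_size()/vmx_basic_vmcs_mem_type()/vmx_basic_vmcs_revision_id() accessors. A self-contained sketch of those decoders, with field positions from the SDM (bits 30:0 revision ID, bits 44:32 VMCS region size, bit 48 the 32-bit-addresses-only flag, bits 53:50 the memory type, where 6 is write-back); the names here are illustrative:

#include <stdint.h>

#define BASIC_32BIT_PHYS_ADDR_ONLY	(1ULL << 48)

static inline uint32_t basic_vmcs_revision_id(uint64_t basic)
{
	return basic & 0x7fffffff;	/* bits 30:0 */
}

static inline uint32_t basic_vmcs_size(uint64_t basic)
{
	return (basic >> 32) & 0x1fff;	/* bits 44:32 */
}

static inline int basic_vmcs_mem_type(uint64_t basic)
{
	return (basic >> 50) & 0xf;	/* bits 53:50, 6 == WB */
}

The old code tested (vmx_msr_high >> 18) & 15, i.e. bit 50 of the full MSR, so the two forms are equivalent.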
@@ -2759,7 +2798,7 @@ static bool kvm_is_vmx_supported(void)
 	return supported;
 }
 
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
 {
 	int cpu = raw_smp_processor_id();
 	struct vmcs_config vmcs_conf;
@@ -2801,7 +2840,7 @@ fault:
 	return -EFAULT;
 }
 
-static int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2825,9 +2864,6 @@ static int vmx_hardware_enable(void)
 		return r;
 	}
 
-	if (enable_ept)
-		ept_sync_global();
-
 	return 0;
 }
 
@@ -2841,7 +2877,7 @@ static void vmclear_local_loaded_vmcss(void)
 		__loaded_vmcs_clear(v);
 }
 
-static void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
 {
 	vmclear_local_loaded_vmcss();
 
@@ -2863,13 +2899,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
-	memset(vmcs, 0, vmcs_config.size);
+	memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
 
 	/* KVM supports Enlightened VMCS v1 only */
 	if (kvm_is_using_evmcs())
 		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
 	else
-		vmcs->hdr.revision_id = vmcs_config.revision_id;
+		vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
 
 	if (shadow)
 		vmcs->hdr.shadow_vmcs = 1;
@@ -2962,7 +2998,7 @@ static __init int alloc_kvm_area(void)
		 * physical CPU.
		 */
 		if (kvm_is_using_evmcs())
-			vmcs->hdr.revision_id = vmcs_config.revision_id;
+			vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
 
 		per_cpu(vmxarea, cpu) = vmcs;
 	}
@@ -3155,7 +3191,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -3185,7 +3221,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->vpid;
 }
 
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	u64 root_hpa = mmu->root.hpa;
@@ -3201,7 +3237,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
 	vpid_sync_context(vmx_get_current_vpid(vcpu));
 }
 
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 {
 	/*
	 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3210,7 +3246,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
 	vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
 }
 
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
 {
 	/*
	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3255,7 +3291,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
			   CPU_BASED_CR3_STORE_EXITING)
 
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	if (is_guest_mode(vcpu))
 		return nested_guest_cr0_valid(vcpu, cr0);
@@ -3376,8 +3412,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 	return eptp;
 }
 
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
-			     int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
 {
 	struct kvm *kvm = vcpu->kvm;
 	bool update_guest_cr3 = true;
@@ -3406,8 +3441,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	/*
	 * We operate under the default treatment of SMM, so VMX cannot be
@@ -3523,7 +3557,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
 	var->g = (ar >> 15) & 1;
 }
 
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	struct kvm_segment s;
 
@@ -3600,14 +3634,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
 	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
 }
 
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
 {
 	__vmx_set_segment(vcpu, var, seg);
 
 	to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
 }
 
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
 	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
 
@@ -3615,25 +3649,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 	*l = (ar >> 13) & 1;
 }
 
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
 	dt->address = vmcs_readl(GUEST_IDTR_BASE);
 }
 
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
 	vmcs_writel(GUEST_IDTR_BASE, dt->address);
 }
 
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
 	dt->address = vmcs_readl(GUEST_GDTR_BASE);
 }
 
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
 	vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -4101,27 +4135,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 	}
 }
 
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic_page;
-	u32 vppr;
-	int rvi;
-
-	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
-	    !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-	    WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
-		return false;
-
-	rvi = vmx_get_rvi();
-
-	vapic_page = vmx->nested.virtual_apic_map.hva;
-	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
-	return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 i;
@@ -4201,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	/*
+	 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+	 * and freed, and must not be accessed outside of vcpu->mutex.  The
+	 * vCPU's cached PI NV is valid if and only if posted interrupts are
+	 * enabled in its vmcs12, i.e. checking the vector also checks that
+	 * L1 has enabled posted interrupts for L2.
+	 */
 	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
 		/*
@@ -4265,8 +4286,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 	return 0;
 }
 
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
-				  int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+			   int trig_mode, int vector)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
@@ -4350,7 +4371,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	}
 
 	if (cpu_has_load_ia32_efer())
-		vmcs_write64(HOST_IA32_EFER, host_efer);
+		vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
 }
 
 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4428,7 +4449,7 @@ static u32 vmx_vmexit_ctrl(void)
	       ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
		 VM_EXIT_LOAD_IA32_EFER);
 }
 
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -4594,6 +4615,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
 		enable_unrestricted_guest = 0;
 	}
 	if (!enable_unrestricted_guest)
@@ -4692,7 +4714,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
 	return 0;
 }
 
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
 {
 	return vmx_alloc_ipiv_pid_table(kvm);
 }
@@ -4717,8 +4739,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
 
 	exec_controls_set(vmx, vmx_exec_control(vmx));
 
-	if (cpu_has_secondary_exec_ctrls())
+	if (cpu_has_secondary_exec_ctrls()) {
 		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+		if (vmx->ve_info)
+			vmcs_write64(VE_INFORMATION_ADDRESS,
+				     __pa(vmx->ve_info));
+	}
 
 	if (cpu_has_tertiary_exec_ctrls())
 		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4844,10 +4870,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
 	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
-	vmx->pi_desc.sn = 1;
+	__pi_set_sn(&vmx->pi_desc);
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
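__pi_set_sn() replaces the direct bitfield write to the posted-interrupt descriptor's suppress-notification flag. A sketch of the descriptor it operates on, with the layout per the SDM (a 256-bit pending-interrupt bitmap followed by a control field whose bits 0 and 1 are ON and SN); the helper shown is an assumption about what __pi_set_sn() does, not a copy of posted_intr.h:

struct pi_desc_sketch {
	u32 pir[8];	/* one pending bit per interrupt vector */
	union {
		struct {
			u16	on : 1,		/* outstanding notification */
				sn : 1,		/* suppress notification */
				rsvd : 14;
			u8	nv;		/* notification vector */
			u8	rsvd2;
			u32	ndst;		/* notification destination */
		};
		u64 control;
	};
	u32 rsvd_full[6];
} __aligned(64);

static inline void pi_set_sn_sketch(struct pi_desc_sketch *pi)
{
	/* SN is bit 1 of the control field; hardware may update the
	 * descriptor concurrently, so prefer a bitop over pi->sn = 1. */
	set_bit(1, (unsigned long *)&pi->control);
}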
@@ -4862,9 +4888,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmx->hv_deadline_tsc = -1;
 	kvm_set_cr8(vcpu, 0);
 
-	vmx_segment_cache_clear(vmx);
-	kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
-
 	seg_setup(VCPU_SREG_CS);
 	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
 	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
@@ -4891,6 +4914,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmcs_writel(GUEST_IDTR_BASE, 0);
 	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 
+	vmx_segment_cache_clear(vmx);
+	kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
 	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -4906,12 +4932,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmx_update_fb_clear_dis(vcpu, vmx);
 }
 
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
 }
 
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	if (!enable_vnmi ||
	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4922,7 +4948,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
 	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
 }
 
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	uint32_t intr;
@@ -4950,7 +4976,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
 	vmx_clear_hlt(vcpu);
 }
 
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -5028,7 +5054,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
		  GUEST_INTR_STATE_NMI));
 }
 
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return -EBUSY;
@@ -5040,17 +5066,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 	return !vmx_nmi_blocked(vcpu);
 }
 
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
 		return false;
 
-	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
-	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+	return __vmx_interrupt_blocked(vcpu);
 }
 
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
 	if (to_vmx(vcpu)->nested.nested_run_pending)
 		return -EBUSY;
@@ -5065,7 +5096,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 	return !vmx_interrupt_blocked(vcpu);
 }
 
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	void __user *ret;
 
@@ -5085,7 +5116,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 	return init_rmode_tss(kvm, ret);
 }
 
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
 {
 	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
 	return 0;
@@ -5206,6 +5237,16 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 	if (is_invalid_opcode(intr_info))
 		return handle_ud(vcpu);
 
+	if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+		struct vmx_ve_information *ve_info = vmx->ve_info;
+
+		WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+			  "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+		dump_vmcs(vcpu);
+		kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+		return 1;
+	}
+
 	error_code = 0;
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5371,8 +5412,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	return kvm_fast_pio(vcpu, size, port, in);
 }
 
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 {
 	/*
	 * Patch in the VMCALL instruction:
@@ -5578,7 +5618,7 @@ out:
 	return kvm_complete_insn_gp(vcpu, err);
 }
 
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
 	get_debugreg(vcpu->arch.db[0], 0);
 	get_debugreg(vcpu->arch.db[1], 1);
@@ -5597,7 +5637,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 	set_debugreg(DR6_RESERVED, 6);
 }
 
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 {
 	vmcs_writel(GUEST_DR7, val);
 }
@@ -5767,10 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
		      ? PFERR_PRESENT_MASK : 0;
 
-	error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
-		      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
-	vcpu->arch.exit_qualification = exit_qualification;
+	if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
+		error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+			      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
 	/*
	 * Check that the GPA doesn't exceed physical memory limits, as that is
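The handle_ept_violation() hunk above stops caching exit_qualification on the vCPU and only synthesizes PFERR_GUEST_FINAL/PFERR_GUEST_PAGE when the exit actually carries a valid guest linear address. A compact sketch of the full qualification-to-error-code translation, with bit positions per the SDM (bit 0 read, bit 1 write, bit 2 fetch, bits 5:3 the permissions of the faulting GPA, bit 7 "GVA valid", bit 8 "fault during translation"); the PFERR_* names mirror KVM's:

static u64 ept_qual_to_pferr(unsigned long qual)
{
	u64 err = 0;

	err |= (qual & BIT(0)) ? PFERR_USER_MASK  : 0;	/* read access */
	err |= (qual & BIT(1)) ? PFERR_WRITE_MASK : 0;	/* write access */
	err |= (qual & BIT(2)) ? PFERR_FETCH_MASK : 0;	/* instruction fetch */
	err |= (qual & GENMASK(5, 3)) ? PFERR_PRESENT_MASK : 0;

	/* FINAL vs. PAGE is only meaningful when a GVA exists at all */
	if (qual & BIT(7))
		err |= (qual & BIT(8)) ? PFERR_GUEST_FINAL_MASK
				       : PFERR_GUEST_PAGE_MASK;
	return err;
}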
@@ -5868,7 +5907,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
 {
 	if (vmx_emulation_required_with_pending_exception(vcpu)) {
 		kvm_prepare_emulation_failure_exit(vcpu);
@@ -5878,38 +5917,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned int old = vmx->ple_window;
-
-	vmx->ple_window = __grow_ple_window(old, ple_window,
-					    ple_window_grow,
-					    ple_window_max);
-
-	if (vmx->ple_window != old) {
-		vmx->ple_window_dirty = true;
-		trace_kvm_ple_window_update(vcpu->vcpu_id,
-					    vmx->ple_window, old);
-	}
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned int old = vmx->ple_window;
-
-	vmx->ple_window = __shrink_ple_window(old, ple_window,
-					      ple_window_shrink,
-					      ple_window);
-
-	if (vmx->ple_window != old) {
-		vmx->ple_window_dirty = true;
-		trace_kvm_ple_window_update(vcpu->vcpu_id,
-					    vmx->ple_window, old);
-	}
-}
-
 /*
 * Indicate a busy-waiting vcpu in spinlock.  We do not enable the PAUSE
 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6156,9 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
-			      u64 *info1, u64 *info2,
-			      u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+		       u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6416,6 +6422,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
 	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
+	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+		struct vmx_ve_information *ve_info = vmx->ve_info;
+		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+		/*
+		 * If KVM is dumping the VMCS, then something has gone wrong
+		 * already.  Dereferencing an address from the VMCS, which could
+		 * very well be corrupted, is a terrible idea.  The virtual
+		 * address is known so use it.
+		 */
+		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+		       ve_info->exit_reason, ve_info->delivery,
+		       ve_info->exit_qualification,
+		       ve_info->guest_linear_address,
+		       ve_info->guest_physical_address, ve_info->eptp_index);
+	}
 }
 
 /*
@@ -6601,7 +6625,7 @@ unexpected_vmexit:
 	return 0;
 }
 
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	int ret = __vmx_handle_exit(vcpu, exit_fastpath);
 
@@ -6641,9 +6665,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 	bool flush_l1d;
 
 	/*
-	 * Clear the per-vcpu flush bit, it gets set again
-	 * either from vcpu_run() or from one of the unsafe
-	 * VMEXIT handlers.
+	 * Clear the per-vcpu flush bit, it gets set again if the vCPU
+	 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+	 * exits to userspace, or if KVM reaches one of the unsafe
+	 * VMEXIT handlers, e.g. if KVM calls into the emulator.
	 */
 	flush_l1d = vcpu->arch.l1tf_flush_l1d;
 	vcpu->arch.l1tf_flush_l1d = false;
@@ -6689,7 +6714,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
		: "eax", "ebx", "ecx", "edx");
 }
 
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	int tpr_threshold;
@@ -6759,7 +6784,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
 	vmx_update_msr_bitmap_x2apic(vcpu);
 }
 
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
 {
 	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
 	struct kvm *kvm = vcpu->kvm;
@@ -6828,7 +6853,7 @@ out:
 	kvm_release_pfn_clean(pfn);
 }
 
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(int max_isr)
 {
 	u16 status;
 	u8 old;
@@ -6862,7 +6887,7 @@ static void vmx_set_rvi(int vector)
 	}
 }
 
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 {
 	/*
	 * When running L2, updating RVI is only relevant when
@@ -6876,7 +6901,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
		vmx_set_rvi(max_irr);
 }
 
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int max_irr;
@@ -6922,7 +6947,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	return max_irr;
 }
 
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
 	if (!kvm_vcpu_apicv_active(vcpu))
		return;
@@ -6933,7 +6958,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 }
 
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -6964,24 +6989,22 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
		rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
 }
 
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
 {
-	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
 	/* if exit due to PF check for async PF */
 	if (is_page_fault(intr_info))
-		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+		vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
 	/* if exit due to NM, handle before interrupts are enabled */
 	else if (is_nm_fault(intr_info))
-		handle_nm_fault_irqoff(&vmx->vcpu);
+		handle_nm_fault_irqoff(vcpu);
 	/* Handle machine checks before interrupts are enabled */
 	else if (is_machine_check(intr_info))
		kvm_machine_check();
 }
 
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+					     u32 intr_info)
 {
-	u32 intr_info = vmx_get_intr_info(vcpu);
 	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
 
 	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
@@ -6998,7 +7021,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
 	vcpu->arch.at_instruction_boundary = true;
 }
 
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7006,16 +7029,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
		return;
 
 	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
-		handle_external_interrupt_irqoff(vcpu);
+		handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
 	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
-		handle_exception_irqoff(vmx);
+		handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
 }
 
 /*
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 */
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
 {
 	switch (index) {
 	case MSR_IA32_SMBASE:
@@ -7138,7 +7161,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
				  IDT_VECTORING_ERROR_CODE);
 }
 
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
 	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7246,6 +7269,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
		return handle_fastpath_set_msr_irqoff(vcpu);
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+	case EXIT_REASON_HLT:
+		return handle_fastpath_hlt(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
@@ -7308,7 +7333,7 @@ out:
	guest_state_exit_irqoff();
 }
 
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long cr3, cr4;
@@ -7463,7 +7488,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
 	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
 }
 
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7472,9 +7497,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
 	free_vpid(vmx->vpid);
 	nested_vmx_free_vcpu(vcpu);
 	free_loaded_vmcs(vmx->loaded_vmcs);
+	free_page((unsigned long)vmx->ve_info);
 }
 
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
 {
 	struct vmx_uret_msr *tsx_ctrl;
 	struct vcpu_vmx *vmx;
@@ -7565,6 +7591,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
		goto free_vmcs;
	}
 
+	err = -ENOMEM;
+	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+		struct page *page;
+
+		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+		/* ve_info must be page aligned. */
+		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+		if (!page)
+			goto free_vmcs;
+
+		vmx->ve_info = page_to_virt(page);
+	}
+
 	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
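vmx_vcpu_create() now allocates a page-aligned #VE information area whenever the EPT-violation-#VE control is available, paired with the VE_INFORMATION_ADDRESS write in init_vmcs() and the free_page() in vmx_vcpu_free() above. A sketch of what that page holds, per the SDM's virtualization-exception information layout; the field names mirror the diff's vmx_ve_information but are reproduced from memory:

struct ve_info_sketch {
	u32 exit_reason;		/* e.g. 48 == EPT violation */
	u32 delivery;			/* 0xFFFFFFFF while owned by the CPU;
					 * must be cleared to re-arm #VE */
	u64 exit_qualification;
	u64 guest_linear_address;
	u64 guest_physical_address;
	u16 eptp_index;
};

The same field order shows up in dump_vmcs()'s "ve_info:" line earlier in this diff.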
@@ -7583,7 +7623,7 @@ free_vpid:
 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
 
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
 {
	if (!ple_gap)
		kvm->arch.pause_in_guest = true;
@@ -7614,41 +7654,25 @@ static int vmx_vm_init(struct kvm *kvm)
	return 0;
 }
 
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
-	 * memory aliases with conflicting memory types and sometimes MCEs.
-	 * We have to be careful as to what are honored and when.
-	 *
-	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
-	 * UC.  The effective memory type is UC or WC depending on guest PAT.
-	 * This was historically the source of MCEs and we want to be
-	 * conservative.
-	 *
-	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
-	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
-	 * EPT memory type is set to WB.  The effective memory type is forced
-	 * WB.
-	 *
-	 * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
-	 * EPT memory type is used to emulate guest CD/MTRR.
+	/*
+	 * Force UC for host MMIO regions, as allowing the guest to access MMIO
+	 * with cacheable accesses will result in Machine Checks.
	 */
-
	if (is_mmio)
		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
 
+	/*
+	 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+	 * device attached.  Letting the guest control memory types on Intel
+	 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
+	 * the guest to behave only as a last resort.
+	 */
	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
 
-	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
-		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-			return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
-		else
-			return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
-				VMX_EPT_IPAT_BIT;
-	}
-
-	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
 }
 
 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
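The rewritten vmx_get_mt_mask() drops all guest CD/MTRR tracking: UC for MMIO, WB everywhere else, with guest PAT honored only when non-coherent DMA makes it matter. A stand-alone sketch of how the returned bits land in an EPT leaf (memory type in bits 5:3, "ignore PAT" in bit 6; MTRR type values 0 == UC, 6 == WB):

#include <stdbool.h>
#include <stdint.h>

#define EPT_MT_SHIFT	3		/* VMX_EPT_MT_EPTE_SHIFT */
#define EPT_IPAT	(1ULL << 6)	/* VMX_EPT_IPAT_BIT */

static uint64_t ept_memtype_bits(bool is_mmio, bool noncoherent_dma)
{
	if (is_mmio)
		return 0ULL << EPT_MT_SHIFT;	/* UC: cacheable MMIO risks #MC */
	if (!noncoherent_dma)
		return (6ULL << EPT_MT_SHIFT) | EPT_IPAT; /* WB, guest PAT ignored */
	return 6ULL << EPT_MT_SHIFT;	/* WB, combined with guest PAT */
}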
@@ -7786,7 +7810,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
 }
 
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -7947,6 +7971,7 @@ static __init void vmx_set_cpu_caps(void)
		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+		kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
	}
 
	if (vmx_umip_emulated())
@@ -8001,10 +8026,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
 }
 
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
-			       struct x86_instruction_info *info,
-			       enum x86_intercept_stage stage,
-			       struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+			struct x86_instruction_info *info,
+			enum x86_intercept_stage stage,
+			struct x86_exception *exception)
 {
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
@@ -8084,8 +8109,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
	return 0;
 }
 
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
-			    bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+		     bool *expired)
 {
	struct vcpu_vmx *vmx;
	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8124,18 +8149,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
	return 0;
 }
 
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
 {
	to_vmx(vcpu)->hv_deadline_tsc = -1;
 }
 #endif
 
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
-	if (!kvm_pause_in_guest(vcpu->kvm))
-		shrink_ple_window(vcpu);
-}
-
 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8159,7 +8178,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
		secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
 }
 
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
 {
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8170,7 +8189,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
 }
 
 #ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 {
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8178,7 +8197,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
	return !is_smm(vcpu);
 }
 
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -8199,7 +8218,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
	return 0;
 }
 
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;
@@ -8220,18 +8239,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	return 0;
 }
 
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
 {
	/* RSM will cause a vmexit anyway.  */
 }
 #endif
 
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
 {
	return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
 }
 
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
 {
	if (is_guest_mode(vcpu)) {
		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8241,7 +8260,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
	}
 }
 
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
 {
	kvm_set_posted_intr_wakeup_handler(NULL);
 
@@ -8251,18 +8270,7 @@ static void vmx_hardware_unsetup(void)
	free_kvm_area();
 }
 
-#define VMX_REQUIRED_APICV_INHIBITS			\
-(							\
-	BIT(APICV_INHIBIT_REASON_DISABLE)|		\
-	BIT(APICV_INHIBIT_REASON_ABSENT) |		\
-	BIT(APICV_INHIBIT_REASON_HYPERV) |		\
-	BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |		\
-	BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) |	\
-	BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |	\
-	BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)	\
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
 {
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
 
@@ -8313,148 +8321,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
	return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
 }
 
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
-	.name = KBUILD_MODNAME,
-
-	.check_processor_compatibility = vmx_check_processor_compat,
-
-	.hardware_unsetup = vmx_hardware_unsetup,
-
-	.hardware_enable = vmx_hardware_enable,
-	.hardware_disable = vmx_hardware_disable,
-	.has_emulated_msr = vmx_has_emulated_msr,
-
-	.vm_size = sizeof(struct kvm_vmx),
-	.vm_init = vmx_vm_init,
-	.vm_destroy = vmx_vm_destroy,
-
-	.vcpu_precreate = vmx_vcpu_precreate,
-	.vcpu_create = vmx_vcpu_create,
-	.vcpu_free = vmx_vcpu_free,
-	.vcpu_reset = vmx_vcpu_reset,
-
-	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
-	.vcpu_load = vmx_vcpu_load,
-	.vcpu_put = vmx_vcpu_put,
-
-	.update_exception_bitmap = vmx_update_exception_bitmap,
-	.get_msr_feature = vmx_get_msr_feature,
-	.get_msr = vmx_get_msr,
-	.set_msr = vmx_set_msr,
-	.get_segment_base = vmx_get_segment_base,
-	.get_segment = vmx_get_segment,
-	.set_segment = vmx_set_segment,
-	.get_cpl = vmx_get_cpl,
-	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-	.is_valid_cr0 = vmx_is_valid_cr0,
-	.set_cr0 = vmx_set_cr0,
-	.is_valid_cr4 = vmx_is_valid_cr4,
-	.set_cr4 = vmx_set_cr4,
-	.set_efer = vmx_set_efer,
-	.get_idt = vmx_get_idt,
-	.set_idt = vmx_set_idt,
-	.get_gdt = vmx_get_gdt,
-	.set_gdt = vmx_set_gdt,
-	.set_dr7 = vmx_set_dr7,
-	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
-	.cache_reg = vmx_cache_reg,
-	.get_rflags = vmx_get_rflags,
-	.set_rflags = vmx_set_rflags,
-	.get_if_flag = vmx_get_if_flag,
-
-	.flush_tlb_all = vmx_flush_tlb_all,
-	.flush_tlb_current = vmx_flush_tlb_current,
-	.flush_tlb_gva = vmx_flush_tlb_gva,
-	.flush_tlb_guest = vmx_flush_tlb_guest,
-
-	.vcpu_pre_run = vmx_vcpu_pre_run,
-	.vcpu_run = vmx_vcpu_run,
-	.handle_exit = vmx_handle_exit,
-	.skip_emulated_instruction = vmx_skip_emulated_instruction,
-	.update_emulated_instruction = vmx_update_emulated_instruction,
-	.set_interrupt_shadow = vmx_set_interrupt_shadow,
-	.get_interrupt_shadow = vmx_get_interrupt_shadow,
-	.patch_hypercall = vmx_patch_hypercall,
-	.inject_irq = vmx_inject_irq,
-	.inject_nmi = vmx_inject_nmi,
-	.inject_exception = vmx_inject_exception,
-	.cancel_injection = vmx_cancel_injection,
-	.interrupt_allowed = vmx_interrupt_allowed,
-	.nmi_allowed = vmx_nmi_allowed,
-	.get_nmi_mask = vmx_get_nmi_mask,
-	.set_nmi_mask = vmx_set_nmi_mask,
-	.enable_nmi_window = vmx_enable_nmi_window,
-	.enable_irq_window = vmx_enable_irq_window,
-	.update_cr8_intercept = vmx_update_cr8_intercept,
-	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
-	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
-	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
-	.load_eoi_exitmap = vmx_load_eoi_exitmap,
-	.apicv_pre_state_restore = vmx_apicv_pre_state_restore,
-	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
-	.hwapic_irr_update = vmx_hwapic_irr_update,
-	.hwapic_isr_update = vmx_hwapic_isr_update,
-	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
-	.sync_pir_to_irr = vmx_sync_pir_to_irr,
-	.deliver_interrupt = vmx_deliver_interrupt,
-	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
-	.set_tss_addr = vmx_set_tss_addr,
-	.set_identity_map_addr = vmx_set_identity_map_addr,
-	.get_mt_mask = vmx_get_mt_mask,
-
-	.get_exit_info = vmx_get_exit_info,
-
-	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
-	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
-	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
-	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
-	.write_tsc_offset = vmx_write_tsc_offset,
-	.write_tsc_multiplier = vmx_write_tsc_multiplier,
-
-	.load_mmu_pgd = vmx_load_mmu_pgd,
-
-	.check_intercept = vmx_check_intercept,
-	.handle_exit_irqoff = vmx_handle_exit_irqoff,
-
-	.sched_in = vmx_sched_in,
-
-	.cpu_dirty_log_size = PML_ENTITY_NUM,
-	.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
-	.nested_ops = &vmx_nested_ops,
-
-	.pi_update_irte = vmx_pi_update_irte,
-	.pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
-	.set_hv_timer = vmx_set_hv_timer,
-	.cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
-	.setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
-	.smi_allowed = vmx_smi_allowed,
-	.enter_smm = vmx_enter_smm,
-	.leave_smm = vmx_leave_smm,
-	.enable_smi_window = vmx_enable_smi_window,
-#endif
-
-	.check_emulate_instruction = vmx_check_emulate_instruction,
-	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
-	.migrate_timers = vmx_migrate_timers,
-
-	.msr_filter_changed = vmx_msr_filter_changed,
-	.complete_emulated_msr = kvm_complete_insn_gp,
-
-	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
-	.get_untagged_addr = vmx_get_untagged_addr,
-};
-
 static unsigned int vmx_handle_intel_pt_intr(void)
 {
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
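With the static vmx_x86_ops table gone from this file, the callbacks are registered through a shared, mutable table that vmx.c reaches via the new "x86_ops.h" include; the renamed vmx_hardware_setup() below patches entries of vt_x86_ops in place. A hypothetical skeleton of that pattern (every name beyond vt_x86_ops is invented for illustration):

struct kvm_vcpu;

struct vt_ops_sketch {
	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
	int  (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
};

/* defined once, outside vmx.c, and shared by the module's variants */
static struct vt_ops_sketch vt_ops = {
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
};

static void vt_ops_fixup(bool flexpriority, bool apicv)
{
	/* mirrors hardware_setup(): NULL out what the CPU can't do */
	if (!flexpriority)
		vt_ops.set_apic_access_page_addr = NULL;
	if (!apicv)
		vt_ops.sync_pir_to_irr = NULL;
}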
@@ -8500,18 +8366,16 @@ static void __init vmx_setup_me_spte_mask(void)
	u64 me_mask = 0;
 
	/*
-	 * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
-	 * the former to avoid exposing shadow_phys_bits.
-	 *
	 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
-	 * shadow_phys_bits.  On MKTME and/or TDX capable systems,
+	 * kvm_host.maxphyaddr.  On MKTME and/or TDX capable systems,
	 * boot_cpu_data.x86_phys_bits holds the actual physical address
-	 * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
-	 * reported by CPUID.  Those bits between are KeyID bits.
+	 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+	 * MAXPHYADDR reported by CPUID.  Those bits between are KeyID bits.
	 */
-	if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+	if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
		me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
-				    kvm_get_shadow_phys_bits() - 1);
+				    kvm_host.maxphyaddr - 1);
+
	/*
	 * Unlike SME, host kernel doesn't support setting up any
	 * MKTME KeyID on Intel platforms.  No memory encryption
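vmx_setup_me_spte_mask() treats the bits between the kernel's usable physical width and CPUID's MAXPHYADDR as MKTME KeyID bits and reserves them in SPTEs. The arithmetic as a tiny stand-alone helper, with GENMASK_ULL standing in for KVM's rsvd_bits():

static inline u64 mktme_keyid_mask(unsigned int x86_phys_bits,
				   unsigned int maxphyaddr)
{
	if (x86_phys_bits == maxphyaddr)
		return 0;	/* no KeyID bits carved out */
	/* e.g. phys_bits 46, maxphyaddr 52 -> mask covers bits 51:46 */
	return GENMASK_ULL(maxphyaddr - 1, x86_phys_bits);
}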
@@ -8520,9 +8384,7 @@ static void __init vmx_setup_me_spte_mask(void)
	kvm_mmu_set_me_spte_mask(0, me_mask);
 }
 
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
 {
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
@@ -8591,16 +8453,16 @@ static __init int hardware_setup(void)
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
-		vmx_x86_ops.set_apic_access_page_addr = NULL;
+		vt_x86_ops.set_apic_access_page_addr = NULL;
 
	if (!cpu_has_vmx_tpr_shadow())
-		vmx_x86_ops.update_cr8_intercept = NULL;
+		vt_x86_ops.update_cr8_intercept = NULL;
 
 #if IS_ENABLED(CONFIG_HYPERV)
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
-		vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
-		vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+		vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+		vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
	}
 #endif
 
@@ -8615,7 +8477,7 @@ static __init int hardware_setup(void)
	if (!cpu_has_vmx_apicv())
		enable_apicv = 0;
	if (!enable_apicv)
-		vmx_x86_ops.sync_pir_to_irr = NULL;
+		vt_x86_ops.sync_pir_to_irr = NULL;
 
	if (!enable_apicv || !cpu_has_vmx_ipiv())
		enable_ipiv = false;
@@ -8651,7 +8513,7 @@ static __init int hardware_setup(void)
		enable_pml = 0;
 
	if (!enable_pml)
-		vmx_x86_ops.cpu_dirty_log_size = 0;
+		vt_x86_ops.cpu_dirty_log_size = 0;
 
	if (!cpu_has_vmx_preemption_timer())
		enable_preemption_timer = false;
@@ -8660,7 +8522,7 @@ static __init int hardware_setup(void)
		u64 use_timer_freq = 5000ULL * 1000 * 1000;
 
		cpu_preemption_timer_multi =
-			vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+			vmx_misc_preemption_timer_rate(vmcs_config.misc);
 
		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
@@ -8676,8 +8538,8 @@ static __init int hardware_setup(void)
	}
 
	if (!enable_preemption_timer) {
-		vmx_x86_ops.set_hv_timer = NULL;
-		vmx_x86_ops.cancel_hv_timer = NULL;
+		vt_x86_ops.set_hv_timer = NULL;
+		vt_x86_ops.cancel_hv_timer = NULL;
	}
 
	kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8688,9 +8550,9 @@ static __init int hardware_setup(void)
	if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;
	if (pt_mode == PT_MODE_HOST_GUEST)
-		vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+		vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
	else
-		vmx_init_ops.handle_intel_pt_intr = NULL;
+		vt_init_ops.handle_intel_pt_intr = NULL;
 
	setup_default_sgx_lepubkeyhash();
 
@@ -8713,14 +8575,6 @@ static __init int hardware_setup(void)
	return r;
 }
 
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
-	.hardware_setup = hardware_setup,
-	.handle_intel_pt_intr = NULL,
-
-	.runtime_ops = &vmx_x86_ops,
-	.pmu_ops = &intel_pmu_ops,
-};
-
 static void vmx_cleanup_l1d_flush(void)
 {
	if (vmx_l1d_flush_pages) {
@@ -8735,17 +8589,15 @@ static void __vmx_exit(void)
 {
	allow_smaller_maxphyaddr = false;
 
-	cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
	vmx_cleanup_l1d_flush();
 }
 
 static void vmx_exit(void)
 {
	kvm_exit();
+	__vmx_exit();
	kvm_x86_vendor_exit();
-
-	__vmx_exit();
 }
 module_exit(vmx_exit);
 
@@ -8762,7 +8614,7 @@ static int __init vmx_init(void)
	 */
	hv_init_evmcs();
 
-	r = kvm_x86_vendor_init(&vmx_init_ops);
+	r = kvm_x86_vendor_init(&vt_init_ops);
	if (r)
		return r;
 
@@ -8783,8 +8635,6 @@ static int __init vmx_init(void)
		pi_init_cpu(cpu);
	}
 
-	cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
	vmx_check_vmcs12_offsets();
 
	/*