diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 608 |
1 files changed, 354 insertions, 254 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3e556c68351b..feb852b04598 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); +static u64 __read_mostly host_xss; + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT) + | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -214,6 +216,7 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 ept_pointer; + u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; u64 guest_ia32_debugctl; @@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(EPT_POINTER, ept_pointer), + FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), @@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), }; -static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); static inline short vmcs_field_to_offset(unsigned long field) { - if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) - return -1; + BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || + vmcs_field_to_offset_table[field] == 0) + return -ENOENT; + return vmcs_field_to_offset_table[field]; } @@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); +static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); } +static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && + vmx_xsaves_supported(); +} + static inline bool is_exception(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + + /* + * On EPT, we can't emulate NX, so we must switch EFER atomically. + * On CPUs that support "load IA32_EFER", always switch EFER + * atomically, since it's faster than switching it manually. + */ + if (cpu_has_load_ia32_efer || + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + if (guest_efer != host_efer) + add_atomic_switch_msr(vmx, MSR_EFER, + guest_efer, host_efer); return false; } @@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ - nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; @@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_get_vmx_msr(vcpu, msr_index, pdata); + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + data = vcpu->arch.ia32_xss; + break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; @@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return 1; /* they are read-only */ + case MSR_IA32_XSS: + if (!vmx_xsaves_supported()) + return 1; + /* + * The only supported bit as of Skylake is bit 8, but + * it is not supported on KVM. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + if (vcpu->arch.ia32_xss != host_xss) + add_atomic_switch_msr(vmx, MSR_IA32_XSS, + vcpu->arch.ia32_xss, host_xss); + else + clear_atomic_switch_msr(vmx, MSR_IA32_XSS); + break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) return 1; @@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS; + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_XSAVES; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } + if (cpu_has_xsaves) + rdmsrl(MSR_IA32_XSS, host_xss); + return 0; } @@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void) return 0; } -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (!cpu_has_vmx_vpid()) - enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { - enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } - - if (!cpu_has_vmx_ept_ad_bits()) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest()) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) { - flexpriority_enabled = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if the - * processor does not have the APIC_ACCESS_ADDR VMCS field. - */ - kvm_x86_ops->set_apic_access_page_addr = NULL; - } - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) - ple_gap = 0; - - if (!cpu_has_vmx_apicv()) - enable_apicv = 0; - - if (enable_apicv) - kvm_x86_ops->update_cr8_intercept = NULL; - else { - kvm_x86_ops->hwapic_irr_update = NULL; - kvm_x86_ops->deliver_posted_interrupt = NULL; - kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; - } - - if (nested) - nested_vmx_setup_ctls_msrs(); - - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - static bool emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !guest_state_valid(vcpu); @@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); } +#define VMX_XSS_EXIT_BITMAP 0 /* * Sets up the vmcs for emulated real mode. */ @@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); + if (vmx_xsaves_supported()) + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + return 0; } @@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int dr, reg; + int dr, dr7, reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & DEBUG_REG_ACCESS_NUM; + + /* First, if DR does not exist, trigger UD */ + if (!kvm_require_dr(vcpu, dr)) + return 1; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; - dr = vmcs_readl(GUEST_DR7); - if (dr & DR7_GD) { + dr7 = vmcs_readl(GUEST_DR7); + if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the @@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr; - vcpu->run->debug.arch.pc = - vmcs_readl(GUEST_CS_BASE) + - vmcs_readl(GUEST_RIP); + vcpu->run->debug.arch.dr7 = dr7; + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr7 &= ~DR7_GD; + vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; } @@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; } - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; @@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsaves(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + +static int handle_xrstors(struct kvm_vcpu *vcpu) +{ + skip_emulated_instruction(vcpu); + WARN(1, "this should never happen\n"); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) } /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55); + vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); /* * TODO: What about debug traps on tss switch? @@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) trace_kvm_page_fault(gpa, exit_qualification); /* It is a write fault? */ - error_code = exit_qualification & (1U << 1); + error_code = exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ - error_code |= (exit_qualification & (1U << 2)) << 2; + error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & 0x1; + error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; vcpu->arch.exit_qualification = exit_qualification; @@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +static __init int hardware_setup(void) +{ + int r = -ENOMEM, i, msr; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); + + vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_a) + return r; + + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_io_bitmap_b) + goto out; + + vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy) + goto out1; + + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; + + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode) + goto out3; + + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; + vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmread_bitmap) + goto out5; + + vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_vmwrite_bitmap) + goto out6; + + memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); + memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); + clear_bit(0x80, vmx_io_bitmap_a); + + memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); + + memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); + memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); + + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); + vmx_disable_intercept_for_msr(MSR_GS_BASE, false); + vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); + vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); + + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } + + if (enable_ept) { + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); + + update_ple_window_actual_max(); + + if (setup_vmcs_config(&vmcs_config) < 0) { + r = -EIO; + goto out7; + } + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + + if (!cpu_has_vmx_vpid()) + enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { + enable_ept = 0; + enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; + } + + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; + + if (!cpu_has_vmx_flexpriority()) { + flexpriority_enabled = 0; + + /* + * set_apic_access_page_addr() is used to reload apic access + * page upon invalidation. No need to do anything if the + * processor does not have the APIC_ACCESS_ADDR VMCS field. + */ + kvm_x86_ops->set_apic_access_page_addr = NULL; + } + + if (!cpu_has_vmx_tpr_shadow()) + kvm_x86_ops->update_cr8_intercept = NULL; + + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + + if (!cpu_has_vmx_ple()) + ple_gap = 0; + + if (!cpu_has_vmx_apicv()) + enable_apicv = 0; + + if (enable_apicv) + kvm_x86_ops->update_cr8_intercept = NULL; + else { + kvm_x86_ops->hwapic_irr_update = NULL; + kvm_x86_ops->deliver_posted_interrupt = NULL; + kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy; + } + + if (nested) + nested_vmx_setup_ctls_msrs(); + + return alloc_kvm_area(); + +out7: + free_page((unsigned long)vmx_vmwrite_bitmap); +out6: + free_page((unsigned long)vmx_vmread_bitmap); +out5: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); +out4: + free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); +out2: + free_page((unsigned long)vmx_msr_bitmap_legacy); +out1: + free_page((unsigned long)vmx_io_bitmap_b); +out: + free_page((unsigned long)vmx_io_bitmap_a); + + return r; +} + +static __exit void hardware_unsetup(void) +{ + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_msr_bitmap_longmode); + free_page((unsigned long)vmx_io_bitmap_b); + free_page((unsigned long)vmx_io_bitmap_a); + free_page((unsigned long)vmx_vmwrite_bitmap); + free_page((unsigned long)vmx_vmread_bitmap); + + free_kvm_area(); +} + /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) +static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); char *p; if (offset < 0) - return 0; + return offset; p = ((char *)(get_vmcs12(vcpu))) + offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_NATURAL_WIDTH: *ret = *((natural_width *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U16: *ret = *((u16 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U32: *ret = *((u32 *)p); - return 1; + return 0; case VMCS_FIELD_TYPE_U64: *ret = *((u64 *)p); - return 1; + return 0; default: - return 0; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } -static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 field_value){ +static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, + unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); char *p = ((char *) get_vmcs12(vcpu)) + offset; if (offset < 0) - return false; + return offset; switch (vmcs_field_type(field)) { case VMCS_FIELD_TYPE_U16: *(u16 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U32: *(u32 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_U64: *(u64 *)p = field_value; - return true; + return 0; case VMCS_FIELD_TYPE_NATURAL_WIDTH: *(natural_width *)p = field_value; - return true; + return 0; default: - return false; /* can never happen. */ + WARN_ON(1); + return -ENOENT; } } @@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: field_value = vmcs_readl(field); break; + default: + WARN_ON(1); + continue; } vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) case VMCS_FIELD_TYPE_NATURAL_WIDTH: vmcs_writel(field, (long)field_value); break; + default: + WARN_ON(1); + break; } } } @@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (!vmcs12_read_any(vcpu, field, &field_value)) { + if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; } - if (!vmcs12_write_any(vcpu, field, field_value)) { + if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); skip_emulated_instruction(vcpu); return 1; @@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_XSAVES] = handle_xsaves, + [EXIT_REASON_XRSTORS] = handle_xrstors, }; static const int kvm_vmx_max_exit_handlers = @@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return 1; + case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + /* + * This should never happen, since it is not possible to + * set XSS to a non-zero value---neither in L1 nor in L2. + * If if it were, XSS would have to be checked against + * the XSS exit bitmap in vmcs12. + */ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); default: return 1; } @@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector) u16 status; u8 old; + if (vector == -1) + vector = 0; + status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { @@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + if (max_irr == -1) return; /* - * If a vmexit is needed, vmx_check_nested_events handles it. + * In guest mode. If a vmexit is needed, vmx_check_nested_events + * handles it. */ - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + if (nested_exit_on_intr(vcpu)) return; - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - /* - * Fall back to pre-APICv interrupt injection since L2 + * Else, fall back to pre-APICv interrupt injection since L2 * is run without virtual interrupt delivery. */ if (!kvm_event_needs_reinjection(vcpu) && @@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); } +static bool vmx_xsaves_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_XSAVES; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (vmx_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (nested_cpu_has_xsaves(vmcs12)) + vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); /* update exit information fields: */ @@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_intercept = vmx_check_intercept, .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, + .xsaves_supported = vmx_xsaves_supported, .check_nested_events = vmx_check_nested_events, @@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i, msr; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return -ENOMEM; - - r = -ENOMEM; - - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out3; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out5; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out6; - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) - goto out7; + return r; #ifdef CONFIG_KEXEC rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - - if (enable_apicv) { - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); - - /* According SDM, in x2apic mode, the whole id reg is used. - * But in KVM, it only use the highest eight bits. Need to - * intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); - } - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - update_ple_window_actual_max(); - return 0; - -out7: - free_page((unsigned long)vmx_vmwrite_bitmap); -out6: - free_page((unsigned long)vmx_vmread_bitmap); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); -out: - free_page((unsigned long)vmx_io_bitmap_a); - return r; } static void __exit vmx_exit(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); - #ifdef CONFIG_KEXEC RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); |