diff options
Diffstat (limited to 'drivers/kvm/vmx.c')
-rw-r--r-- | drivers/kvm/vmx.c | 273 |
1 files changed, 187 insertions, 86 deletions
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index fbbf9d6b299f..724db0027f00 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -17,7 +17,6 @@ #include "kvm.h" #include "vmx.h" -#include "kvm_vmx.h" #include <linux/module.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -70,6 +69,10 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, @@ -78,6 +81,19 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) +#ifdef CONFIG_X86_64 +static unsigned msr_offset_kernel_gs_base; +#define NR_64BIT_MSRS 4 +/* + * avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt + * mechanism (cpu bug AA24) + */ +#define NR_BAD_MSRS 2 +#else +#define NR_64BIT_MSRS 0 +#define NR_BAD_MSRS 0 +#endif + static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -85,6 +101,13 @@ static inline int is_page_fault(u32 intr_info) (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } +static inline int is_no_device(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); +} + static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -200,6 +223,16 @@ static void vmcs_write64(unsigned long field, u64 value) #endif } +static void vmcs_clear_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -297,6 +330,44 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) } /* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct kvm_vcpu *vcpu) +{ + int nr_skip, nr_good_msrs; + + if (is_long_mode(vcpu)) + nr_skip = NR_BAD_MSRS; + else + nr_skip = NR_64BIT_MSRS; + nr_good_msrs = vcpu->nmsrs - nr_skip; + + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + if (find_msr_entry(vcpu, MSR_K6_STAR)) { + --nr_good_msrs; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) + ++nr_good_msrs; +#endif + } + + vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, + virt_to_phys(vcpu->guest_msrs + nr_skip)); + vmcs_writel(VM_EXIT_MSR_STORE_ADDR, + virt_to_phys(vcpu->guest_msrs + nr_skip)); + vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, + virt_to_phys(vcpu->host_msrs + nr_skip)); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ +} + +/* * reads and returns guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset -- 21.3 */ @@ -712,6 +783,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); @@ -754,11 +827,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu) +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->cr0 &= KVM_GUEST_CR0_MASK; - vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK; - vcpu->cr4 &= KVM_GUEST_CR4_MASK; vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } @@ -780,22 +850,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, - (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->cr0 = cr0; -} - -/* - * Used when restoring the VM to avoid corrupting segment registers - */ -static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) - enter_rmode(vcpu); + if (!(cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK); + } - vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); - update_exception_bitmap(vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); @@ -805,6 +864,12 @@ static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); + + if (!(vcpu->cr0 & CR0_TS_MASK)) { + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + } } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -835,6 +900,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } + setup_msrs(vcpu); } #endif @@ -878,7 +944,14 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (var->unusable) + if (vcpu->rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else if (var->unusable) ar = 1 << 16; else { ar = var->type & 15; @@ -933,9 +1006,9 @@ static int init_rmode_tss(struct kvm* kvm) gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; char *page; - p1 = _gfn_to_page(kvm, fn++); - p2 = _gfn_to_page(kvm, fn++); - p3 = _gfn_to_page(kvm, fn); + p1 = gfn_to_page(kvm, fn++); + p2 = gfn_to_page(kvm, fn++); + p3 = gfn_to_page(kvm, fn); if (!p1 || !p2 || !p3) { kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); @@ -991,7 +1064,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) struct descriptor_table dt; int i; int ret = 0; - int nr_good_msrs; extern asmlinkage void kvm_vmx_return(void); if (!init_rmode_tss(vcpu->kvm)) { @@ -1136,23 +1208,17 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->host_msrs[j].reserved = 0; vcpu->host_msrs[j].data = data; vcpu->guest_msrs[j] = vcpu->host_msrs[j]; +#ifdef CONFIG_X86_64 + if (index == MSR_KERNEL_GS_BASE) + msr_offset_kernel_gs_base = j; +#endif ++vcpu->nmsrs; } - printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); - nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; - vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, - virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); - vmcs_writel(VM_EXIT_MSR_STORE_ADDR, - virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); - vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, - virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS)); + setup_msrs(vcpu); + vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ - /* 22.2.1, 20.8.1 */ vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, @@ -1164,7 +1230,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_writel(TPR_THRESHOLD, 0); #endif - vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); vcpu->cr0 = 0x60000010; @@ -1190,7 +1256,7 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) u16 sp = vmcs_readl(GUEST_RSP); u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); - if (sp > ss_limit || sp - 6 > sp) { + if (sp > ss_limit || sp < 6 ) { vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", __FUNCTION__, vmcs_readl(GUEST_RSP), @@ -1330,6 +1396,15 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm ("int $2"); return 1; } + + if (is_no_device(intr_info)) { + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + if (!(vcpu->cr0 & CR0_TS_MASK)) + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + return 1; + } + error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) @@ -1355,7 +1430,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: - ++kvm_stat.mmio_exits; + ++vcpu->stat.mmio_exits; kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; case EMULATE_FAIL: @@ -1384,7 +1459,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - ++kvm_stat.irq_exits; + ++vcpu->stat.irq_exits; return 1; } @@ -1394,7 +1469,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) +static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) { u64 inst; gva_t rip; @@ -1439,33 +1514,35 @@ static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) done: countr_size *= 8; *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); + //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); return 1; } static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; + int size, down, in, string, rep; + unsigned port; + unsigned long count; + gva_t address; - ++kvm_stat.io_exits; + ++vcpu->stat.io_exits; exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - kvm_run->exit_reason = KVM_EXIT_IO; - if (exit_qualification & 8) - kvm_run->io.direction = KVM_EXIT_IO_IN; - else - kvm_run->io.direction = KVM_EXIT_IO_OUT; - kvm_run->io.size = (exit_qualification & 7) + 1; - kvm_run->io.string = (exit_qualification & 16) != 0; - kvm_run->io.string_down - = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - kvm_run->io.rep = (exit_qualification & 32) != 0; - kvm_run->io.port = exit_qualification >> 16; - if (kvm_run->io.string) { - if (!get_io_count(vcpu, &kvm_run->io.count)) + in = (exit_qualification & 8) != 0; + size = (exit_qualification & 7) + 1; + string = (exit_qualification & 16) != 0; + down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + count = 1; + rep = (exit_qualification & 32) != 0; + port = exit_qualification >> 16; + address = 0; + if (string) { + if (rep && !get_io_count(vcpu, &count)) return 1; - kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); - } else - kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ - return 0; + address = vmcs_readl(GUEST_LINEAR_ADDRESS); + } + return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, + address, rep, port); } static void @@ -1514,6 +1591,15 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; }; break; + case 2: /* clts */ + vcpu_load_rsp_rip(vcpu); + vcpu->fpu_active = 1; + vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR); + vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); + vcpu->cr0 &= ~CR0_TS_MASK; + vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); + skip_emulated_instruction(vcpu); + return 1; case 1: /*mov from cr*/ switch (cr) { case 3: @@ -1523,8 +1609,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) skip_emulated_instruction(vcpu); return 1; case 8: - printk(KERN_DEBUG "handle_cr: read CR8 " - "cpu erratum AA15\n"); vcpu_load_rsp_rip(vcpu); vcpu->regs[reg] = vcpu->cr8; vcpu_put_rsp_rip(vcpu); @@ -1583,8 +1667,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - kvm_run->exit_reason = KVM_EXIT_CPUID; - return 0; + kvm_emulate_cpuid(vcpu); + return 1; } static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1639,7 +1723,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++kvm_stat.irq_window_exits; + ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -1652,13 +1736,13 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; kvm_run->exit_reason = KVM_EXIT_HLT; - ++kvm_stat.halt_exits; + ++vcpu->stat.halt_exits; return 0; } static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + skip_emulated_instruction(vcpu); return kvm_hypercall(vcpu, kvm_run); } @@ -1699,7 +1783,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); - kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -1763,11 +1846,21 @@ again: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + if (vcpu->fpu_active) { + fx_save(vcpu->host_fx_image); + fx_restore(vcpu->guest_fx_image); + } + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); - save_msrs(vcpu->host_msrs, vcpu->nmsrs); - load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); + load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + } +#endif asm ( /* Store host registers */ @@ -1909,21 +2002,28 @@ again: reload_tss(); } - ++kvm_stat.exits; + ++vcpu->stat.exits; - save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); - load_msrs(vcpu->host_msrs, NR_BAD_MSRS); +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + load_msrs(vcpu->host_msrs, NR_BAD_MSRS); + } +#endif + + if (vcpu->fpu_active) { + fx_save(vcpu->guest_fx_image); + fx_restore(vcpu->host_fx_image); + } - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - kvm_run->exit_type = 0; if (fail) { - kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; - kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { /* @@ -1933,19 +2033,20 @@ again: profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); vcpu->launched = 1; - kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); if (r > 0) { /* Give scheduler a change to reschedule. */ if (signal_pending(current)) { - ++kvm_stat.signal_exits; + ++vcpu->stat.signal_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++kvm_stat.request_irq_exits; + ++vcpu->stat.request_irq_exits; post_kvm_run_save(vcpu, kvm_run); + kvm_run->exit_reason = KVM_EXIT_INTR; return -EINTR; } @@ -1969,7 +2070,7 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, { u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - ++kvm_stat.pf_guest; + ++vcpu->stat.pf_guest; if (is_page_fault(vect_info)) { printk(KERN_DEBUG "inject_page_fault: " @@ -2026,6 +2127,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmcs_clear(vmcs); vcpu->vmcs = vmcs; vcpu->launched = 0; + vcpu->fpu_active = 1; return 0; @@ -2062,9 +2164,8 @@ static struct kvm_arch_ops vmx_arch_ops = { .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, - .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, #ifdef CONFIG_X86_64 |