From 7b105ca2903b84f023c49965d9a511c5e55256dc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 15 May 2011 01:00:52 +0900 Subject: KVM: x86 emulator: Stop passing ctxt->ops as arg of emul functions Dereference it in the actual users. This not only cleans up the emulator but also makes it easy to convert the old emulation functions to the new em_xxx() form later. Note: Remove some inline keywords to let the compiler decide inlining. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77c9d8673dc4..51df0b6c891a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4513,7 +4513,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; -- cgit v1.2.3 From 8b0cedff040b652f3d36b1368778667581b0c140 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 15 May 2011 23:22:04 +0800 Subject: KVM: use __copy_to_user/__clear_user to write guest page Simply use __copy_to_user/__clear_user to write guest page since we have already verified the user address when the memslot is set Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51df0b6c891a..0b089860e950 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1388,7 +1388,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; kvm_x86_ops->patch_hypercall(vcpu, instructions); ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (copy_to_user((void __user *)addr, instructions, 4)) + if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; kvm->arch.hv_hypercall = data; break; @@ -1415,7 +1415,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); if (kvm_is_error_hva(addr)) return 1; - if (clear_user((void __user *)addr, PAGE_SIZE)) + if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; break; -- cgit v1.2.3 From 24c82e576b7860a4f02a21103e9df39e11e97006 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 May 2011 05:56:07 -0400 Subject: KVM: Sanitize cpuid Instead of blacklisting known-unsupported cpuid leaves, whitelist known- supported leaves. This is more conservative and prevents us from reporting features we don't support. Also whitelist a few more leaves while at it. Signed-off-by: Avi Kivity Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b089860e950..da48622d170f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2283,6 +2283,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags = 0; } +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + #define F(x) bit(X86_FEATURE_##x) static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, @@ -2393,6 +2400,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 9: + break; case 0xb: { int i, level_type; @@ -2414,7 +2423,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0) + if (entry[i].eax == 0 || !supported_xcr0_bit(i)) continue; do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= @@ -2451,6 +2460,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ @@ -2460,10 +2487,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx &= kvm_supported_word5_x86_features; cpuid_mask(&entry->edx, 5); break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + case 0xA: /* Architectural Performance Monitoring */ + case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: - /*Now nothing to do, reserved for the future*/ + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } -- cgit v1.2.3 From d780592b99d7d8a5ff905f6bacca519d4a342c76 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 23 May 2011 10:33:05 +0200 Subject: KVM: Clean up error handling during VCPU creation So far kvm_arch_vcpu_setup is responsible for freeing the vcpu struct if it fails. Move this confusing resonsibility back into the hands of kvm_vm_ioctl_create_vcpu. Only kvm_arch_vcpu_setup of x86 is affected, all other archs cannot fail. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index da48622d170f..aaa3735004bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6126,12 +6126,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); return r; } -- cgit v1.2.3 From adf52235b4082e67f31bf1fba36f1dce312633d6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:06:16 +0900 Subject: KVM: x86 emulator: Clean up init_emulate_ctxt() Use a local pointer to the emulate_ctxt for simplicity. Then, arrange the hard-to-read mode selection lines neatly. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aaa3735004bc..ae2353c50208 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4508,7 +4508,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4521,15 +4522,15 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); + ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->eip = kvm_rip_read(vcpu); + ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : + cs_l ? X86EMUL_MODE_PROT64 : + cs_db ? X86EMUL_MODE_PROT32 : + X86EMUL_MODE_PROT16; + ctxt->guest_mode = is_guest_mode(vcpu); + memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; -- cgit v1.2.3 From b5c9ff731f3cee5a2f2d7154f48f8006b48eb66d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 25 May 2011 11:09:38 +0900 Subject: KVM: x86 emulator: Avoid clearing the whole decode_cache During tracing the emulator, we noticed that init_emulate_ctxt() sometimes took a bit longer time than we expected. This patch is for mitigating the problem by some degree. By looking into the function, we soon notice that it clears the whole decode_cache whose size is about 2.5K bytes now. Furthermore, most of the bytes are taken for the two read_cache arrays, which are used only by a few instructions. Considering the fact that we are not assuming the cache arrays have been cleared when we store actual data, we do not need to clear the arrays: 2K bytes elimination. In addition, we can avoid clearing the fetch_cache and regs arrays. This patch changes the initialization not to clear the arrays. On our 64-bit host, init_emulate_ctxt() becomes 0.3 to 0.5us faster with this patch applied. Signed-off-by: Takuya Yoshikawa Cc: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae2353c50208..d88de565d0c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4506,6 +4506,20 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } +static void init_decode_cache(struct decode_cache *c, + const unsigned long *regs) +{ + memset(c, 0, offsetof(struct decode_cache, regs)); + memcpy(c->regs, regs, sizeof(c->regs)); + + c->fetch.start = 0; + c->fetch.end = 0; + c->io_read.pos = 0; + c->io_read.end = 0; + c->mem_read.pos = 0; + c->mem_read.end = 0; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -4531,8 +4545,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_decode_cache(c, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -- cgit v1.2.3 From 5e1746d6205d1efa3193cc0c67aa2d15e54799bd Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:03:24 +0300 Subject: KVM: nVMX: Allow setting the VMXE bit in CR4 This patch allows the guest to enable the VMXE bit in CR4, which is a prerequisite to running VMXON. Whether to allow setting the VMXE bit now depends on the architecture (svm or vmx), so its checking has moved to kvm_x86_ops->set_cr4(). This function now returns an int: If kvm_x86_ops->set_cr4() returns 1, __kvm_set_cr4() will also return 1, and this will cause kvm_set_cr4() will throw a #GP. Turning on the VMXE bit is allowed only when the nested VMX feature is enabled, and turning it off is forbidden after a vmxon. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d88de565d0c0..460932b62c5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -615,11 +615,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; - if (cr4 & X86_CR4_VMXE) + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - kvm_x86_ops->set_cr4(vcpu, cr4); - if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); -- cgit v1.2.3 From 064aea774768749c6fd308b37818ea3a9600583d Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:04:56 +0300 Subject: KVM: nVMX: Decoding memory operands of VMX instructions This patch includes a utility function for decoding pointer operands of VMX instructions issued by L1 (a guest hypervisor) Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 460932b62c5b..27d12a3b1890 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3848,7 +3848,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, exception); } -static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, +int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { @@ -3858,6 +3858,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } +EXPORT_SYMBOL_GPL(kvm_read_guest_virt); static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, -- cgit v1.2.3 From 27d6c865211662721e6cf305706e4a3da35f12b4 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:06:59 +0300 Subject: KVM: nVMX: Implement VMCLEAR This patch implements the VMCLEAR instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27d12a3b1890..c1b5a1817e43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } +EXPORT_SYMBOL_GPL(kvm_inject_page_fault); void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { -- cgit v1.2.3 From 6a4d7550601b5b17df227959bdbec208384f729c Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 25 May 2011 23:08:00 +0300 Subject: KVM: nVMX: Implement VMPTRST This patch implements the VMPTRST instruction. Signed-off-by: Nadav Har'El Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1b5a1817e43..de262a086863 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3869,7 +3869,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, +int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) @@ -3901,6 +3901,7 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, out: return r; } +EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, -- cgit v1.2.3 From 9d74191ab1ea857d1cc27e439316eebf8ae46d19 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 29 May 2011 21:53:48 +0900 Subject: KVM: x86 emulator: Use the pointers ctxt and c consistently We should use the local variables ctxt and c when the emulate_ctxt and decode appears many times. At least, we need to be consistent about how we use these in a function. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de262a086863..39d8b043580f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4552,24 +4552,24 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.decode.op_bytes = 2; - vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + - inc_eip; - ret = emulate_int_real(&vcpu->arch.emulate_ctxt, irq); + c->op_bytes = 2; + c->ad_bytes = 2; + c->eip = ctxt->eip + inc_eip; + ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - vcpu->arch.emulate_ctxt.eip = c->eip; + ctxt->eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4630,21 +4630,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int insn_len) { int r; - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); - vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.have_exception = false; - vcpu->arch.emulate_ctxt.perm_ok = false; + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->perm_ok = false; - vcpu->arch.emulate_ctxt.only_vendor_specific_insn + ctxt->only_vendor_specific_insn = emulation_type & EMULTYPE_TRAP_UD; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; @@ -4660,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip); + kvm_rip_write(vcpu, c->eip); return EMULATE_DONE; } @@ -4672,7 +4673,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) return EMULATE_DONE; @@ -4684,7 +4685,7 @@ restart: return handle_emulation_failure(vcpu); } - if (vcpu->arch.emulate_ctxt.have_exception) { + if (ctxt->have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { @@ -4703,13 +4704,12 @@ restart: r = EMULATE_DONE; if (writeback) { - toggle_interruptibility(vcpu, - vcpu->arch.emulate_ctxt.interruptibility); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + toggle_interruptibility(vcpu, ctxt->interruptibility); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_rip_write(vcpu, ctxt->eip); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -5130,8 +5130,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(&vcpu->arch.emulate_ctxt, - rip, instruction, 3, NULL); + return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -5849,21 +5848,21 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, - tss_selector, reason, has_error_code, - error_code); + ret = emulator_task_switch(ctxt, tss_selector, reason, + has_error_code, error_code); if (ret) return EMULATE_FAIL; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } -- cgit v1.2.3 From 36dd9bb5ce32bc39e25a5fcc61415f13e3ed5d17 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:24 +0300 Subject: KVM: x86 emulator: rename decode_cache::eip to _eip The name eip conflicts with a field of the same name in x86_emulate_ctxt, which we plan to fold decode_cache into. The name _eip is unfortunate, but what's really needed is a refactoring here, not a better name. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39d8b043580f..7e452fe31e40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4560,13 +4560,13 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) c->op_bytes = 2; c->ad_bytes = 2; - c->eip = ctxt->eip + inc_eip; + c->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->eip; + ctxt->eip = c->_eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4661,7 +4661,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->eip); + kvm_rip_write(vcpu, c->_eip); return EMULATE_DONE; } -- cgit v1.2.3 From 9dac77fa4011bdb4b541a8db087eac96a602faec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Jun 2011 15:34:25 +0300 Subject: KVM: x86 emulator: fold decode_cache into x86_emulate_ctxt This saves a lot of pointless casts x86_emulate_ctxt and decode_cache. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e452fe31e40..694538a043e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4507,24 +4507,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct decode_cache *c, +static void init_decode_cache(struct x86_emulate_ctxt *ctxt, const unsigned long *regs) { - memset(c, 0, offsetof(struct decode_cache, regs)); - memcpy(c->regs, regs, sizeof(c->regs)); + memset(&ctxt->twobyte, 0, + (void *)&ctxt->regs - (void *)&ctxt->twobyte); + memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); - c->fetch.start = 0; - c->fetch.end = 0; - c->io_read.pos = 0; - c->io_read.end = 0; - c->mem_read.pos = 0; - c->mem_read.end = 0; + ctxt->fetch.start = 0; + ctxt->fetch.end = 0; + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.pos = 0; + ctxt->mem_read.end = 0; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int cs_db, cs_l; /* @@ -4546,28 +4546,27 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) X86EMUL_MODE_PROT16; ctxt->guest_mode = is_guest_mode(vcpu); - init_decode_cache(c, vcpu->arch.regs); + init_decode_cache(ctxt, vcpu->arch.regs); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); - c->op_bytes = 2; - c->ad_bytes = 2; - c->_eip = ctxt->eip + inc_eip; + ctxt->op_bytes = 2; + ctxt->ad_bytes = 2; + ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); if (ret != X86EMUL_CONTINUE) return EMULATE_FAIL; - ctxt->eip = c->_eip; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + ctxt->eip = ctxt->_eip; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); @@ -4631,7 +4630,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; bool writeback = true; kvm_clear_exception_queue(vcpu); @@ -4661,7 +4659,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, c->_eip); + kvm_rip_write(vcpu, ctxt->_eip); return EMULATE_DONE; } @@ -4669,7 +4667,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); } restart: @@ -4707,7 +4705,7 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); } else @@ -5718,8 +5716,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); @@ -5849,7 +5847,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - struct decode_cache *c = &ctxt->decode; int ret; init_emulate_ctxt(vcpu); @@ -5860,7 +5857,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); -- cgit v1.2.3 From c68b734fba402b9bfdd49e23b776c42dbeaf1f5b Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:13:42 +0800 Subject: KVM: Add SMEP support when setting CR4 This patch adds SMEP handling when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 694538a043e7..ba5cd27b429a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -580,6 +580,14 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -599,14 +607,17 @@ static void update_cpuid(struct kvm_vcpu *vcpu) int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | + X86_CR4_PAE | X86_CR4_SMEP; if (cr4 & CR4_RESERVED_BITS) return 1; if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) return 1; + if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.2.3 From 611c120f7486a19e7df2225f875a52ef0b599ae8 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Fri, 3 Jun 2011 11:14:03 +0800 Subject: KVM: Mask function7 ebx against host capability word9 This patch masks CPUID leaf 7 ebx against host capability word9. Signed-off-by: Yang, Wei Signed-off-by: Shan, Haitao Signed-off-by: Li, Xin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba5cd27b429a..ff4623b1b102 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2359,6 +2359,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(SMEP); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2393,7 +2397,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } - /* function 4 and 0xb have additional index. */ + /* function 4 has additional index. */ case 4: { int i, cache_type; @@ -2410,8 +2414,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case 9: break; + /* function 0xb has additional index. */ case 0xb: { int i, level_type; -- cgit v1.2.3 From 02668b061db1b9f7f18872e594ac68e237db0bed Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 10 Jun 2011 11:35:30 +0200 Subject: KVM: fix XSAVE bit scanning (now properly) commit 123108f1c1aafd51d6a5c79cc04d7999dd88a930 tried to fix KVMs XSAVE valid feature scanning, but it was wrong. It was not considering the sparse nature of this bitfield, instead reading values from uninitialized members of the entries array. This patch now separates subleaf indicies from KVM's array indicies and fills the entry before querying it's value. This fixes AVX support in KVM guests. Signed-off-by: Andre Przywara Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff4623b1b102..84f46074ca74 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2447,16 +2447,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; } case 0xd: { - int i; + int idx, i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (i = 1; *nent < maxnent && i < 64; ++i) { - if (entry[i].eax == 0 || !supported_xcr0_bit(i)) + for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) continue; - do_cpuid_1_ent(&entry[i], function, i); entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; + ++i; } break; } -- cgit v1.2.3 From 4a00efdf0c7c93dc6f6b3ce7e2d6bd4cd1ac1651 Mon Sep 17 00:00:00 2001 From: "Yang, Wei Y" Date: Mon, 13 Jun 2011 21:52:33 +0800 Subject: KVM: Enable DRNG feature support for KVM This patch exposes DRNG feature to KVM guests. The RDRAND instruction can provide software with sequences of random numbers generated from white noise. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84f46074ca74..6f1e54d0828d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2345,7 +2345,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C); + F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | -- cgit v1.2.3 From 74dc2b4ffe3af1eac367be0178f88487cb9b240a Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:18 +0800 Subject: KVM: Add RDWRGSFS support when setting CR4 This patch adds RDWRGSFS support when setting CR4. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f1e54d0828d..423e0cda8e91 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -588,6 +588,14 @@ static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_SMEP)); } +static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -618,6 +626,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) return 1; + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.2.3 From 176f61da82435eae09cc96f70b530d1ba0746b8b Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 20:10:19 +0800 Subject: KVM: Expose RDWRGSFS bit to KVM guests This patch exposes RDWRGSFS bit to KVM guests. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 423e0cda8e91..76c16a5c6c56 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP); + F(SMEP) | F(FSGSBASE); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From a01c8f9b4e266df1d7166d23216f2060648f862d Mon Sep 17 00:00:00 2001 From: "Yang, Wei" Date: Tue, 14 Jun 2011 15:19:06 +0800 Subject: KVM: Enable ERMS feature support for KVM This patch exposes ERMS feature to KVM guests. The REP MOVSB/STOSB instruction can enhance fast strings attempts to move as much of the data with larger size load/stores as possible. Signed-off-by: Yang, Wei Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 76c16a5c6c56..0b803f04bde7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2372,7 +2372,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE); + F(SMEP) | F(FSGSBASE) | F(ERMS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From c9aaa8957f203bd6df83b002fb40b98390bed078 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 11 Jul 2011 15:28:14 -0400 Subject: KVM: Steal time implementation To implement steal time, we need the hypervisor to pass the guest information about how much time was spent running other processes outside the VM, while the vcpu had meaningful work to do - halt time does not count. This information is acquired through the run_delay field of delayacct/schedstats infrastructure, that counts time spent in a runqueue but not running. Steal time is a per-cpu information, so the traditional MSR-based infrastructure is used. A new msr, KVM_MSR_STEAL_TIME, holds the memory area address containing information about steal time This patch contains the hypervisor part of the steal time infrasructure, and can be backported independently of the guest portion. [avi, yongjie: export delayacct_on, to avoid build failures in some configs] Signed-off-by: Glauber Costa Tested-by: Eric B Munson CC: Rik van Riel CC: Jeremy Fitzhardinge CC: Peter Zijlstra CC: Anthony Liguori Signed-off-by: Yongjie Ren Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b803f04bde7..c96cdc092484 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 8 +#define KVM_SAVE_MSRS_BEGIN 9 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1488,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) } } +static void accumulate_steal_time(struct kvm_vcpu *vcpu) +{ + u64 delta; + + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + vcpu->arch.st.accum_steal = delta; +} + +static void record_steal_time(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + return; + + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; + vcpu->arch.st.steal.version += 2; + vcpu->arch.st.accum_steal = 0; + + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1570,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_STEAL_TIME: + + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + data & KVM_STEAL_VALID_BITS)) + return 1; + + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + preempt_disable(); + accumulate_steal_time(vcpu); + preempt_enable(); + + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); + + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1855,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_ASYNC_PF_EN: data = vcpu->arch.apf.msr_val; break; + case MSR_KVM_STEAL_TIME: + data = vcpu->arch.st.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -2166,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } + + accumulate_steal_time(vcpu); + kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2487,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + entry->ebx = 0; entry->ecx = 0; entry->edx = 0; @@ -5470,6 +5536,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) + record_steal_time(vcpu); + } r = kvm_mmu_reload(vcpu); @@ -6206,6 +6275,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); -- cgit v1.2.3 From af7cc7d1ee422a612f6785e347a893d44cc892ea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:22:46 +0800 Subject: KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code Introduce vcpu_mmio_gva_to_gpa to translate the gva to gpa, we can use it to cleanup the code between read emulation and write emulation Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c96cdc092484..a1dbd0443545 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4010,6 +4010,27 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t *gpa, struct x86_exception *exception, + bool write) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + + if (write) + access |= PFERR_WRITE_MASK; + + *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + + if (*gpa == UNMAPPED_GVA) + return -1; + + /* For APIC access vmexit */ + if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + return 0; +} + static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, @@ -4017,8 +4038,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -4028,13 +4049,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) @@ -4085,16 +4105,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct x86_exception *exception, struct kvm_vcpu *vcpu) { - gpa_t gpa; - int handled; + gpa_t gpa; + int handled, ret; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); - if (gpa == UNMAPPED_GVA) + if (ret < 0) return X86EMUL_PROPAGATE_FAULT; /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + if (ret) goto mmio; if (emulator_write_phys(vcpu, gpa, val, bytes)) -- cgit v1.2.3 From bebb106a5afa32efdf5332ed4a40bf4d6d06b56e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:23:20 +0800 Subject: KVM: MMU: cache mmio info on page fault path If the page fault is caused by mmio, we can cache the mmio info, later, we do not need to walk guest page table and quickly know it is a mmio fault while we emulate the mmio instruction Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1dbd0443545..028a0f25e8a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4016,6 +4016,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + if (vcpu_match_mmio_gva(vcpu, gva) && + check_write_user_access(vcpu, write, access, + vcpu->arch.access)) { + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | + (gva & (PAGE_SIZE - 1)); + return 1; + } + if (write) access |= PFERR_WRITE_MASK; @@ -4028,6 +4036,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; + if (vcpu_match_mmio_gpa(vcpu, *gpa)) + return 1; + return 0; } -- cgit v1.2.3 From c37079586f317d7e7f1a70d36f0e5177691c89c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:28:04 +0800 Subject: KVM: MMU: remove bypass_guest_pf The idea is from Avi: | Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as | it used to be, since unsync pages always use shadow_trap_nonpresent_pte, | and since we convert between the two nonpresent_ptes during sync and unsync. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 028a0f25e8a0..64c42d90112b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5091,7 +5091,6 @@ int kvm_arch_init(void *opaque) kvm_init_msr_list(); kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); -- cgit v1.2.3 From ce88decffd17bf9f373cc233c961ad2054965667 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:33:44 +0800 Subject: KVM: MMU: mmio page fault support The idea is from Avi: | We could cache the result of a miss in an spte by using a reserved bit, and | checking the page fault error code (or seeing if we get an ept violation or | ept misconfiguration), so if we get repeated mmio on a page, we don't need to | search the slot list/tree. | (https://lkml.org/lkml/2011/2/22/221) When the page fault is caused by mmio, we cache the info in the shadow page table, and also set the reserved bits in the shadow page table, so if the mmio is caused again, we can quickly identify it and emulate it directly Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it can be reduced by this feature, and also avoid walking guest page table for soft mmu. [jan: fix operator precedence issue] Signed-off-by: Xiao Guangrong Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c42d90112b..2c9661f230a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5062,6 +5062,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + int maxphyaddr = boot_cpu_data.x86_phys_bits; + + /* + * Set the reserved bits and the present bit of an paging-structure + * entry to generate page fault with PFER.RSV = 1. + */ + mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; + mask |= 1ull; + +#ifdef CONFIG_X86_64 + /* + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ + if (maxphyaddr == 52) + mask &= ~1ull; +#endif + + kvm_mmu_set_mmio_spte_mask(mask); +} + int kvm_arch_init(void *opaque) { int r; @@ -5088,6 +5112,7 @@ int kvm_arch_init(void *opaque) if (r) goto out; + kvm_set_mmio_spte_mask(); kvm_init_msr_list(); kvm_x86_ops = ops; -- cgit v1.2.3 From 4f0226482d20f104e943ee9e6f1218b573953f63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 12 Jul 2011 03:34:24 +0800 Subject: KVM: MMU: trace mmio page fault Add tracepoints to trace mmio page fault Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c9661f230a9..84a28ea45fa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4021,6 +4021,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, vcpu->arch.access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); + trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } @@ -4036,8 +4037,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) return 1; - if (vcpu_match_mmio_gpa(vcpu, *gpa)) + if (vcpu_match_mmio_gpa(vcpu, *gpa)) { + trace_vcpu_match_mmio(gva, *gpa, write, true); return 1; + } return 0; } -- cgit v1.2.3