From c066fafc595eef5ae3c83ae3a8305956b8c3ef15 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 14 Aug 2018 20:37:45 +1000 Subject: KVM: PPC: Book3S HV: Use correct pagesize in kvm_unmap_radix() Since commit e641a317830b ("KVM: PPC: Book3S HV: Unify dirty page map between HPT and radix", 2017-10-26), kvm_unmap_radix() computes the number of PAGE_SIZEd pages being unmapped and passes it to kvmppc_update_dirty_map(), which expects to be passed the page size instead. Consequently it will only mark one system page dirty even when a large page (for example a THP page) is being unmapped. The consequence of this is that part of the THP page might not get copied during live migration, resulting in memory corruption for the guest. This fixes it by computing and passing the page size in kvm_unmap_radix(). Cc: stable@vger.kernel.org # v4.15+ Fixes: e641a317830b (KVM: PPC: Book3S HV: Unify dirty page map between HPT and radix) Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 176f911ee983..7efc42538ccf 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -738,10 +738,10 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { - unsigned long npages = 1; + unsigned long psize = PAGE_SIZE; if (shift) - npages = 1ul << (shift - PAGE_SHIFT); - kvmppc_update_dirty_map(memslot, gfn, npages); + psize = 1ul << shift; + kvmppc_update_dirty_map(memslot, gfn, psize); } } return 0; -- cgit v1.2.3 From 46dec40fb741f00f1864580130779aeeaf24fb3d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Aug 2018 16:05:45 +1000 Subject: KVM: PPC: Book3S HV: Don't truncate HPTE index in xlate function This fixes a bug which causes guest virtual addresses to get translated to guest real addresses incorrectly when the guest is using the HPT MMU and has more than 256GB of RAM, or more specifically has a HPT larger than 2GB. This has shown up in testing as a failure of the host to emulate doorbell instructions correctly on POWER9 for HPT guests with more than 256GB of RAM. The bug is that the HPTE index in kvmppc_mmu_book3s_64_hv_xlate() is stored as an int, and in forming the HPTE address, the index gets shifted left 4 bits as an int before being sign-extended to 64 bits. The simple fix is to make the variable a long int, matching the return type of kvmppc_hv_find_lock_hpte(), which is what calculates the index. Fixes: 697d3899dcb4 ("KVM: PPC: Implement MMIO emulation support for Book3S HV guests") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 7f3a8cf5d66f..4c08f42f6406 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -359,7 +359,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned long pp, key; unsigned long v, orig_v, gr; __be64 *hptep; - int index; + long int index; int virtmode = vcpu->arch.shregs.msr & (data ?
MSR_DR : MSR_IR); if (kvm_is_radix(vcpu->kvm)) -- cgit v1.2.3 From b871da4a7778eda2e700ae8bf5f86e74a004c1ec Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 23 Aug 2018 18:24:24 +0200 Subject: KVM: nVMX: avoid redundant double assignment of nested_run_pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nested_run_pending is set 20 lines above and check_vmentry_prereqs()/ check_vmentry_postreqs() don't seem to be resetting it (the latter, however, checks it). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Paolo Bonzini Reviewed-by: Jim Mattson Reviewed-by: Eduardo Valentin Reviewed-by: Krish Sadhukhan Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d26f3c4985b..92f027745842 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -13988,9 +13988,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) -- cgit v1.2.3 From 0186ec823280f5db55ab2e6291933bebe2e92772 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Aug 2018 16:22:28 +0100 Subject: KVM: SVM: remove unused variable dst_vaddr_end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable dst_vaddr_end is being assigned but is never used, hence it is redundant and can be removed. Cleans up clang warning: variable 'dst_vaddr_end' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6276140044d0..4339fc4715fa 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6747,7 +6747,7 @@ e_free: static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; - unsigned long dst_vaddr, dst_vaddr_end; + unsigned long dst_vaddr; struct page **src_p, **dst_p; struct kvm_sev_dbg debug; unsigned long n; @@ -6763,7 +6763,6 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) size = debug.len; vaddr_end = vaddr + size; dst_vaddr = debug.dst_uaddr; - dst_vaddr_end = dst_vaddr + size; for (; vaddr < vaddr_end; vaddr = next_vaddr) { int len, s_off, d_off; -- cgit v1.2.3 From c4409905cd6eb42cfd06126e9226b0150e05a715 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:46 -0700 Subject: KVM: VMX: Do not allow reexecute_instruction() when skipping MMIO instr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-execution after an emulation decode failure is only intended to handle a case where two or more vCPUs race to write a shadowed page, i.e. we should never re-execute an instruction as part of MMIO emulation. As handle_ept_misconfig() is only used for MMIO emulation, it should pass EMULTYPE_NO_REEXECUTE when using the emulator to skip an instr in the fast-MMIO case where VM_EXIT_INSTRUCTION_LEN is invalid. And because the cr2 value passed to x86_emulate_instruction() is only destined for use when retrying or reexecuting, we can simply call emulate_instruction().
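For context, emulate_instruction() already forces the no-reexecute behavior by ORing in the flag; a condensed sketch of the helper as it stands at this point in the series (its definition is visible in the kvm_host.h hunk further below):

	/* Condensed: anything routed through this helper can never be
	 * re-executed, because the flag is ORed in unconditionally. */
	static inline int emulate_instruction(struct kvm_vcpu *vcpu,
		int emulation_type)
	{
		return x86_emulate_instruction(vcpu, 0,
				emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0);
	}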
Fixes: d391f1207067 ("x86/kvm/vmx: do not use vm-exit instruction length for fast MMIO when running nested") Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 92f027745842..b345ad91809c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7704,8 +7704,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); -- cgit v1.2.3 From 35be0aded76b54a24dc8aa678a71bca22273e8d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:47 -0700 Subject: KVM: x86: SVM: Set EMULTYPE_NO_REEXECUTE for RSM emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-execution after an emulation decode failure is only intended to handle a case where two or more vCPUs race to write a shadowed page, i.e. we should never re-execute an instruction as part of RSM emulation. Add a new helper, kvm_emulate_instruction_from_buffer(), to support emulating from a pre-defined buffer. This eliminates the last direct call to x86_emulate_instruction() outside of kvm_mmu_page_fault(), which means x86_emulate_instruction() can be unexported in a future patch. Fixes: 7607b7174405 ("KVM: SVM: install RSM intercept") Cc: Brijesh Singh Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/svm.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 00ddb0c9e612..3b220b86c8e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1251,6 +1251,13 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); } +static inline int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len) +{ + return x86_emulate_instruction(vcpu, 0, EMULTYPE_NO_REEXECUTE, + insn, insn_len); +} + void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4339fc4715fa..b3cdfde8ff5e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3874,8 +3874,8 @@ static int emulate_on_interception(struct vcpu_svm *svm) static int rsm_interception(struct vcpu_svm *svm) { - return x86_emulate_instruction(&svm->vcpu, 0, 0, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, + rsm_ins_bytes, 2) == EMULATE_DONE; } static int rdpmc_interception(struct vcpu_svm *svm) -- cgit v1.2.3 From 8065dbd1ee0ef04321d80da7999b4f0086e0a407 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:48 -0700 Subject: KVM: x86: Invert emulation re-execute behavior to make it opt-in MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-execution of an instruction after emulation decode failure is intended to be used only when emulating shadow page accesses.
Invert the flag to make allowing re-execution opt-in since that behavior is by far in the minority. Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 8 +++----- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3b220b86c8e4..a69ea11f3bab 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1238,7 +1238,7 @@ enum emulation_result { #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_RETRY (1 << 3) -#define EMULTYPE_NO_REEXECUTE (1 << 4) +#define EMULTYPE_ALLOW_REEXECUTE (1 << 4) #define EMULTYPE_NO_UD_ON_FAIL (1 << 5) #define EMULTYPE_VMWARE (1 << 6) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, @@ -1247,15 +1247,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, - emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } static inline int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - return x86_emulate_instruction(vcpu, 0, EMULTYPE_NO_REEXECUTE, - insn, insn_len); + return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); } void kvm_enable_efer_bits(u64); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a282321329b5..4508c34eef20 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5217,7 +5217,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_RETRY; + int r, emulation_type = EMULTYPE_RETRY | EMULTYPE_ALLOW_REEXECUTE; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 506bd2b4b8bb..d6f85ea23101 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5870,7 +5870,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, gpa_t gpa = cr2; kvm_pfn_t pfn; - if (emulation_type & EMULTYPE_NO_REEXECUTE) + if (!(emulation_type & EMULTYPE_ALLOW_REEXECUTE)) return false; if (!vcpu->arch.mmu.direct_map) { -- cgit v1.2.3 From 384bf2218e96f57118270945b1841e4dbbe9e352 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:49 -0700 Subject: KVM: x86: Merge EMULTYPE_RETRY and EMULTYPE_ALLOW_REEXECUTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit retry_instruction() and reexecute_instruction() are a package deal, i.e. there is no scenario where one is allowed and the other is not. Merge their controlling emulation type flags to enforce this in code. Name the combined flag EMULTYPE_ALLOW_RETRY to make it abundantly clear that we are allowing re{try,execute} to occur, as opposed to explicitly requesting retry of a previously failed instruction. 
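The merged flag gates both paths identically; a condensed sketch of the two checks, matching the x86.c hunks below:

	/* Both helpers now key off the same flag, enforcing the "package
	 * deal": neither retry nor re-execute unless the caller opted in. */
	static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
					  int emulation_type)
	{
		if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
			return false;
		/* ... */
	}

	static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
				      unsigned long cr2, int emulation_type)
	{
		if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
			return false;
		/* ... */
	}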
Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 7 +++---- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a69ea11f3bab..35e03b13edcb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1237,10 +1237,9 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_RETRY (1 << 3) -#define EMULTYPE_ALLOW_REEXECUTE (1 << 4) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 5) -#define EMULTYPE_VMWARE (1 << 6) +#define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) +#define EMULTYPE_VMWARE (1 << 5) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4508c34eef20..0246a1ea7f55 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5217,7 +5217,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_RETRY | EMULTYPE_ALLOW_REEXECUTE; + int r, emulation_type = EMULTYPE_ALLOW_RETRY; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d6f85ea23101..924ce28723c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5870,7 +5870,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, gpa_t gpa = cr2; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_REEXECUTE)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) return false; if (!vcpu->arch.mmu.direct_map) { @@ -5958,7 +5958,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) return false; if (x86_page_table_writing_insn(ctxt)) -- cgit v1.2.3 From 472faffacd9032164f611f56329d0025ddca55b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:50 -0700 Subject: KVM: x86: Default to not allowing emulation retry in kvm_mmu_page_fault MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Effectively force kvm_mmu_page_fault() to opt-in to allowing retry to make it more obvious when and why it allows emulation to be retried. Previously this approach was less convenient due to retry and re-execute behavior being controlled by separate flags that were also inverted in their implementations (opt-in versus opt-out). 
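The net effect on kvm_mmu_page_fault(), condensed from the hunk below:

	int r, emulation_type = 0;		/* retry now defaults to off */
	/* ... */
	if (!mmio_info_in_cache(vcpu, cr2, direct))
		emulation_type = EMULTYPE_ALLOW_RETRY;	/* explicit opt-in */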
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0246a1ea7f55..01fb8701ccd0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5217,7 +5217,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_ALLOW_RETRY; + int r, emulation_type = 0; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; @@ -5230,10 +5230,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_PF_EMULATE) { - emulation_type = 0; + if (r == RET_PF_EMULATE) goto emulate; - } } if (r == RET_PF_INVALID) { @@ -5260,8 +5258,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; } - if (mmio_info_in_cache(vcpu, cr2, direct)) - emulation_type = 0; + /* + * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. + */ + if (!mmio_info_in_cache(vcpu, cr2, direct)) + emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* * On AMD platforms, under certain conditions insn_len may be zero on #NPF. -- cgit v1.2.3 From 6c3dfeb6a48b1562bd5b8ec5f3317ef34d0134ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:51 -0700 Subject: KVM: x86: Do not re-{try,execute} after failed emulation in L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a6f177efaa58 ("KVM: Reenter guest after emulation failure if due to access to non-mmio address") added reexecute_instruction() to handle the scenario where two (or more) vCPUS race to write a shadowed page, i.e. reexecute_instruction() is intended to return true if and only if the instruction being emulated was accessing a shadowed page. As L0 is only explicitly shadowing L1 tables, an emulation failure of a nested VM instruction cannot be due to a race to write a shadowed page and so should never be re-executed. This fixes an issue where an "MMIO" emulation failure[1] in L2 is all but guaranteed to result in an infinite loop when TDP is enabled. Because "cr2" is actually an L2 GPA when TDP is enabled, calling kvm_mmu_gva_to_gpa_write() to translate cr2 in the non-direct mapped case (L2 is never direct mapped) will almost always yield UNMAPPED_GVA and cause reexecute_instruction() to immediately return true. The !mmio_info_in_cache() check in kvm_mmu_page_fault() doesn't catch this case because mmio_info_in_cache() returns false for a nested MMU (the MMIO caching currently handles L1 only, e.g. to cache nested guests' GPAs we'd have to manually flush the cache when switching between VMs and when L1 updated its page tables controlling the nested guest). 
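A condensed sketch of the pre-fix logic in reexecute_instruction() described above, as it behaves for L2 with TDP enabled:

	/* cr2 is an L2 GPA here, not a GVA, so the lookup below fails and
	 * the function wrongly concludes the fault is retryable. */
	if (!vcpu->arch.mmu.direct_map) {	/* always true when in L2 */
		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
		if (gpa == UNMAPPED_GVA)
			return true;		/* -> emulate again, forever */
	}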
Way back when, commit 68be0803456b ("KVM: x86: never re-execute instruction with enabled tdp") changed reexecute_instruction() to always return false when using TDP under the assumption that KVM would only get into the emulator for MMIO. Commit 95b3cf69bdf8 ("KVM: x86: let reexecute_instruction work for tdp") effectively reverted that behavior in order to handle the scenario where emulation failed due to an access from L1 to the shadow page tables for L2, but it didn't account for the case where emulation failed in L2 with TDP enabled. All of the above logic also applies to retry_instruction(), added by commit 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing instructions"). An indefinite loop in retry_instruction() should be impossible as it protects against retrying the same instruction over and over, but it's still correct to not retry an L2 instruction in the first place. Fix the immediate issue by adding a check for a nested guest when determining whether or not to allow retry in kvm_mmu_page_fault(). In addition to fixing the immediate bug, add WARN_ON_ONCE in the retry functions since they are not designed to handle nested cases, i.e. they need to be modified even if there is some scenario in the future where we want to allow retrying a nested guest. [1] This issue was encountered after commit 3a2936dedd20 ("kvm: mmu: Don't expose private memslots to L2") changed the page fault path to return KVM_PFN_NOSLOT when translating an L2 access to a private memslot. Returning KVM_PFN_NOSLOT is semantically correct when we want to hide a memslot from L2, i.e. there effectively is no defined memory region for L2, but it has the unfortunate side effect of making KVM think the GFN is an MMIO page, thus triggering emulation. The failure occurred with in-development code that deliberately exposed a private memslot to L2, which L2 accessed with an instruction that is not emulated by KVM. Fixes: 95b3cf69bdf8 ("KVM: x86: let reexecute_instruction work for tdp") Fixes: 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing instructions") Signed-off-by: Sean Christopherson Cc: Jim Mattson Cc: Krish Sadhukhan Cc: Xiao Guangrong Cc: stable@vger.kernel.org Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 7 +++++-- arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01fb8701ccd0..f7e83b1e0eb2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5264,9 +5264,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * re-execute the instruction that caused the page fault. Do not allow * retrying MMIO emulation, as it's not only pointless but could also * cause us to enter an infinite loop because the processor will keep * faulting on the non-existent MMIO address. Retrying an instruction + from a nested guest is also pointless and dangerous as we are only + explicitly shadowing L1's page tables, i.e. unprotecting something + for L1 isn't going to magically fix whatever issue caused L2 to fail.
*/ - if (!mmio_info_in_cache(vcpu, cr2, direct)) + if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 924ce28723c4..cbe2921e972b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5873,6 +5873,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) return false; + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return false; + if (!vcpu->arch.mmu.direct_map) { /* * Write permission should be allowed since only @@ -5961,6 +5964,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) return false; + if (WARN_ON_ONCE(is_guest_mode(vcpu))) + return false; + if (x86_page_table_writing_insn(ctxt)) return false; -- cgit v1.2.3 From 0ce97a2b627c5e26347aee298f571ddf925e5fe4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:52 -0700 Subject: KVM: x86: Rename emulate_instruction() to kvm_emulate_instruction() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lack of the kvm_ prefix gives the impression that it's a VMX or SVM specific function, and there's no conflict that prevents adding the kvm_ prefix. Signed-off-by: Sean Christopherson Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 12 ++++++------ arch/x86/kvm/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 35e03b13edcb..8c9023661351 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1243,7 +1243,7 @@ enum emulation_result { int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); -static inline int emulate_instruction(struct kvm_vcpu *vcpu, +static inline int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b3cdfde8ff5e..89c4c5aa15f1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -776,7 +776,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -2715,7 +2715,7 @@ static int gp_interception(struct vcpu_svm *svm) WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -2819,7 +2819,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3861,7 +3861,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, 
svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3869,7 +3869,7 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } static int rsm_interception(struct vcpu_svm *svm) @@ -4700,7 +4700,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); } return ret; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b345ad91809c..f910d33858d9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6983,7 +6983,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -7054,7 +7054,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -7157,7 +7157,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7231,7 +7231,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_cr(struct kvm_vcpu *vcpu) @@ -7480,7 +7480,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -7547,7 +7547,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -7704,7 +7704,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return emulate_instruction(vcpu, EMULTYPE_SKIP) == + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == EMULATE_DONE; } @@ -7748,7 +7748,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, 0); + err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cbe2921e972b..915002c7f07e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4987,7 +4987,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = 
0; } - er = emulate_instruction(vcpu, emul_type); + er = kvm_emulate_instruction(vcpu, emul_type); if (er == EMULATE_USER_EXIT) return 0; if (er != EMULATE_DONE) @@ -7740,7 +7740,7 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r != EMULATE_DONE) return 0; -- cgit v1.2.3 From c60658d1d983641fcdbb16f86bc2f3806d88bab4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Aug 2018 13:56:53 -0700 Subject: KVM: x86: Unexport x86_emulate_instruction() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allowing x86_emulate_instruction() to be called directly has led to subtle bugs being introduced, e.g. not setting EMULTYPE_NO_REEXECUTE in the emulation type. While most of the blame lies on re-execute being opt-out, exporting x86_emulate_instruction() also exposes its cr2 parameter, which may have contributed to commit d391f1207067 ("x86/kvm/vmx: do not use vm-exit instruction length for fast MMIO when running nested") using x86_emulate_instruction() instead of emulate_instruction() because "hey, I have a cr2!", which in turn introduced its EMULTYPE_NO_REEXECUTE bug. Signed-off-by: Sean Christopherson Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 17 +++-------------- arch/x86/kvm/x86.c | 14 +++++++++++++- arch/x86/kvm/x86.h | 2 ++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8c9023661351..e12916e7c2fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1240,20 +1240,9 @@ enum emulation_result { #define EMULTYPE_ALLOW_RETRY (1 << 3) #define EMULTYPE_NO_UD_ON_FAIL (1 << 4) #define EMULTYPE_VMWARE (1 << 5) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - int emulation_type, void *insn, int insn_len); - -static inline int kvm_emulate_instruction(struct kvm_vcpu *vcpu, - int emulation_type) -{ - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); -} - -static inline int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, - void *insn, int insn_len) -{ - return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); -} +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 915002c7f07e..542f6315444d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6282,7 +6282,19 @@ restart: return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); + +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction); + +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len) +{ + return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 257f27620bc2..67b9568613f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -274,6 
+274,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ -- cgit v1.2.3 From 58f33cfe73076b6497bada4f7b5bda961ed68083 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:03:55 +0200 Subject: tools/kvm_stat: fix python3 issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 3 returns a float for regular division, so switch to the floor division operator, which returns an integer. Furthermore, filter() returns an iterator instead of an actual list, so wrap the result in yet another list, which keeps it working in both Python 2 and 3. Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 56c4b3f8a01b..e10b90a8917a 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -759,7 +759,7 @@ class DebugfsProvider(Provider): if len(vms) == 0: self.do_read = False - self.paths = filter(lambda x: "{}-".format(pid) in x, vms) + self.paths = list(filter(lambda x: "{}-".format(pid) in x, vms)) else: self.paths = [] @@ -1219,10 +1219,10 @@ class Tui(object): (x, term_width) = self.screen.getmaxyx() row = 2 for line in text: - start = (term_width - len(line)) / 2 + start = (term_width - len(line)) // 2 self.screen.addstr(row, start, line) row += 1 - self.screen.addstr(row + 1, (term_width - len(hint)) / 2, hint, + self.screen.addstr(row + 1, (term_width - len(hint)) // 2, hint, curses.A_STANDOUT) self.screen.getkey() -- cgit v1.2.3 From 617c66b9f236d20f11cecbb3f45e6d5675b2fae1 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:03:56 +0200 Subject: tools/kvm_stat: fix handling of invalid paths in debugfs provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When filtering by guest, kvm_stat displays garbage when the guest is destroyed - see sample output below. We add code to remove the invalid paths from the providers, so at least no more garbage is displayed.
Here's a sample output to illustrate:

kvm statistics - pid 13986 (foo)
Event                          Total %Total CurAvg/s
diagnose_258                      -2    0.0        0
deliver_program_interruption      -3    0.0        0
diagnose_308                      -4    0.0        0
halt_poll_invalid                -91    0.0       -6
deliver_service_signal          -244    0.0      -16
halt_successful_poll            -250    0.1      -17
exit_pei                        -285    0.1      -19
exit_external_request           -312    0.1      -21
diagnose_9c                     -328    0.1      -22
userspace_handled               -713    0.1      -47
halt_attempted_poll             -939    0.2      -62
deliver_emergency_signal       -3126    0.6     -208
halt_wakeup                    -7199    1.5     -481
exit_wait_state                -7379    1.5     -493
diagnose_500                  -56499   11.5    -3757
exit_null                     -85491   17.4    -5685
diagnose_44                  -133300   27.1    -8874
exit_instruction             -195898   39.8   -13037
Total                        -492063

Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index e10b90a8917a..b9e8d0def1ab 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -766,6 +766,13 @@ class DebugfsProvider(Provider): self.do_read = True self.reset() + def _verify_paths(self): + """Remove invalid paths""" + for path in self.paths: + if not os.path.exists(os.path.join(PATH_DEBUGFS_KVM, path)): + self.paths.remove(path) + continue + def read(self, reset=0, by_guest=0): """Returns a dict with format:'file name / field -> current value'. @@ -780,6 +787,7 @@ class DebugfsProvider(Provider): # If no debugfs filtering support is available, then don't read. if not self.do_read: return results + self._verify_paths() paths = self.paths if self._pid == 0: -- cgit v1.2.3 From 710ab11ad9329d2d4b044405e328c994b19a2aa9 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:03:57 +0200 Subject: tools/kvm_stat: fix updates for dead guests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With pid filtering active, when a guest is removed e.g. via virsh shutdown, successive updates produce garbage. Therefore, we add code to detect this case and prevent further body updates. Note that when displaying the help dialog via 'h' in this case, once we exit we're stuck with the 'Collecting data...' message till we remove the filter. Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index b9e8d0def1ab..7c92545931e3 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1170,6 +1170,9 @@ class Tui(object): return sorted_items + if not self._is_running_guest(self.stats.pid_filter): + # leave final data on screen + return row = 3 self.screen.move(row, 0) self.screen.clrtobot() @@ -1327,6 +1330,12 @@ class Tui(object): msg = '"' + str(val) + '": Invalid value' self._refresh_header() + def _is_running_guest(self, pid): + """Check if pid is still a running process.""" + if not pid: + return True + return os.path.isdir(os.path.join('/proc/', str(pid))) + def _show_vm_selection_by_guest(self): """Draws guest selection mask.
@@ -1354,7 +1363,7 @@ class Tui(object): if not guest or guest == '0': break if guest.isdigit(): - if not os.path.isdir(os.path.join('/proc/', guest)): + if not self._is_running_guest(guest): msg = '"' + guest + '": Not a running process' continue pid = int(guest) -- cgit v1.2.3 From 0db8b3102368755242b44f2b30f93302c70e8e82 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:03:58 +0200 Subject: tools/kvm_stat: don't reset stats when setting PID filter for debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting a PID filter in debugfs, we unnecessarily reset the statistics, although there is no reason to do so. This behavior was merely introduced with commit 9f114a03c6854f "tools/kvm_stat: add interactive command 'r'", most likely to mimic the behavior of the tracepoints provider in this respect. However, there are plenty of differences between the two providers, so there is no reason not to take advantage of the possibility to filter by PID without resetting the statistics. Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 7c92545931e3..62fbb8802f60 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -764,7 +764,6 @@ class DebugfsProvider(Provider): else: self.paths = [] self.do_read = True - self.reset() def _verify_paths(self): """Remove invalid paths""" -- cgit v1.2.3 From 29c39f38e4e8dbf0497e81db6c985ee59259f002 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:03:59 +0200 Subject: tools/kvm_stat: handle guest removals more gracefully MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running with the DebugFS provider, removal of a guest can result in a negative CurAvg/s, which looks rather confusing. If so, suppress the body refresh and print a message instead. To reproduce, have at least one guest A completely booted. Then start another guest B (which generates a huge amount of events), then destroy B. On the next refresh, kvm_stat should display a whole lot of negative values in the CurAvg/s column. 
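The arithmetic behind those negative values, sketched in C with purely hypothetical numbers:

	/* Illustration only: when the guest disappears, its event counters
	 * drop out of the sum, so the delta against the previous refresh
	 * interval goes negative. */
	long prev = 56499, now = 0, sleeptime = 2;	/* hypothetical */
	long delta = now - prev;		/* -56499 */
	long cur = delta / sleeptime;		/* -28249, shown under CurAvg/s */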
Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 62fbb8802f60..bd620579eb6f 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1194,6 +1194,7 @@ class Tui(object): # print events tavg = 0 tcur = 0 + guest_removed = False for key, values in get_sorted_events(self, stats): if row >= self.screen.getmaxyx()[0] - 1 or values == (0, 0): break @@ -1201,7 +1202,10 @@ class Tui(object): key = self.get_gname_from_pid(key) if not key: continue - cur = int(round(values.delta / sleeptime)) if values.delta else '' + cur = int(round(values.delta / sleeptime)) if values.delta else 0 + if cur < 0: + guest_removed = True + continue if key[0] != ' ': if values.delta: tcur += values.delta @@ -1214,7 +1218,10 @@ class Tui(object): values.value * 100 / float(ltotal), cur)) row += 1 if row == 3: - self.screen.addstr(4, 1, 'No matching events reported yet') + if guest_removed: + self.screen.addstr(4, 1, 'Guest removed, updating...') + else: + self.screen.addstr(4, 1, 'No matching events reported yet') if row > 4: tavg = int(round(tcur / sleeptime)) if tcur > 0 else '' self.screen.addstr(row, 1, '%-40s %10d %8s' % -- cgit v1.2.3 From 404517e40867aef60554ef497d5cf8d089a5b9cf Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:04:00 +0200 Subject: tools/kvm_stat: indicate dead guests as such MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For destroyed guests, kvm_stat essentially freezes with the last data displayed. This is acceptable for users, in case they want to inspect the final data. But it looks a bit irritating. Therefore, detect this situation and display a respective indicator in the header. 
Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index bd620579eb6f..5c2422b0f2f8 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1108,10 +1108,10 @@ class Tui(object): if len(gname) > MAX_GUEST_NAME_LEN else gname)) if pid > 0: - self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}' - .format(pid, gname), curses.A_BOLD) + self._headline = 'kvm statistics - pid {0} {1}'.format(pid, gname) else: - self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + self._headline = 'kvm statistics - summary' + self.screen.addstr(0, 0, self._headline, curses.A_BOLD) if self.stats.fields_filter: regex = self.stats.fields_filter if len(regex) > MAX_REGEX_LEN: @@ -1170,6 +1170,7 @@ class Tui(object): return sorted_items if not self._is_running_guest(self.stats.pid_filter): + self._display_guest_dead() # leave final data on screen return row = 3 @@ -1228,6 +1229,11 @@ class Tui(object): ('Total', total, tavg), curses.A_BOLD) self.screen.refresh() + def _display_guest_dead(self): + marker = ' Guest is DEAD ' + y = min(len(self._headline), 80 - len(marker)) + self.screen.addstr(0, y, marker, curses.A_BLINK | curses.A_STANDOUT) + def _show_msg(self, text): """Display message centered text and exit on key press""" hint = 'Press any key to continue' -- cgit v1.2.3 From c012a0f2677529a0ae8f53a15bd7c61dc4ca5b5e Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Fri, 24 Aug 2018 14:04:01 +0200 Subject: tools/kvm_stat: re-animate display of dead guests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When filtering by guest (interactive commands 'p'/'g'), and the respective guest was destroyed, detect when the guest is up again through the guest name if possible. I.e. when displaying events for a specific guest, it is not necessary anymore to restart kvm_stat in case the guest is restarted. Signed-off-by: Stefan Raspl Signed-off-by: Radim Krčmář --- tools/kvm/kvm_stat/kvm_stat | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 5c2422b0f2f8..439b8a27488d 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1103,6 +1103,7 @@ class Tui(object): pid = self.stats.pid_filter self.screen.erase() gname = self.get_gname_from_pid(pid) + self._gname = gname if gname: gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' if len(gname) > MAX_GUEST_NAME_LEN @@ -1170,6 +1171,15 @@ class Tui(object): return sorted_items if not self._is_running_guest(self.stats.pid_filter): + if self._gname: + try: # ...to identify the guest by name in case it's back + pids = self.get_pid_from_gname(self._gname) + if len(pids) == 1: + self._refresh_header(pids[0]) + self._update_pid(pids[0]) + return + except: + pass self._display_guest_dead() # leave final data on screen return -- cgit v1.2.3 From a11bdb1a6b782ee97587f92fae798efc78c31093 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 30 Aug 2018 10:13:55 +0200 Subject: KVM: s390: Fix pfmf and conditional skey emulation We should not return with a lock. We also have to increase the address when we do page clearing. 
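The corrected pattern, condensed from the hunks below: release the semaphore before any return path, and advance the address unconditionally once a page has been processed:

	up_read(&current->mm->mmap_sem);	/* never return with this held */
	if (rc == -EFAULT)
		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
	if (rc < 0)
		return rc;
	start += PAGE_SIZE;			/* move on to the next page */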
Fixes: bd096f644319 ("KVM: s390: Add skey emulation fault handling") Signed-off-by: Janosch Frank Message-Id: <20180830081355.59234-1-frankja@linux.ibm.com> Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d68f10441a16..8679bd74d337 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -280,9 +280,11 @@ retry: goto retry; } } - if (rc) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); up_read(¤t->mm->mmap_sem); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; vcpu->run->s.regs.gprs[reg1] |= key; return 0; @@ -324,9 +326,11 @@ retry: goto retry; } } - if (rc < 0) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); up_read(¤t->mm->mmap_sem); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; kvm_s390_set_psw_cc(vcpu, rc); return 0; } @@ -390,12 +394,12 @@ static int handle_sske(struct kvm_vcpu *vcpu) FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } + up_read(¤t->mm->mmap_sem); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - up_read(¤t->mm->mmap_sem); - if (rc >= 0) - start += PAGE_SIZE; + if (rc < 0) + return rc; + start += PAGE_SIZE; } if (m3 & (SSKE_MC | SSKE_MR)) { @@ -1002,13 +1006,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } + up_read(¤t->mm->mmap_sem); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - up_read(¤t->mm->mmap_sem); - if (rc >= 0) - start += PAGE_SIZE; + if (rc == -EAGAIN) + continue; + if (rc < 0) + return rc; } + start += PAGE_SIZE; } if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) { -- cgit v1.2.3 From 204c97245612b6c255edf4e21e24d417c4a0c008 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 23 Aug 2018 12:25:54 +0200 Subject: KVM: s390: vsie: copy wrapping keys to right place Copy the key mask to the right offset inside the shadow CRYCB Fixes: bbeaa58b3 ("KVM: s390: vsie: support aes dea wrapping keys") Signed-off-by: Pierre Morel Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank Cc: stable@vger.kernel.org # v4.8+ Message-Id: <1535019956-23539-2-git-send-email-pmorel@linux.ibm.com> Signed-off-by: Janosch Frank Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 63844b95c22c..a2b28cd1e3fe 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -173,7 +173,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0039U); /* copy only the wrapping keys */ - if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + if (read_guest_real(vcpu, crycb_addr + 72, + vsie_page->crycb.dea_wrapping_key_mask, 56)) return set_validity_icpt(scb_s, 0x0035U); scb_s->ecb3 |= ecb3_flags; -- cgit v1.2.3 From df88f3181f10565c6e3a89eb6f0f9e6afaaf15f1 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 30 Aug 2018 16:14:18 +0200 Subject: KVM: s390: Properly lock mm context allow_gmap_hpage_1m setting We have to do down_write on the mm semaphore to set a bitfield in the mm context. 
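Why a lock is needed at all: adjacent bitfields share a storage word, so every bitfield write is a read-modify-write of the whole word. A minimal illustration (hypothetical layout, not the actual s390 struct):

	struct ctx_bits {
		unsigned int alloc_pgste:1;
		unsigned int allow_gmap_hpage_1m:1;	/* same word as neighbors */
	};

	/* "c.allow_gmap_hpage_1m = 1" compiles to roughly:
	 *	tmp = load(word); tmp |= bit; store(word, tmp);
	 * so an unserialized writer can lose a concurrent update to another
	 * bit in the same word; hence the down_write() added below. */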
Signed-off-by: Janosch Frank Fixes: a4499382 ("KVM: s390: Add huge page enablement control") Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/mmu.h | 8 +++++++- arch/s390/kvm/kvm-s390.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index f31a15044c24..a8418e1379eb 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -16,7 +16,13 @@ typedef struct { unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; - /* The mmu context allocates 4K page tables. */ + /* + * The following bitfields need a down_write on the mm + * semaphore when they are written to. As they are only + * written once, they can be read without a lock. + * + * The mmu context allocates 4K page tables. + */ unsigned int alloc_pgste:1; /* The mmu context uses extended page tables. */ unsigned int has_pgste:1; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 91ad4a9425c0..f69333fd2fa3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -695,7 +695,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; + down_write(&kvm->mm->mmap_sem); kvm->mm->context.allow_gmap_hpage_1m = 1; + up_write(&kvm->mm->mmap_sem); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on -- cgit v1.2.3 From 694556d54f354d3fe43bb2e61fd6103cca2638a4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 23 Aug 2018 09:58:27 +0100 Subject: KVM: arm/arm64: Clean dcache to PoC when changing PTE due to CoW When triggering a CoW, we unmap the RO page via an MMU notifier (invalidate_range_start), and then populate the new PTE using another one (change_pte). In the meantime, we'll have copied the old page into the new one. The problem is that the data for the new page is sitting in the cache, and should the guest have an uncached mapping to that page (or its MMU off), following accesses will bypass the cache. In a way, this is similar to what happens on a translation fault: We need to clean the page to the PoC before mapping it. So let's just do that. This fixes a KVM unit test regression observed on a HiSilicon platform, and subsequently reproduced on Seattle. Fixes: a9c0e12ebee5 ("KVM: arm/arm64: Only clean the dcache on translation fault") Cc: stable@vger.kernel.org # v4.16+ Reported-by: Mike Galbraith Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- virt/kvm/arm/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 91aaf73b00df..111a660be3be 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1860,13 +1860,20 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); pte_t stage2_pte; if (!kvm->arch.pgd) return; trace_kvm_set_spte_hva(hva); - stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2); + + /* + * We've moved a page around, probably through CoW, so let's treat it + * just like a translation fault and clean the cache to the PoC. 
+ */ + clean_dcache_guest_page(pfn, PAGE_SIZE); + stage2_pte = pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } -- cgit v1.2.3 From 7d14919c0d475a795c0127631ac8ecb2b0f31831 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 23 Aug 2018 11:51:43 +0100 Subject: arm64: KVM: Only force FPEXC32_EL2.EN if trapping FPSIMD If trapping FPSIMD in the context of an AArch32 guest, it is critical to set FPEXC32_EL2.EN to 1 so that the trapping is taken to EL2 and not EL1. Conversely, it is just as critical *not* to set FPEXC32_EL2.EN to 1 if we're not going to trap FPSIMD, as we then corrupt the existing VFP state. Moving the call to __activate_traps_fpsimd32 to the point where we know for sure that we are going to trap ensures that we don't set that bit spuriously. Fixes: e6b673b741ea ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") Cc: stable@vger.kernel.org # v4.18 Cc: Dave Martin Reported-by: Alexander Graf Tested-by: Alexander Graf Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/switch.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index d496ef579859..ca46153d7915 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -98,8 +98,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; val &= ~CPACR_EL1_ZEN; - if (!update_fp_enabled(vcpu)) + if (!update_fp_enabled(vcpu)) { val &= ~CPACR_EL1_FPEN; + __activate_traps_fpsimd32(vcpu); + } write_sysreg(val, cpacr_el1); @@ -114,8 +116,10 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu) val = CPTR_EL2_DEFAULT; val |= CPTR_EL2_TTA | CPTR_EL2_TZ; - if (!update_fp_enabled(vcpu)) + if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP; + __activate_traps_fpsimd32(vcpu); + } write_sysreg(val, cptr_el2); } @@ -129,7 +133,6 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); - __activate_traps_fpsimd32(vcpu); if (has_vhe()) activate_traps_vhe(vcpu); else -- cgit v1.2.3 From a35381e10dc46dd75e65e4b3832d9a0005d48d44 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 23 Aug 2018 10:18:14 +0100 Subject: KVM: Remove obsolete kvm_unmap_hva notifier backend kvm_unmap_hva is long gone, and we only have kvm_unmap_hva_range to deal with. Drop the now obsolete code. 
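The removed single-page variants were just degenerate ranges; e.g. the MIPS version visible in the diff below is equivalent to:

	/* one page is simply the range [hva, hva + PAGE_SIZE): */
	kvm_unmap_hva_range(kvm, hva, hva + PAGE_SIZE);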
Fixes: fb1522e099f0 ("KVM: update to new mmu_notifier semantic v2") Cc: James Hogan Reviewed-by: Paolo Bonzini Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm64/include/asm/kvm_host.h | 1 - arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kvm/mmu.c | 10 ---------- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 5 ----- virt/kvm/arm/mmu.c | 12 ------------ virt/kvm/arm/trace.h | 15 --------------- 8 files changed, 46 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 79906cecb091..3ad482d2f1eb 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -223,7 +223,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f26055f2306e..8e6d46df38aa 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -357,7 +357,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a9af1d2dcd69..2c1c53d12179 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -931,7 +931,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, bool write); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index ee64db032793..d8dcdb350405 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -512,16 +512,6 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return 1; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - unsigned long end = hva + PAGE_SIZE; - - handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); - - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 00ddb0c9e612..e6a33420b871 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1450,7 +1450,6 @@ asmlinkage void kvm_spurious_fault(void); ____kvm_handle_fault_on_reboot(insn, "") #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a282321329b5..d440154e8938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1853,11 +1853,6 
@@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 111a660be3be..ed162a6c57c5 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1817,18 +1817,6 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat return 0; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - unsigned long end = hva + PAGE_SIZE; - - if (!kvm->arch.pgd) - return 0; - - trace_kvm_unmap_hva(hva); - handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); - return 0; -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index e53b596f483b..57b3edebbb40 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -134,21 +134,6 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier unmap hva: %#08lx", __entry->hva) -); - TRACE_EVENT(kvm_unmap_hva_range, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end), -- cgit v1.2.3
From df3190e22016abf74ef67c9691e9fa1012a66bd5 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 13 Aug 2018 17:04:53 +0100 Subject: arm64: KVM: Remove pgd_lock The lock has never been used, and the page tables are already protected by mmu_lock in struct kvm. Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_host.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8e6d46df38aa..3d6d7336f871 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -61,8 +61,7 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table and lock */ - spinlock_t pgd_lock; + /* 1-level 2nd stage table, protected by kvm->mmu_lock */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ -- cgit v1.2.3
From b5861e5cf2fcf83031ea3e26b0a69d887adf7d21 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Mon, 3 Sep 2018 15:20:22 +0300 Subject: KVM: nVMX: Fix loss of pending IRQ/NMI before entering L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider the case where L1 had a pending IRQ/NMI event up until it executed VMLAUNCH/VMRESUME, the event not having been delivered because delivery was disallowed (e.g. interrupts disabled). When L1 executes VMLAUNCH/VMRESUME, L0 needs to evaluate whether this pending event should cause an exit from L2 to L1 or be delivered directly to L2 (e.g. in case L1 doesn't intercept EXTERNAL_INTERRUPT). Usually this would be handled by L0 requesting an IRQ/NMI window by setting the VMCS accordingly. However, that setting was done on VMCS01, and now VMCS02 is active instead. Thus, when L1 executes VMLAUNCH/VMRESUME, we force L0 to perform pending event evaluation by requesting KVM_REQ_EVENT.
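To make the ordering concrete, the following is a small self-contained userspace model of the situation the patch handles (a sketch only: fake_vmcs, fake_vcpu, enter_nested() and event_request are invented stand-ins for the VMCS pair and KVM_REQ_EVENT, not KVM structures or APIs; only the two CPU_BASED_* bit values match real VMX). The window request lives in vmcs01's execution controls, so it has to be sampled before vmcs02 becomes the active VMCS; sampling afterwards would read vmcs02 and lose the request.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Real VMX control bits; everything else here is an invented model. */
#define CPU_BASED_VIRTUAL_INTR_PENDING	(1u << 2)
#define CPU_BASED_VIRTUAL_NMI_PENDING	(1u << 22)

struct fake_vmcs {
	uint32_t cpu_exec_ctrl;
};

struct fake_vcpu {
	struct fake_vmcs *active;	/* the currently loaded VMCS */
	bool event_request;		/* stand-in for KVM_REQ_EVENT */
};

/* Models the vmentry path: sample vmcs01 before the switch to vmcs02. */
static void enter_nested(struct fake_vcpu *vcpu, struct fake_vmcs *vmcs02)
{
	uint32_t vmcs01_ctrl = vcpu->active->cpu_exec_ctrl;	/* read first */

	vcpu->active = vmcs02;	/* from here on, reads would hit vmcs02 */

	/* The IRQ/NMI window requested on vmcs01 must not be lost. */
	if (vmcs01_ctrl & (CPU_BASED_VIRTUAL_INTR_PENDING |
			   CPU_BASED_VIRTUAL_NMI_PENDING))
		vcpu->event_request = true;
}

int main(void)
{
	struct fake_vmcs vmcs01 = { .cpu_exec_ctrl = CPU_BASED_VIRTUAL_INTR_PENDING };
	struct fake_vmcs vmcs02 = { .cpu_exec_ctrl = 0 };
	struct fake_vcpu vcpu = { .active = &vmcs01, .event_request = false };

	enter_nested(&vcpu, &vmcs02);
	printf("re-evaluate pending events: %s\n",
	       vcpu.event_request ? "yes" : "no");	/* prints "yes" */
	return 0;
}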
Note that the above scenario exists when L1 KVM is about to enter L2 but requests an "immediate-exit", as in that case L1 will disable interrupts and then send a self-IPI before entering L2. Reviewed-by: Nikita Leshchenko Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Liran Alon Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f910d33858d9..533a327372c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12537,8 +12537,11 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; + u32 vmcs01_cpu_exec_ctrl; int r = 0; + vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -12574,6 +12577,25 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); } + /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate whether this pending event should cause an exit + * from L2 to L1 or be delivered directly to L2 (e.g. in case + * L1 doesn't intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by L0 requesting an + * IRQ/NMI window by setting the VMCS accordingly. However, + * this setting was done on VMCS01 and now VMCS02 is active + * instead. Thus, we force L0 to perform pending event + * evaluation by requesting KVM_REQ_EVENT. + */ + if (vmcs01_cpu_exec_ctrl & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet -- cgit v1.2.3
From bdf7ffc89922a52a4f08a12f7421ea24bb7626a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 30 Aug 2018 10:03:30 +0800 Subject: KVM: LAPIC: Fix pv ipis out-of-bounds access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dan Carpenter reported that untrusted data returned from kvm_register_read() results in the following static checker warning: arch/x86/kvm/lapic.c:576 kvm_pv_send_ipi() error: buffer underflow 'map->phys_map' 's32min-s32max' A KVM guest can easily trigger this by executing the following assembly sequence in Ring 0: mov $10, %rax mov $0xFFFFFFFF, %rbx mov $0xFFFFFFFF, %rdx mov $0, %rsi vmcall This causes KVM to execute the code path vmx_handle_exit() -> handle_vmcall() -> kvm_emulate_hypercall() -> kvm_pv_send_ipi(), which reaches the out-of-bounds access. This patch fixes it by adding a check against map->max_apic_id in kvm_pv_send_ipi(), ignoring destinations that are not present and delivering the rest. We also check whether map->phys_map[min + i] is NULL, since max_apic_id is set to the maximum APIC ID and some phys_map entries may be NULL when the APIC ID space is sparse; in particular, KVM unconditionally sets max_apic_id to 255 to reserve enough space for any xAPIC ID. Reported-by: Dan Carpenter Reviewed-by: Liran Alon Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Liran Alon Cc: Dan Carpenter Signed-off-by: Wanpeng Li [Add second "if (min > map->max_apic_id)" to complete the fix.
-Radim] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c | 27 ++++++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3ad10f634d4c..8e90488c3d56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1455,7 +1455,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); u64 kvm_get_arch_capabilities(void); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0cefba28c864..17c0472c5b34 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -548,7 +548,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { int i; @@ -571,18 +571,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (min > map->max_apic_id) + goto out; /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + for_each_set_bit(i, &ipi_bitmap_low, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } min += cluster_size; - for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + + if (min > map->max_apic_id) + goto out; + + for_each_set_bit(i, &ipi_bitmap_high, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } +out: rcu_read_unlock(); return count; } -- cgit v1.2.3
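To see the two range guards and the NULL check of the LAPIC fix above in action, here is a self-contained userspace sketch (fake_map, the phys_map strings and send_bitmap() are invented for illustration; the kernel's loop walks struct kvm_apic_map, calls kvm_apic_set_irq(), and repeats once more for ipi_bitmap_high after advancing min by cluster_size):

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_LONG ((uint32_t)(8 * sizeof(unsigned long)))

struct fake_map {
	uint32_t max_apic_id;
	const char *phys_map[256 + 1];	/* NULL entries model sparse APIC IDs */
};

static uint32_t min_u32(uint32_t a, uint32_t b)
{
	return a < b ? a : b;
}

/* Mirrors the fixed kvm_pv_send_ipi() loop for a single bitmap. */
static int send_bitmap(const struct fake_map *map, unsigned long bitmap,
		       uint32_t min)
{
	int count = 0;
	uint32_t i, limit;

	if (min > map->max_apic_id)
		return 0;	/* whole bitmap is beyond the map: ignore it */

	/* Never index past max_apic_id, exactly like the fix. */
	limit = min_u32(BITS_PER_LONG, map->max_apic_id - min + 1);
	for (i = 0; i < limit; i++) {
		if (!(bitmap & (1ul << i)))
			continue;
		if (map->phys_map[min + i]) {	/* skip sparse/absent IDs */
			printf("IPI -> %s\n", map->phys_map[min + i]);
			count++;
		}
	}
	return count;
}

int main(void)
{
	struct fake_map map = { .max_apic_id = 7 };

	map.phys_map[1] = "vcpu1";
	map.phys_map[3] = "vcpu3";

	/* An all-ones bitmap from the guest only reaches IDs 0..7. */
	printf("delivered %d IPIs\n", send_bitmap(&map, 0xFFFFFFFFul, 0));
	return 0;
}

With min = 0, max_apic_id = 7 and an all-ones bitmap, only bits 0..7 are scanned and only the two present entries receive an IPI, which mirrors how the fix ignores absent destinations and delivers the rest.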