From eb3992e833d3a17f9b0a3e0371d0b1d3d566f740 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:31:32 +0000 Subject: KVM: VMX: Resume guest immediately when injecting #GP on ECREATE Resume the guest immediately when injecting a #GP on ECREATE due to an invalid enclave size, i.e. don't attempt ECREATE in the host. The #GP is a terminal fault, e.g. skipping the instruction if ECREATE is successful would result in KVM injecting #GP on the instruction following ECREATE. Fixes: 70210c044b4e ("KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions") Cc: stable@vger.kernel.org Cc: Kai Huang Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20220930233132.1723330-1-seanjc@google.com --- arch/x86/kvm/vmx/sgx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 8f95c7c01433..b12da2a6dec9 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -182,8 +182,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, /* Enforce CPUID restriction on max enclave size. */ max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 : sgx_12_0->edx; - if (size >= BIT_ULL(max_size_log2)) + if (size >= BIT_ULL(max_size_log2)) { kvm_inject_gp(vcpu, 0); + return 1; + } /* * sgx_virt_ecreate() returns: -- cgit v1.2.3 From 4265df667bbdc71c640e43c905bd9aeeead92365 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 8 Nov 2022 11:50:54 +0800 Subject: KVM: x86: Keep the lock order consistent between SRCU and gpc spinlock Acquire SRCU before taking the gpc spinlock in wait_pending_event() so as to be consistent with all other functions that acquire both locks. It's not illegal to acquire SRCU inside a spinlock, nor is there deadlock potential, but in general it's preferable to order locks from least restrictive to most restrictive, e.g. if wait_pending_event() needed to sleep for whatever reason, it could do so while holding SRCU, but would need to drop the spinlock. Signed-off-by: Peng Hao Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/CAPm50a++Cb=QfnjMZ2EnCj-Sb9Y4UM-=uOEtHAcjnNLCAAf-dQ@mail.gmail.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9187d024d006..2f21fa5ee7de 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1165,8 +1165,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, bool ret = true; int idx, i; - read_lock_irqsave(&gpc->lock, flags); idx = srcu_read_lock(&kvm->srcu); + read_lock_irqsave(&gpc->lock, flags); if (!kvm_gpc_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) goto out_rcu; @@ -1187,8 +1187,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, } out_rcu: - srcu_read_unlock(&kvm->srcu, idx); read_unlock_irqrestore(&gpc->lock, flags); + srcu_read_unlock(&kvm->srcu, idx); return ret; } -- cgit v1.2.3 From 17122c06b86c9f77f45b86b8e62c3ed440847a59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:36:32 +0000 Subject: KVM: x86: Fail emulation during EMULTYPE_SKIP on any exception Treat any exception during instruction decode for EMULTYPE_SKIP as a "full" emulation failure, i.e. signal failure instead of queuing the exception. When decoding purely to skip an instruction, KVM and/or the CPU has already done some amount of emulation that cannot be unwound, e.g. on an EPT misconfig VM-Exit KVM has already processeed the emulated MMIO. KVM already does this if a #UD is encountered, but not for other exceptions, e.g. if a #PF is encountered during fetch. In SVM's soft-injection use case, queueing the exception is particularly problematic as queueing exceptions while injecting events can put KVM into an infinite loop due to bailing from VM-Enter to service the newly pending exception. E.g. multiple warnings to detect such behavior fire: ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1017 at arch/x86/kvm/x86.c:9873 kvm_arch_vcpu_ioctl_run+0x1de5/0x20a0 [kvm] Modules linked in: kvm_amd ccp kvm irqbypass CPU: 3 PID: 1017 Comm: svm_nested_soft Not tainted 6.0.0-rc1+ #220 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x1de5/0x20a0 [kvm] Call Trace: kvm_vcpu_ioctl+0x223/0x6d0 [kvm] __x64_sys_ioctl+0x85/0xc0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ---[ end trace 0000000000000000 ]--- ------------[ cut here ]------------ WARNING: CPU: 3 PID: 1017 at arch/x86/kvm/x86.c:9987 kvm_arch_vcpu_ioctl_run+0x12a3/0x20a0 [kvm] Modules linked in: kvm_amd ccp kvm irqbypass CPU: 3 PID: 1017 Comm: svm_nested_soft Tainted: G W 6.0.0-rc1+ #220 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x12a3/0x20a0 [kvm] Call Trace: kvm_vcpu_ioctl+0x223/0x6d0 [kvm] __x64_sys_ioctl+0x85/0xc0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ---[ end trace 0000000000000000 ]--- Fixes: 6ea6e84309ca ("KVM: x86: inject exceptions produced by x86_decode_insn") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220930233632.1725475-1-seanjc@google.com --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7f850dfb4086..ef12747ecb63 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8772,7 +8772,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; - if (ctxt->have_exception) { + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. -- cgit v1.2.3 From 5c30e8101e8d5d020b1d7119117889756a6ed713 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 30 Sep 2022 23:40:31 +0000 Subject: KVM: SVM: Skip WRMSR fastpath on VM-Exit if next RIP isn't valid Skip the WRMSR fastpath in SVM's VM-Exit handler if the next RIP isn't valid, e.g. because KVM is running with nrips=false. SVM must decode and emulate to skip the WRMSR if the CPU doesn't provide the next RIP. Getting the instruction bytes to decode the WRMSR requires reading guest memory, which in turn means dereferencing memslots, and that isn't safe because KVM doesn't hold SRCU when the fastpath runs. Don't bother trying to enable the fastpath for this case, e.g. by doing only the WRMSR and leaving the "skip" until later. NRIPS is supported on all modern CPUs (KVM has considered making it mandatory), and the next RIP will be valid the vast, vast majority of the time. ============================= WARNING: suspicious RCU usage 6.0.0-smp--4e557fcd3d80-skip #13 Tainted: G O ----------------------------- include/linux/kvm_host.h:954 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by stable/206475: #0: ffff9d9dfebcc0f0 (&vcpu->mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x8b/0x620 [kvm] stack backtrace: CPU: 152 PID: 206475 Comm: stable Tainted: G O 6.0.0-smp--4e557fcd3d80-skip #13 Hardware name: Google, Inc. Arcadia_IT_80/Arcadia_IT_80, BIOS 10.48.0 01/27/2022 Call Trace: dump_stack_lvl+0x69/0xaa dump_stack+0x10/0x12 lockdep_rcu_suspicious+0x11e/0x130 kvm_vcpu_gfn_to_memslot+0x155/0x190 [kvm] kvm_vcpu_gfn_to_hva_prot+0x18/0x80 [kvm] paging64_walk_addr_generic+0x183/0x450 [kvm] paging64_gva_to_gpa+0x63/0xd0 [kvm] kvm_fetch_guest_virt+0x53/0xc0 [kvm] __do_insn_fetch_bytes+0x18b/0x1c0 [kvm] x86_decode_insn+0xf0/0xef0 [kvm] x86_emulate_instruction+0xba/0x790 [kvm] kvm_emulate_instruction+0x17/0x20 [kvm] __svm_skip_emulated_instruction+0x85/0x100 [kvm_amd] svm_skip_emulated_instruction+0x13/0x20 [kvm_amd] handle_fastpath_set_msr_irqoff+0xae/0x180 [kvm] svm_vcpu_run+0x4b8/0x5a0 [kvm_amd] vcpu_enter_guest+0x16ca/0x22f0 [kvm] kvm_arch_vcpu_ioctl_run+0x39d/0x900 [kvm] kvm_vcpu_ioctl+0x538/0x620 [kvm] __se_sys_ioctl+0x77/0xc0 __x64_sys_ioctl+0x1d/0x20 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 404d5d7bff0d ("KVM: X86: Introduce more exit_fastpath_completion enum values") Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220930234031.1732249-1-seanjc@google.com --- arch/x86/kvm/svm/svm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 91352d692845..6ffadbd57744 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3895,8 +3895,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + /* + * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM + * can't read guest memory (dereference memslots) to decode the WRMSR. + */ + if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 && + nrips && control->next_rip) return handle_fastpath_set_msr_irqoff(vcpu); return EXIT_FASTPATH_NONE; -- cgit v1.2.3 From a8a12c0069b9e3c909b3c22bf8711d2f18a0af97 Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Wed, 28 Sep 2022 17:27:48 +0800 Subject: KVM: SVM: Replace kmap_atomic() with kmap_local_page() The use of kmap_atomic() is being deprecated in favor of kmap_local_page()[1]. The main difference between atomic and local mappings is that local mappings don't disable page faults or preemption. There're 2 reasons we can use kmap_local_page() here: 1. SEV is 64-bit only and kmap_local_page() doesn't disable migration in this case, but here the function clflush_cache_range() uses CLFLUSHOPT instruction to flush, and on x86 CLFLUSHOPT is not CPU-local and flushes the page out of the entire cache hierarchy on all CPUs (APM volume 3, chapter 3, CLFLUSHOPT). So there's no need to disable preemption to ensure CPU-local. 2. clflush_cache_range() doesn't need to disable pagefault and the mapping is still valid even if sleeps. This is also true for sched out/in when preempted. In addition, though kmap_local_page() is a thin wrapper around page_address() on 64-bit, kmap_local_page() should still be used here in preference to page_address() since page_address() isn't suitable to be used in a generic function (like sev_clflush_pages()) where the page passed in is not easy to determine the source of allocation. Keeping the kmap* API in place means it can be used for things other than highmem mappings[2]. Therefore, sev_clflush_pages() is a function that should use kmap_local_page() in place of kmap_atomic(). Convert the calls of kmap_atomic() / kunmap_atomic() to kmap_local_page() / kunmap_local(). [1]: https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com [2]: https://lore.kernel.org/lkml/5d667258-b58b-3d28-3609-e7914c99b31b@intel.com/ Suggested-by: Dave Hansen Suggested-by: Ira Weiny Suggested-by: Fabio M. De Francesco Signed-off-by: Zhao Liu Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220928092748.463631-1-zhao1.liu@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 69dbf17f0d6a..86d6897f4806 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -465,9 +465,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) return; for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); + page_virtual = kmap_local_page(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); + kunmap_local(page_virtual); cond_resched(); } } -- cgit v1.2.3 From 9cc409325ddd776f6fd6293d5ce93ce1248af6e4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Oct 2022 00:19:56 +0000 Subject: KVM: nVMX: Inject #GP, not #UD, if "generic" VMXON CR0/CR4 check fails Inject #GP for if VMXON is attempting with a CR0/CR4 that fails the generic "is CRx valid" check, but passes the CR4.VMXE check, and do the generic checks _after_ handling the post-VMXON VM-Fail. The CR4.VMXE check, and all other #UD cases, are special pre-conditions that are enforced prior to pivoting on the current VMX mode, i.e. occur before interception if VMXON is attempted in VMX non-root mode. All other CR0/CR4 checks generate #GP and effectively have lower priority than the post-VMXON check. Per the SDM: IF (register operand) or (CR0.PE = 0) or (CR4.VMXE = 0) or ... THEN #UD; ELSIF not in VMX operation THEN IF (CPL > 0) or (in A20M mode) or (the values of CR0 and CR4 are not supported in VMX operation) THEN #GP(0); ELSIF in VMX non-root operation THEN VMexit; ELSIF CPL > 0 THEN #GP(0); ELSE VMfail("VMXON executed in VMX root operation"); FI; which, if re-written without ELSIF, yields: IF (register operand) or (CR0.PE = 0) or (CR4.VMXE = 0) or ... THEN #UD IF in VMX non-root operation THEN VMexit; IF CPL > 0 THEN #GP(0) IF in VMX operation THEN VMfail("VMXON executed in VMX root operation"); IF (in A20M mode) or (the values of CR0 and CR4 are not supported in VMX operation) THEN #GP(0); Note, KVM unconditionally forwards VMXON VM-Exits that occur in L2 to L1, i.e. there is no need to check the vCPU is not in VMX non-root mode. Add a comment to explain why unconditionally forwarding such exits is functionally correct. Reported-by: Eric Li Fixes: c7d855c2aff2 ("KVM: nVMX: Inject #UD if VMXON is attempted with incompatible CR0/CR4") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20221006001956.329314-1-seanjc@google.com --- arch/x86/kvm/vmx/nested.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b28be793de29..892791019968 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5131,24 +5131,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks - * that have higher priority than VM-Exit (see Intel SDM's pseudocode - * for VMXON), as KVM must load valid CR0/CR4 values into hardware while - * running the guest, i.e. KVM needs to check the _guest_ values. + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). * - * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and - * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real - * Mode, but KVM will never take the guest out of those modes. + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || - !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* - * CPL=0 and all other checks that are lower priority than VM-Exit must - * be checked manually. + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); @@ -5158,6 +5169,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); -- cgit v1.2.3 From 4f209989586c79e9bf59ba9381101f5fb449dfbb Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 19 Oct 2022 14:36:19 -0700 Subject: KVM: VMX: Guest usage of IA32_SPEC_CTRL is likely At this point in time, most guests (in the default, out-of-the-box configuration) are likely to use IA32_SPEC_CTRL. Therefore, drop the compiler hint that it is unlikely for KVM to be intercepting WRMSR of IA32_SPEC_CTRL. Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221019213620.1953281-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cea8c07f5229..cb40f724d8cc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -858,7 +858,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; return flags; -- cgit v1.2.3 From 2e7eab81425ad6c875f2ed47c0ce01e78afc38a5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 19 Oct 2022 14:36:20 -0700 Subject: KVM: VMX: Execute IBPB on emulated VM-exit when guest has IBRS According to Intel's document on Indirect Branch Restricted Speculation, "Enabling IBRS does not prevent software from controlling the predicted targets of indirect branches of unrelated software executed later at the same predictor mode (for example, between two different user applications, or two different virtual machines). Such isolation can be ensured through use of the Indirect Branch Predictor Barrier (IBPB) command." This applies to both basic and enhanced IBRS. Since L1 and L2 VMs share hardware predictor modes (guest-user and guest-kernel), hardware IBRS is not sufficient to virtualize IBRS. (The way that basic IBRS is implemented on pre-eIBRS parts, hardware IBRS is actually sufficient in practice, even though it isn't sufficient architecturally.) For virtual CPUs that support IBRS, add an indirect branch prediction barrier on emulated VM-exit, to ensure that the predicted targets of indirect branches executed in L1 cannot be controlled by software that was executed in L2. Since we typically don't intercept guest writes to IA32_SPEC_CTRL, perform the IBPB at emulated VM-exit regardless of the current IA32_SPEC_CTRL.IBRS value, even though the IBPB could technically be deferred until L1 sets IA32_SPEC_CTRL.IBRS, if IA32_SPEC_CTRL.IBRS is clear at emulated VM-exit. This is CVE-2022-2196. Fixes: 5c911beff20a ("KVM: nVMX: Skip IBPB when switching between vmcs01 and vmcs02") Cc: Sean Christopherson Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20221019213620.1953281-3-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 11 +++++++++++ arch/x86/kvm/vmx/vmx.c | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 892791019968..61c83424285c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4798,6 +4798,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); + /* + * If IBRS is advertised to the vCPU, KVM must flush the indirect + * branch predictors when transitioning from L2 to L1, as L1 expects + * hardware (KVM in this case) to provide separate predictor modes. + * Bare metal isolates VMX root (host) from VMX non-root (guest), but + * doesn't isolate different VMCSs, i.e. in this case, doesn't provide + * separate modes for L2 vs L1. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + indirect_branch_prediction_barrier(); + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb40f724d8cc..3f31c46c306e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1348,8 +1348,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, /* * No indirect branch prediction barrier needed when switching - * the active VMCS within a guest, e.g. on nested VM-Enter. - * The L1 VMM can protect itself with retpolines, IBPB or IBRS. + * the active VMCS within a vCPU, unless IBRS is advertised to + * the vCPU. To minimize the number of IBPBs executed, KVM + * performs IBPB on nested VM-Exit (a single nested transition + * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); -- cgit v1.2.3 From 4a8fd4a720f8a8dbc370076d26388176c311218a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 31 Aug 2022 00:07:21 +0000 Subject: KVM: nVMX: Reword comments about generating nested CR0/4 read shadows Reword the comments that (attempt to) document nVMX's overrides of the CR0/4 read shadows for L2 after calling vmx_set_cr0/4(). The important behavior that needs to be documented is that KVM needs to override the shadows to account for L1's masks even though the shadows are set by the common helpers (and that setting the shadows first would result in the correct shadows being clobbered). Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220831000721.4066617-1-seanjc@google.com --- arch/x86/kvm/vmx/nested.c | 9 +++------ arch/x86/kvm/vmx/nested.h | 7 ++++--- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 61c83424285c..b6f4411b613e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2588,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_ept_init_mmu_context(vcpu); /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we - * have more bits than L1 expected. + * Override the CR0/CR4 read shadows after setting the effective guest + * CR0/CR4. The common helpers also set the shadows, but they don't + * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6312c9541c3c..96952263b029 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) } /* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). + * Return the cr0/4 value that a nested guest would read. This is a combination + * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed + * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as + * the value+mask loaded into vmcs02 may not match the vmcs12 fields. */ static inline unsigned long nested_read_cr0(struct vmcs12 *fields) { -- cgit v1.2.3 From 0b5e7a16a0a79a3742f0df9e45bca46f01b40e6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 28 Sep 2022 23:20:15 +0000 Subject: KVM: VMX: Make vmread_error_trampoline() uncallable from C code Declare vmread_error_trampoline() as an opaque symbol so that it cannot be called from C code, at least not without some serious fudging. The trampoline always passes parameters on the stack so that the inline VMREAD sequence doesn't need to clobber registers. regparm(0) was originally added to document the stack behavior, but it ended up being confusing because regparm(0) is a nop for 64-bit targets. Opportunustically wrap the trampoline and its declaration in #ifdeffery to make it even harder to invoke incorrectly, to document why it exists, and so that it's not left behind if/when CONFIG_CC_HAS_ASM_GOTO_OUTPUT is true for all supported toolchains. No functional change intended. Cc: Uros Bizjak Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220928232015.745948-1-seanjc@google.com --- arch/x86/kvm/vmx/vmenter.S | 2 ++ arch/x86/kvm/vmx/vmx_ops.h | 18 ++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0b5db4de4d09..766c6b3ef5ed 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -269,6 +269,7 @@ SYM_FUNC_END(__vmx_vcpu_run) .section .text, "ax" +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed @@ -317,6 +318,7 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) +#endif SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) /* diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index f6f23c7397dc..842dc898c972 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -11,14 +11,28 @@ #include "../x86.h" void vmread_error(unsigned long field, bool fault); -__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, - bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +/* + * The VMREAD error trampoline _always_ uses the stack to pass parameters, even + * for 64-bit targets. Preserving all registers allows the VMREAD inline asm + * blob to avoid clobbering GPRs, which in turn allows the compiler to better + * optimize sequences of VMREADs. + * + * Declare the trampoline as an opaque label as it's not safe to call from C + * code; there is no way to tell the compiler to pass params on the stack for + * 64-bit targets. + * + * void vmread_error_trampoline(unsigned long field, bool fault); + */ +extern unsigned long vmread_error_trampoline; +#endif + static __always_inline void vmcs_check16(unsigned long field) { BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, -- cgit v1.2.3 From d2a00af2061db863890e32a4a99a6f82c330df1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 23:23:51 +0000 Subject: KVM: VMX: Allow userspace to set all supported FEATURE_CONTROL bits Allow userspace to set all supported bits in MSR IA32_FEATURE_CONTROL irrespective of the guest CPUID model, e.g. via KVM_SET_MSRS. KVM's ABI is that userspace is allowed to set MSRs before CPUID, i.e. can set MSRs to values that would fault according to the guest CPUID model. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220607232353.3375324-2-seanjc@google.com --- arch/x86/kvm/vmx/vmx.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3f31c46c306e..7be1fb50a753 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1836,12 +1836,38 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) +/* + * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of + * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain + * backwards compatibility even though KVM doesn't support emulating SMX. And + * because userspace set "VMX in SMX", the guest must also be allowed to set it, + * e.g. if the MSR is left unlocked and the guest does a RMW operation. + */ +#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ + FEAT_CTL_SGX_LC_ENABLED | \ + FEAT_CTL_SGX_ENABLED | \ + FEAT_CTL_LMCE_ENABLED) + +static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + uint64_t valid_bits; + + /* + * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are + * exposed to the guest. + */ + WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & + ~KVM_SUPPORTED_FEATURE_CONTROL); + + if (msr->host_initiated) + valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; + else + valid_bits = vmx->msr_ia32_feature_control_valid_bits; - return !(val & ~valid_bits); + return !(msr->data & ~valid_bits); } static int vmx_get_msr_feature(struct kvm_msr_entry *msr) @@ -2240,7 +2266,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vcpu, data) || + if (!vmx_feature_control_msr_valid(vmx, msr_info) || (to_vmx(vcpu)->msr_ia32_feature_control & FEAT_CTL_LOCKED && !msr_info->host_initiated)) return 1; -- cgit v1.2.3 From 2d6cd68636d60822219074b7c1d0bfe41321f106 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jun 2022 23:23:52 +0000 Subject: KVM: VMX: Move MSR_IA32_FEAT_CTL.LOCKED check into "is valid" helper Move the check on IA32_FEATURE_CONTROL being locked, i.e. read-only from the guest, into the helper to check the overall validity of the incoming value. Opportunistically rename the helper to make it clear that it returns a bool. No functional change intended. Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20220607232353.3375324-3-seanjc@google.com --- arch/x86/kvm/vmx/vmx.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7be1fb50a753..fe5615fd8295 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1850,8 +1850,8 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) FEAT_CTL_SGX_ENABLED | \ FEAT_CTL_LMCE_ENABLED) -static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, - struct msr_data *msr) +static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, + struct msr_data *msr) { uint64_t valid_bits; @@ -1862,6 +1862,10 @@ static inline bool vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & ~KVM_SUPPORTED_FEATURE_CONTROL); + if (!msr->host_initiated && + (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) + return false; + if (msr->host_initiated) valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; else @@ -2266,10 +2270,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: - if (!vmx_feature_control_msr_valid(vmx, msr_info) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEAT_CTL_LOCKED && !msr_info->host_initiated)) + if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) return 1; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); -- cgit v1.2.3 From 3ebcbd2244f5a69e06e5f655bfbd8127c08201c7 Mon Sep 17 00:00:00 2001 From: Anton Romanov Date: Wed, 8 Jun 2022 18:35:26 +0000 Subject: KVM: x86: Use current rather than snapshotted TSC frequency if it is constant Don't snapshot tsc_khz into per-cpu cpu_tsc_khz if the host TSC is constant, in which case the actual TSC frequency will never change and thus capturing TSC during initialization is unnecessary, KVM can simply use tsc_khz. This value is snapshotted from kvm_timer_init->kvmclock_cpu_online->tsc_khz_changed(NULL) On CPUs with constant TSC, but not a hardware-specified TSC frequency, snapshotting cpu_tsc_khz and using that to set a VM's target TSC frequency can lead to VM to think its TSC frequency is not what it actually is if refining the TSC completes after KVM snapshots tsc_khz. The actual frequency never changes, only the kernel's calculation of what that frequency is changes. Ideally, KVM would not be able to race with TSC refinement, or would have a hook into tsc_refine_calibration_work() to get an alert when refinement is complete. Avoiding the race altogether isn't practical as refinement takes a relative eternity; it's deliberately put on a work queue outside of the normal boot sequence to avoid unnecessarily delaying boot. Adding a hook is doable, but somewhat gross due to KVM's ability to be built as a module. And if the TSC is constant, which is likely the case for every VMX/SVM-capable CPU produced in the last decade, the race can be hit if and only if userspace is able to create a VM before TSC refinement completes; refinement is slow, but not that slow. For now, punt on a proper fix, as not taking a snapshot can help some uses cases and not taking a snapshot is arguably correct irrespective of the race with refinement. Signed-off-by: Anton Romanov Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20220608183525.1143682-1-romanton@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ef12747ecb63..152ea4993b76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2974,6 +2974,22 @@ static void kvm_update_masterclock(struct kvm *kvm) kvm_end_pvclock_update(kvm); } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { @@ -2984,7 +3000,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); data->flags = 0; - if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { #ifdef CONFIG_X86_64 struct timespec64 ts; @@ -2998,7 +3015,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); @@ -3108,7 +3125,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -9038,9 +9055,11 @@ static void tsc_khz_changed(void *data) struct cpufreq_freqs *freq = data; unsigned long khz = 0; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); + if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -9061,8 +9080,10 @@ static void kvm_hyperv_tsc_notifier(void) hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -9199,10 +9220,10 @@ static void kvm_timer_init(void) } cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", - kvmclock_cpu_online, kvmclock_cpu_down_prep); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", + kvmclock_cpu_online, kvmclock_cpu_down_prep); + } } #ifdef CONFIG_X86_64 @@ -9362,10 +9383,11 @@ void kvm_arch_exit(void) #endif kvm_lapic_exit(); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + } #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); irq_work_sync(&pvclock_irq_work); -- cgit v1.2.3