summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx/nested.c
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2022-12-06 12:29:06 -0500
committerPaolo Bonzini <pbonzini@redhat.com>2022-12-12 15:54:07 -0500
commit9352e7470a1b4edd2fa9d235420ecc7bc3971bdc (patch)
tree141ccdb777f2ee36764f2067ab43f23da114e08a /arch/x86/kvm/vmx/nested.c
parent2afc1fbbdab2aee831561f09f859989dcd5ed648 (diff)
parent5656374b168c98377b6feee8d7500993eebda230 (diff)
downloadlinux-9352e7470a1b4edd2fa9d235420ecc7bc3971bdc.tar.gz
linux-9352e7470a1b4edd2fa9d235420ecc7bc3971bdc.tar.bz2
linux-9352e7470a1b4edd2fa9d235420ecc7bc3971bdc.zip
Merge remote-tracking branch 'kvm/queue' into HEAD
x86 Xen-for-KVM: * Allow the Xen runstate information to cross a page boundary * Allow XEN_RUNSTATE_UPDATE flag behaviour to be configured * add support for 32-bit guests in SCHEDOP_poll x86 fixes: * One-off fixes for various emulation flows (SGX, VMXON, NRIPS=0). * Reinstate IBPB on emulated VM-Exit that was incorrectly dropped a few years back when eliminating unnecessary barriers when switching between vmcs01 and vmcs02. * Clean up the MSR filter docs. * Clean up vmread_error_trampoline() to make it more obvious that params must be passed on the stack, even for x86-64. * Let userspace set all supported bits in MSR_IA32_FEAT_CTL irrespective of the current guest CPUID. * Fudge around a race with TSC refinement that results in KVM incorrectly thinking a guest needs TSC scaling when running on a CPU with a constant TSC, but no hardware-enumerated TSC frequency. * Advertise (on AMD) that the SMM_CTL MSR is not supported * Remove unnecessary exports Selftests: * Fix an inverted check in the access tracking perf test, and restore support for asserting that there aren't too many idle pages when running on bare metal. * Fix an ordering issue in the AMX test introduced by recent conversions to use kvm_cpu_has(), and harden the code to guard against similar bugs in the future. Anything that tiggers caching of KVM's supported CPUID, kvm_cpu_has() in this case, effectively hides opt-in XSAVE features if the caching occurs before the test opts in via prctl(). * Fix build errors that occur in certain setups (unsure exactly what is unique about the problematic setup) due to glibc overriding static_assert() to a variant that requires a custom message. * Introduce actual atomics for clear/set_bit() in selftests Documentation: * Remove deleted ioctls from documentation * Various fixes
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
-rw-r--r--arch/x86/kvm/vmx/nested.c64
1 files changed, 47 insertions, 17 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b28be793de29..b6f4411b613e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2588,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
nested_ept_init_mmu_context(vcpu);
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
- * bits which we consider mandatory enabled.
- * The CR0_READ_SHADOW is what L2 should have expected to read given
- * the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
- * have more bits than L1 expected.
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
@@ -4798,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ /*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect
+ * branch predictors when transitioning from L2 to L1, as L1 expects
+ * hardware (KVM in this case) to provide separate predictor modes.
+ * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+ * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+ * separate modes for L2 vs L1.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ indirect_branch_prediction_barrier();
+
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -5131,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
- * that have higher priority than VM-Exit (see Intel SDM's pseudocode
- * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
- * running the guest, i.e. KVM needs to check the _guest_ values.
+ * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
*
- * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
- * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
- * Mode, but KVM will never take the guest out of those modes.
+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
- !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
/*
- * CPL=0 and all other checks that are lower priority than VM-Exit must
- * be checked manually.
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+ * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
*/
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
@@ -5158,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon)
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);