diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 192 |
1 files changed, 118 insertions, 74 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4244c2baf57d..742d0f7d3556 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly vector_hashing = true; +module_param(vector_hashing, bool, S_IRUGO); + static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -720,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -737,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -762,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -1196,17 +1202,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { - uint32_t quotient, remainder; - - /* Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" */ - __asm__ ( "divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor) ); - return quotient; + do_shl32_div32(dividend, divisor); + return dividend; } -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, +static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier) { uint64_t scaled64; @@ -1214,8 +1214,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, uint64_t tps64; uint32_t tps32; - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; + tps64 = base_hz; + scaled64 = scaled_hz; while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; @@ -1233,8 +1233,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); + pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", + __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1293,23 +1293,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return 0; } -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) { + if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = this_tsc_khz; + vcpu->arch.virtual_tsc_khz = user_tsc_khz; /* * Compute the variation in TSC rate which is acceptable @@ -1319,11 +1319,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) */ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1562,7 +1562,7 @@ static cycle_t read_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function @@ -1716,7 +1716,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz, tgt_tsc_khz; + unsigned long flags, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1742,8 +1742,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { + tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; @@ -1778,13 +1778,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - tgt_tsc_khz = kvm_has_tsc_control ? - vcpu->virtual_tsc_khz : this_tsc_khz; - kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, + if (kvm_has_tsc_control) + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + + if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = tgt_tsc_khz; } /* With all the info we got, fill in the values */ @@ -2987,7 +2988,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && - kvm_vcpu_has_lapic(vcpu)) + lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { @@ -3000,7 +3001,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (kvm_vcpu_has_lapic(vcpu)) { + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); else @@ -3240,7 +3241,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); @@ -3258,7 +3259,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); if (IS_ERR(u.lapic)) @@ -3605,20 +3606,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int i; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); return 0; } @@ -3638,29 +3645,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) int start = 0; int i; u32 prev_legacy, cur_legacy; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; - memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, - sizeof(kvm->arch.vpit->pit_state.channels)); - kvm->arch.vpit->pit_state.flags = ps->flags; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, start && i == 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + mutex_unlock(&pit->pit_state.lock); return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) { - if (!kvm->arch.vpit) + struct kvm_pit *pit = kvm->arch.vpit; + + if (!pit) return -ENXIO; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.reinject = control->pit_reinject; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. + */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); + return 0; } @@ -4093,7 +4110,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; @@ -4113,7 +4130,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -4312,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4346,7 +4368,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_page_track_write(vcpu, gpa, val, bytes); return 1; } @@ -4604,7 +4626,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CMPXCHG_FAILED; kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - kvm_mmu_pte_write(vcpu, gpa, new, bytes); + kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6010,7 +6032,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) return; if (vcpu->arch.apicv_active) @@ -6574,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); @@ -6618,12 +6644,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * KVM_DEBUGREG_WONT_EXIT again. */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - int i; - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + kvm_update_dr0123(vcpu); + kvm_update_dr6(vcpu); + kvm_update_dr7(vcpu); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -7038,7 +7064,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (!kvm_vcpu_has_lapic(vcpu) && + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; @@ -7109,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7314,7 +7340,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } @@ -7593,6 +7619,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) } struct static_key kvm_no_apic_vcpu __read_mostly; +EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { @@ -7724,6 +7751,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_page_track_init(kvm); + kvm_mmu_init_vm(kvm); + return 0; } @@ -7850,6 +7880,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvm_mmu_uninit_vm(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -7871,6 +7902,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, free->arch.lpage_info[i - 1] = NULL; } } + + kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -7879,6 +7912,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; int level = i + 1; @@ -7893,15 +7927,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, if (i == 0) continue; - slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * - sizeof(*slot->arch.lpage_info[i - 1])); - if (!slot->arch.lpage_info[i - 1]) + linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + if (!linfo) goto out_free; + slot->arch.lpage_info[i - 1] = linfo; + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][0].write_count = 1; + linfo[0].disallow_lpage = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; + linfo[lpages - 1].disallow_lpage = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -7913,10 +7948,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i - 1][j].write_count = 1; + linfo[j].disallow_lpage = 1; } } + if (kvm_page_track_create_memslot(slot, npages)) + goto out_free; + return 0; out_free: @@ -8370,6 +8408,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } +bool kvm_vector_hashing_enabled(void) +{ + return vector_hashing; +} +EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); |