diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 110 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 35 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 210 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 42 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 79 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/ops.h | 93 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 321 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 237 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 2 |
16 files changed, 797 insertions, 445 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dd5985eb61b4..9c5029cf6f3f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, case 7: case 0xb: case 0xd: + case 0xf: + case 0x10: + case 0x12: case 0x14: + case 0x17: + case 0x18: + case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; @@ -360,7 +366,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -479,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); @@ -612,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=<function>, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } @@ -963,53 +974,66 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + struct kvm_cpuid_entry2 *max; + + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; - - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. + */ + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fff790a3f4ee..23ff65504d7e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "hyperv.h" +#include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> @@ -645,7 +646,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) .vector = stimer->config.apic_vector }; - return !kvm_apic_set_irq(vcpu, &irq, NULL); + if (lapic_in_kernel(vcpu)) + return !kvm_apic_set_irq(vcpu, &irq, NULL); + return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) @@ -1852,7 +1855,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; - ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + /* + * Direct Synthetic timers only make sense with in-kernel + * LAPIC + */ + if (lapic_in_kernel(vcpu)) + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; @@ -1864,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - + if (!cpu_smt_possible()) + ent->eax |= HV_X64_NO_NONARCH_CORESHARING; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8675458c2205..87b0fcc23ef8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -65,8 +65,11 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +static bool lapic_timer_advance_dynamic __read_mostly; +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -1485,26 +1488,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; + /* Do not adjust for tiny fluctuations or large random spikes. */ + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX || + abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) + return; + /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; - } + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -1524,7 +1526,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + if (lapic_timer_advance_dynamic) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } @@ -2301,14 +2303,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; + lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; - apic->lapic_timer.timer_advance_adjust_done = true; + lapic_timer_advance_dynamic = false; } - /* * APIC is created enabled. This will prevent kvm_lapic_set_base from * thinking that APIC state has changed. diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50053d2b8b7b..2aad7e226fc0 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -35,7 +35,6 @@ struct kvm_timer { s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; - bool timer_advance_adjust_done; }; struct kvm_lapic { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a10af9c87f8a..24c23c66b226 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -83,7 +83,17 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +229,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +313,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -320,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -403,8 +429,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; - page_header(__pa(sptep))->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -463,7 +487,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -1591,16 +1615,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1619,10 +1643,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1633,6 +1657,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2103,6 +2132,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2252,7 +2282,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2311,7 +2341,8 @@ static void mmu_audit_disable(void) { } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); + return sp->role.invalid || + unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2538,7 +2569,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -2623,7 +2653,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2753,7 +2783,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - if (!sp->role.invalid) + /* + * Obsolete pages cannot be used on any vCPUs, see the comment + * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also + * treats invalid shadow pages as being obsolete. + */ + if (!is_obsolete_sp(kvm, sp)) kvm_reload_remote_mmus(kvm); } @@ -2964,7 +2999,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -5383,7 +5420,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; - enum emulation_result er; bool direct = vcpu->arch.mmu->direct_map; /* With shadow page tables, fault_address contains a GVA or nGPA. */ @@ -5450,19 +5486,8 @@ emulate: return 1; } - er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_USER_EXIT: - ++vcpu->stat.mmio_exits; - /* fall through */ - case EMULATE_FAIL: - return 0; - default: - BUG(); - } + return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -5684,12 +5709,11 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) return ret; } - +#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - int ign; + int nr_zapped, batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, @@ -5702,46 +5726,39 @@ restart: break; /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. + * Skip invalid pages with a non-zero root count, zapping pages + * with a non-zero root count will never succeed, i.e. the page + * will get thrown back on active_mmu_pages and we'll get stuck + * in an infinite loop. */ - if (sp->role.invalid) + if (sp->role.invalid && sp->root_count) continue; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); + /* + * No need to flush the TLB since we're only zapping shadow + * pages with an obsolete generation number and all vCPUS have + * loaded a new root, i.e. the shadow pages being zapped cannot + * be in active use by the guest. + */ + if (batch >= BATCH_ZAP_PAGES && + cond_resched_lock(&kvm->mmu_lock)) { + batch = 0; goto restart; } - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) + if (__kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { + batch += nr_zapped; goto restart; + } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + /* + * Trigger a remote TLB flush before freeing the page tables to ensure + * KVM is not in the middle of a lockless shadow page table walk, which + * may reference the pages. + */ + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); } /* @@ -5755,13 +5772,39 @@ restart: */ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { + lockdep_assert_held(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); - kvm->arch.mmu_valid_gen++; + trace_kvm_mmu_zap_all_fast(kvm); + + /* + * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is + * held for the entire duration of zapping obsolete pages, it's + * impossible for there to be multiple invalid generations associated + * with *valid* shadow pages at any given time, i.e. there is exactly + * one valid generation and (at most) one invalid generation. + */ + kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; + + /* + * Notify all vcpus to reload its shadow page table and flush TLB. + * Then all vcpus will switch to new shadow page table with the new + * mmu_valid_gen. + * + * Note: we need to do this under the protection of mmu_lock, + * otherwise, vcpu would purge shadow page but miss tlb flush. + */ + kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) @@ -5959,7 +6002,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) +void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5968,14 +6011,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (mmio_only && !sp->mmio_cached) - continue; if (sp->role.invalid && sp->root_count) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(mmio_only); + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - } if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5984,11 +6023,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) -{ - return __kvm_mmu_zap_all(kvm, false); -} - void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -6010,7 +6044,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - __kvm_mmu_zap_all(kvm, true); + kvm_mmu_zap_all_fast(kvm); } } @@ -6041,16 +6075,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index d8001b4bca05..7ca8831c7d1a 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,16 +8,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u8, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -29,8 +31,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \ + trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.gpte_is_8_bytes ? 8 : 4, \ role.quadrant, \ @@ -280,6 +283,27 @@ TRACE_EVENT( ); TRACE_EVENT( + kvm_mmu_zap_all_fast, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(__u8, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %u used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + +TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), TP_ARGS(spte, kvm_gen, spte_gen), diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 04fe21849b6e..f8ecb6df5106 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -777,17 +777,18 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) svm->next_rip = svm->vmcb->control.next_rip; } - if (!svm->next_rip) - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); - - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); - - kvm_rip_write(vcpu, svm->next_rip); + if (!svm->next_rip) { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } else { + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + pr_err("%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); + kvm_rip_write(vcpu, svm->next_rip); + } svm_set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; + return 1; } static void svm_queue_exception(struct kvm_vcpu *vcpu) @@ -1539,6 +1540,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_XSETBV); + set_intercept(svm, INTERCEPT_RDPRU); set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { @@ -2768,17 +2770,18 @@ static int gp_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 error_code = svm->vmcb->control.exit_info_1; - int er; WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) + /* + * VMware backdoor emulation on #GP interception only handles IN{S}, + * OUT{S}, and RDPMC, none of which generate a non-zero error code. + */ + if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; + return 1; + } + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } static bool is_erratum_383(void) @@ -2876,7 +2879,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3830,6 +3833,12 @@ static int xsetbv_interception(struct vcpu_svm *svm) return 1; } +static int rdpru_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -3883,24 +3892,15 @@ static int task_switch_interception(struct vcpu_svm *svm) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) - goto fail; + if (!skip_emulated_instruction(&svm->vcpu)) + return 0; } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) - goto fail; - - return 1; - -fail: - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; + return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + has_error_code, error_code); } static int cpuid_interception(struct vcpu_svm *svm) @@ -3921,7 +3921,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3929,13 +3929,12 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); } static int rsm_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction_from_buffer(&svm->vcpu, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2); } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4724,7 +4723,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = kvm_emulate_instruction(&svm->vcpu, 0); } return ret; @@ -4791,6 +4790,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, + [SVM_EXIT_RDPRU] = rdpru_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -7099,13 +7099,6 @@ failed: return ret; } -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - /* Intel-only feature */ - return -ENODEV; -} - static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { unsigned long cr4 = kvm_read_cr4(vcpu); @@ -7311,7 +7304,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = nested_enable_evmcs, + .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d6664ee3d127..7aa69716d516 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -247,6 +247,12 @@ static inline bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static inline bool vmx_waitpkg_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + static inline bool cpu_has_vmx_tsc_scaling(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 39a24eec8884..07ebf6882a45 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -178,6 +178,8 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; vp_ap->enlighten_vmentry = 1; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a10cd351940..e76eb4f07f6c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -198,6 +198,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); @@ -866,16 +876,34 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. + * + * One of the failure modes for MSR load/store is when a list exceeds the + * virtual hardware's capacity. To maintain compatibility with hardware inasmuch + * as possible, process all valid entries before failing rather than precheck + * for a capacity violation. */ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + goto fail; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { pr_debug_ratelimited( @@ -906,8 +934,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u64 data; u32 i; struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + return -EINVAL; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { @@ -1013,17 +1045,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; } - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) { superset &= mask; @@ -2089,6 +2110,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC); @@ -2588,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, /* VM-entry exception error code */ if (CC(has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ @@ -2642,8 +2664,23 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; +#ifdef CONFIG_X86_64 + ia32e = !!(vcpu->arch.efer & EFER_LMA); +#else + ia32e = false; +#endif + + if (ia32e) { + if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || + CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + return -EINVAL; + } else { + if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || + CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || + CC((vmcs12->host_rip) >> 32)) + return -EINVAL; + } if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || @@ -2662,7 +2699,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; #endif @@ -5441,6 +5479,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_ENCLS: /* SGX is never exposed to L1 */ return false; + case EXIT_REASON_UMWAIT: + case EXIT_REASON_TPAUSE: + return nested_cpu_has2(vmcs12, + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); default: return true; } diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 2200fb698dd0..45eaedee2ac0 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -11,8 +11,13 @@ #include "vmcs.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) + +asmlinkage void vmread_error(unsigned long field, bool fault); +void vmwrite_error(unsigned long field, unsigned long value); +void vmclear_error(struct vmcs *vmcs, u64 phys_addr); +void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); +void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); +void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); static __always_inline void vmcs_check16(unsigned long field) { @@ -62,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : "r"(field)); + asm volatile("1: vmread %2, %1\n\t" + ".byte 0x3e\n\t" /* branch taken hint */ + "ja 3f\n\t" + "mov %2, %%" _ASM_ARG1 "\n\t" + "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t" + "2: call vmread_error\n\t" + "xor %k1, %k1\n\t" + "3:\n\t" + + ".pushsection .fixup, \"ax\"\n\t" + "4: mov %2, %%" _ASM_ARG1 "\n\t" + "mov $1, %%" _ASM_ARG2 "\n\t" + "jmp 2b\n\t" + ".popsection\n\t" + _ASM_EXTABLE(1b, 4b) + : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); return value; } @@ -103,21 +122,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) return __vmcs_readl(field); } -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} +#define vmx_asm1(insn, op1, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define vmx_asm2(insn, op1, op2, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1, op2 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); + vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); } static __always_inline void vmcs_write16(unsigned long field, u16 value) @@ -182,28 +219,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) static inline void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); + vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); } static inline void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); } static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) @@ -213,11 +240,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; - bool error; - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) @@ -225,11 +249,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; - bool error; - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); } static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..3e9c059099e9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4a99be1fae4e..e7970a2e8eae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -343,6 +348,48 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit void vmx_vmexit(void); +#define vmx_insn_failed(fmt...) \ +do { \ + WARN_ONCE(1, fmt); \ + pr_warn_ratelimited(fmt); \ +} while (0) + +asmlinkage void vmread_error(unsigned long field, bool fault) +{ + if (fault) + kvm_spurious_fault(); + else + vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); +} + +noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) +{ + vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + ext, vpid, gva); +} + +noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +{ + vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + ext, eptp, gpa); +} + static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* @@ -486,6 +533,31 @@ static int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } +static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightened_vmcs *evmcs; + struct hv_partition_assist_pg **p_hv_pa_pg = + &vcpu->kvm->arch.hyperv.hv_pa_pg; + /* + * Synthetic VM-Exit is not enabled in current code and so All + * evmcs in singe VM shares same assist page. + */ + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (!*p_hv_pa_pg) + return -ENOMEM; + + evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; + + evmcs->partition_assist_page = + __pa(*p_hv_pa_pg); + evmcs->hv_vm_id = (unsigned long)vcpu->kvm; + evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + + return 0; +} + #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -1472,27 +1544,32 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -/* - * Returns an int to be compatible with SVM implementation (which can fail). - * Do not use directly, use skip_emulated_instruction() instead. - */ -static int __skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); + /* + * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on + * undefined behavior: Intel's SDM doesn't mandate the VMCS field be + * set when EPT misconfig occurs. In practice, real hardware updates + * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors + * (namely Hyper-V) don't set it due to it being undefined behavior, + * i.e. we end up advancing IP with some random value. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || + to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + rip = kvm_rip_read(vcpu); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_rip_write(vcpu, rip); + } else { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; -} - -static inline void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - (void)__skip_emulated_instruction(vcpu); + return 1; } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) @@ -1527,8 +1604,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) int inc_eip = 0; if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); return; } @@ -1700,6 +1776,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + msr_info->data = vmx->msr_ia32_umwait_control; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -1873,6 +1955,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + /* The reserved bit 1 and non-32 bit [63:32] should be zero */ + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) + return 1; + + vmx->msr_ia32_umwait_control = data; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2290,6 +2382,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | @@ -4026,6 +4119,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + if (vmx_waitpkg_supported()) { + bool waitpkg_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); + + if (!waitpkg_enabled) + exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + + if (nested) { + if (waitpkg_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + } + } + vmx->secondary_exec_control = exec_control; } @@ -4160,6 +4270,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; + vmx->msr_ia32_umwait_control = 0; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; @@ -4277,8 +4389,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -4314,8 +4425,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } @@ -4442,7 +4552,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -4493,7 +4603,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; - enum emulation_result er; vect_info = vmx->idt_vectoring_info; intr_info = vmx->exit_intr_info; @@ -4510,13 +4619,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero + * error code on #GP. + */ + if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; + return 1; + } + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } /* @@ -4558,7 +4671,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) - skip_emulated_instruction(vcpu); + WARN_ON(!skip_emulated_instruction(vcpu)); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -4613,7 +4726,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -4687,7 +4800,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_cr(struct kvm_vcpu *vcpu) @@ -4903,7 +5016,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -4937,20 +5050,6 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } -static int handle_xsaves(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_xrstors(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -4970,7 +5069,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -5039,23 +5138,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && type != INTR_TYPE_EXT_INTR && type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); - - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + WARN_ON(!skip_emulated_instruction(vcpu)); /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? */ - - return 1; + return kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, + reason, has_error_code, error_code); } static int handle_ept_violation(struct kvm_vcpu *vcpu) @@ -5114,21 +5205,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - /* - * Doing kvm_skip_emulated_instruction() depends on undefined - * behavior: Intel's manual doesn't mandate - * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG - * occurs and while on real hardware it was observed to be set, - * other hypervisors (namely Hyper-V) don't set it, we end up - * advancing IP with some random value. Disable fast mmio when - * running nested and keep it for real hardware in hope that - * VM_EXIT_INSTRUCTION_LEN will always be set correctly. - */ - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) - return kvm_skip_emulated_instruction(vcpu); - else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == - EMULATE_DONE; + return kvm_skip_emulated_instruction(vcpu); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -5147,8 +5224,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; bool intr_window_requested; unsigned count = 130; @@ -5169,41 +5244,35 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = kvm_emulate_instruction(vcpu, 0); - - if (err == EMULATE_USER_EXIT) { - ++vcpu->stat.mmio_exits; - ret = 0; - goto out; - } - - if (err != EMULATE_DONE) - goto emulation_error; + if (!kvm_emulate_instruction(vcpu, 0)) + return 0; if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) - goto emulation_error; + vcpu->arch.exception.pending) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - ret = kvm_vcpu_halt(vcpu); - goto out; + return kvm_vcpu_halt(vcpu); } + /* + * Note, return 1 and not 0, vcpu_run() is responsible for + * morphing the pending signal into the proper return code. + */ if (signal_pending(current)) - goto out; + return 1; + if (need_resched()) schedule(); } -out: - return ret; - -emulation_error: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; + return 1; } static void grow_ple_window(struct kvm_vcpu *vcpu) @@ -5474,6 +5543,14 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } +static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) +{ + kvm_skip_emulated_instruction(vcpu); + WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", + vmcs_read32(VM_EXIT_REASON)); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5525,13 +5602,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_xsaves, - [EXIT_REASON_XRSTORS] = handle_xrstors, + [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, + [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, + [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, + [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -6362,6 +6441,23 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } +static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx) +{ + u32 host_umwait_control; + + if (!vmx_has_waitpkg(vmx)) + return; + + host_umwait_control = get_umwait_control_msr(); + + if (vmx->msr_ia32_umwait_control != host_umwait_control) + add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL, + vmx->msr_ia32_umwait_control, + host_umwait_control, false); + else + clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL); +} + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6456,6 +6552,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); + atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); @@ -6511,6 +6608,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); @@ -6578,6 +6678,7 @@ static struct kvm *vmx_vm_alloc(void) static void vmx_vm_free(struct kvm *kvm) { + kfree(kvm->arch.hyperv.hv_pa_pg); vfree(to_kvm_vmx(kvm)); } @@ -7706,7 +7807,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = __skip_emulated_instruction, + .skip_emulated_instruction = skip_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, @@ -7837,6 +7938,7 @@ static void vmx_exit(void) if (!vp_ap) continue; + vp_ap->nested_control.features.directhypercall = 0; vp_ap->current_nested_vmcs = 0; vp_ap->enlighten_vmentry = 0; } @@ -7876,6 +7978,11 @@ static int __init vmx_init(void) pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); static_branch_enable(&enable_evmcs); } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_direct_tlbflush + = hv_enable_direct_tlbflush; + } else { enlightened_vmcs = false; } @@ -7893,12 +8000,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 64d5a4890aa9..bee16687dc0b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,6 +14,8 @@ extern const u32 vmx_msr_index[]; extern u64 host_efer; +extern u32 get_umwait_control_msr(void); + #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 @@ -211,6 +213,7 @@ struct vcpu_vmx { #endif u64 spec_ctrl; + u32 msr_ia32_umwait_control; u32 secondary_exec_control; @@ -497,6 +500,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) +{ + return vmx->secondary_exec_control & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfd641243568..661e2bf38526 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -360,7 +360,8 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - BUG(); + if (!kvm_rebooting) + BUG(); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -884,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1145,6 +1154,30 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, }; static unsigned num_msrs_to_save; @@ -3169,7 +3202,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: - case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -3246,6 +3278,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops->get_nested_state ? kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + r = kvm_x86_ops->enable_direct_tlbflush != NULL; + break; + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops->nested_enable_evmcs != NULL; + break; default: break; } @@ -4019,6 +4057,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EFAULT; } return r; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + if (!kvm_x86_ops->enable_direct_tlbflush) + return -ENOTTY; + + return kvm_x86_ops->enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -5048,9 +5091,15 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i, j; + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_save[]"); + + perf_get_x86_pmu_capability(&x86_pmu); + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; @@ -5091,6 +5140,15 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; @@ -5389,7 +5447,6 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { int emul_type = EMULTYPE_TRAP_UD; - enum emulation_result er; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5398,15 +5455,10 @@ int handle_ud(struct kvm_vcpu *vcpu) sig, sizeof(sig), &e) == 0 && memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); - emul_type = 0; + emul_type = EMULTYPE_TRAP_UD_FORCED; } - er = kvm_emulate_instruction(vcpu, emul_type); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + return kvm_emulate_instruction(vcpu, emul_type); } EXPORT_SYMBOL_GPL(handle_ud); @@ -6228,7 +6280,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; @@ -6240,37 +6292,43 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - return EMULATE_DONE; + if (ret != X86EMUL_CONTINUE) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } else { + ctxt->eip = ctxt->_eip; + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); + } } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { - int r = EMULATE_DONE; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_VMWARE_GP) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (emulation_type & EMULTYPE_SKIP) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_USER_EXIT; + return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - return r; + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + return 1; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, @@ -6425,7 +6483,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -6434,10 +6492,10 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 0; } + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 1; } int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -6446,7 +6504,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) int r; r = kvm_x86_ops->skip_emulated_instruction(vcpu); - if (unlikely(r != EMULATE_DONE)) + if (unlikely(!r)) return 0; /* @@ -6458,8 +6516,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) * that sets the TF flag". */ if (unlikely(rflags & X86_EFLAGS_TF)) - kvm_vcpu_do_singlestep(vcpu, &r); - return r == EMULATE_DONE; + r = kvm_vcpu_do_singlestep(vcpu); + return r; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -6478,7 +6536,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; + *r = 0; return true; } } @@ -6494,7 +6552,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); - *r = EMULATE_DONE; + *r = 1; return true; } } @@ -6578,11 +6636,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_TRAP_UD) || + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; if (ctxt->have_exception) { /* * #UD should result in just EMULATION_FAILED, and trap-like @@ -6591,28 +6652,32 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || exception_type(ctxt->exception.vector) == EXCPT_TRAP); inject_emulated_exception(vcpu); - return EMULATE_DONE; + return 1; } - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; return handle_emulation_failure(vcpu, emulation_type); } } - if ((emulation_type & EMULTYPE_VMWARE) && - !is_vmware_backdoor_opcode(ctxt)) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_VMWARE_GP) && + !is_vmware_backdoor_opcode(ctxt)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + /* + * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks + * for kvm_skip_emulated_instruction(). The caller is responsible for + * updating interruptibility state and injecting single-step #DBs. + */ if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); - kvm_x86_ops->set_interrupt_shadow(vcpu, 0); - return EMULATE_DONE; + return 1; } if (retry_instruction(ctxt, cr2, emulation_type)) - return EMULATE_DONE; + return 1; /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ @@ -6628,18 +6693,18 @@ restart: r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; + return 1; if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { - r = EMULATE_DONE; + r = 1; if (inject_emulated_exception(vcpu)) return r; } else if (vcpu->arch.pio.count) { @@ -6650,16 +6715,18 @@ restart: writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_USER_EXIT; + r = 0; } else if (vcpu->mmio_needed) { + ++vcpu->stat.mmio_exits; + if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_USER_EXIT; + r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else - r = EMULATE_DONE; + r = 1; if (writeback) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -6668,8 +6735,8 @@ restart: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); + if (r && ctxt->tf) + r = kvm_vcpu_do_singlestep(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -8263,12 +8330,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return r; } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8636,23 +8702,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; + if (ret) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8671,7 +8736,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -9361,6 +9426,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9690,8 +9756,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Scan sptes if dirty logging has been stopped, dropping those * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() */ - if ((change != KVM_MR_DELETE) && + if (change == KVM_MR_FLAGS_ONLY && (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b5274e2a53cf..dbf7442a822b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -261,7 +261,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) } void kvm_set_pending_timer(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); |