diff options
Diffstat (limited to 'arch/x86/kvm/mmu/tdp_mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 336 |
1 files changed, 128 insertions, 208 deletions
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7c25dbf32ecc..08340219c35a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -40,7 +40,17 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - /* Also waits for any queued work items. */ + /* + * Invalidate all roots, which besides the obvious, schedules all roots + * for zapping and thus puts the TDP MMU's reference to each root, i.e. + * ultimately frees all roots. + */ + kvm_tdp_mmu_invalidate_all_roots(kvm); + + /* + * Destroying a workqueue also first flushes the workqueue, i.e. no + * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). + */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); @@ -116,16 +126,6 @@ static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); } -static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) -{ - union kvm_mmu_page_role role = page->role; - role.invalid = true; - - /* No need to use cmpxchg, only the invalid bit can change. */ - role.word = xchg(&page->role.word, role.word); - return role.invalid; -} - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -134,45 +134,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!is_tdp_mmu_page(root)); - /* - * The root now has refcount=0. It is valid, but readers already - * cannot acquire a reference to it because kvm_tdp_mmu_get_root() - * rejects it. This remains true for the rest of the execution - * of this function, because readers visit valid roots only - * (except for tdp_mmu_zap_root_work(), which however - * does not acquire any reference itself). - * - * Even though there are flows that need to visit all roots for - * correctness, they all take mmu_lock for write, so they cannot yet - * run concurrently. The same is true after kvm_tdp_root_mark_invalid, - * since the root still has refcount=0. - * - * However, tdp_mmu_zap_root can yield, and writers do not expect to - * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). - * So the root temporarily gets an extra reference, going to refcount=1 - * while staying invalid. Readers still cannot acquire any reference; - * but writers are now allowed to run if tdp_mmu_zap_root yields and - * they might take an extra reference if they themselves yield. - * Therefore, when the reference is given back by the worker, - * there is no guarantee that the refcount is still 1. If not, whoever - * puts the last reference will free the page, but they will not have to - * zap the root because a root cannot go from invalid to valid. + * The TDP MMU itself holds a reference to each root until the root is + * explicitly invalidated, i.e. the final reference should be never be + * put for a valid root. */ - if (!kvm_tdp_root_mark_invalid(root)) { - refcount_set(&root->tdp_mmu_root_count, 1); - - /* - * Zapping the root in a worker is not just "nice to have"; - * it is required because kvm_tdp_mmu_invalidate_all_roots() - * skips already-invalid roots. If kvm_tdp_mmu_put_root() did - * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() - * might return with some roots not zapped yet. - */ - tdp_mmu_schedule_zap_root(kvm, root); - return; - } + KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_del_rcu(&root->link); @@ -320,7 +287,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) root = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_sp(root, NULL, 0, role); - refcount_set(&root->tdp_mmu_root_count, 1); + /* + * TDP MMU roots are kept until they are explicitly invalidated, either + * by a memslot update or by the destruction of the VM. Initialize the + * refcount to two; one reference for the vCPU, and one reference for + * the TDP MMU itself, which is held until the root is invalidated and + * is ultimately put by tdp_mmu_zap_root_work(). + */ + refcount_set(&root->tdp_mmu_root_count, 2); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); @@ -334,35 +308,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) -{ - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) - return; - - if (is_accessed_spte(old_spte) && - (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || - spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); -} - -static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) -{ - bool pfn_changed; - struct kvm_memory_slot *slot; - - if (level > PG_LEVEL_4K) - return; - - pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - - if ((!is_writable_pte(old_spte) || pfn_changed) && - is_writable_pte(new_spte)) { - slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(kvm, slot, gfn); - } -} - static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); @@ -505,7 +450,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } /** - * __handle_changed_spte - handle bookkeeping associated with an SPTE change + * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE @@ -516,12 +461,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * - * Handle bookkeeping that might result from the modification of a SPTE. - * This function must be called for all TDP SPTE modifications. + * Handle bookkeeping that might result from the modification of a SPTE. Note, + * dirty logging updates are handled in common code, not here (see make_spte() + * and fast_pf_fix_direct_spte()). */ -static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); @@ -605,17 +551,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); -} -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) -{ - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, - shared); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + if (was_leaf && is_accessed_spte(old_spte) && + (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } /* @@ -658,9 +597,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return 0; } @@ -696,7 +634,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* - * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE @@ -704,23 +642,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * @new_spte: The new value that will be set for the SPTE * @gfn: The base GFN that was (or will be) mapped by the SPTE * @level: The level _containing_ the SPTE (its parent PT's level) - * @record_acc_track: Notify the MM subsystem of changes to the accessed state - * of the page. Should be set unless handling an MMU - * notifier for access tracking. Leaving record_acc_track - * unset in that case prevents page accesses from being - * double counted. - * @record_dirty_log: Record the page as dirty in the dirty bitmap if - * appropriate for the change being made. Should be set - * unless performing certain dirty logging operations. - * Leaving record_dirty_log unset in that case prevents page - * writes from being double counted. * * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ -static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level) { lockdep_assert_held_write(&kvm->mmu_lock); @@ -735,46 +662,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); - - if (record_acc_track) - handle_changed_spte_acc_track(old_spte, new_spte, level); - if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); return old_spte; } -static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track, - bool record_dirty_log) +static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) { WARN_ON_ONCE(iter->yielded); - - iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, - iter->old_spte, new_spte, - iter->gfn, iter->level, - record_acc_track, record_dirty_log); -} - -static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); -} - -static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); -} - -static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); + iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -866,7 +764,7 @@ retry: continue; if (!shared) - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) goto retry; } @@ -923,8 +821,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1, true, true); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, + sp->gfn, sp->role.level + 1); return true; } @@ -958,7 +856,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, 0); flush = true; } @@ -1022,32 +920,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously, so a reference is taken on all roots. - * Using a separate workqueue makes it easy to ensure that the destruction is - * performed before the "fast zap" completes, without keeping a separate list - * of invalidated roots; the list is effectively the list of work items in - * the workqueue. - * - * Get a reference even if the root is already invalid, the asynchronous worker - * assumes it was gifted a reference to the root it processes. Because mmu_lock - * is held for write, it should be impossible to observe a root with zero refcount, - * i.e. the list of roots cannot be stale. + * zapping is performed asynchronously. Using a separate workqueue makes it + * easy to ensure that the destruction is performed before the "fast zap" + * completes, without keeping a separate list of invalidated roots; the list is + * effectively the list of work items in the workqueue. * - * This has essentially the same effect for the TDP MMU - * as updating mmu_valid_gen does for the shadow MMU. + * Note, the asynchronous worker is gifted the TDP MMU's reference. + * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); - list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { - if (!root->role.invalid && - !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { + /* + * mmu_lock must be held for write to ensure that a root doesn't become + * invalid while there are active readers (invalidating a root while + * there are active readers may or may not be problematic in practice, + * but it's uncharted territory and not supported). + * + * Waive the assertion if there are no users of @kvm, i.e. the VM is + * being destroyed after all references have been put, or if no vCPUs + * have been created (which means there are no roots), i.e. the VM is + * being destroyed in an error path of KVM_CREATE_VM. + */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + refcount_read(&kvm->users_count) && kvm->created_vcpus) + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * As above, mmu_lock isn't held when destroying the VM! There can't + * be other references to @kvm, i.e. nothing else can invalidate roots + * or be consuming roots, but walking the list of roots does need to be + * guarded against roots being deleted by the asynchronous zap worker. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + if (!root->role.invalid) { root->role.invalid = true; tdp_mmu_schedule_zap_root(kvm, root); } } + + rcu_read_unlock(); } /* @@ -1128,7 +1043,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, if (ret) return ret; } else { - tdp_mmu_set_spte(kvm, iter, spte); + tdp_mmu_iter_set_spte(kvm, iter, spte); } tdp_account_mmu_page(kvm, sp); @@ -1262,33 +1177,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. + * + * No need to mark the corresponding PFN as accessed as this call is coming + * from the clear_young() or clear_flush_young() notifier, which uses the + * return value to determine if the page has been accessed. */ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, struct kvm_gfn_range *range) { - u64 new_spte = 0; + u64 new_spte; /* If we have a non-accessed entry we don't need to change the pte. */ if (!is_accessed_spte(iter->old_spte)) return false; - new_spte = iter->old_spte; - - if (spte_ad_enabled(new_spte)) { - new_spte &= ~shadow_accessed_mask; + if (spte_ad_enabled(iter->old_spte)) { + iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, + iter->old_spte, + shadow_accessed_mask, + iter->level); + new_spte = iter->old_spte & ~shadow_accessed_mask; } else { /* * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + if (is_writable_pte(iter->old_spte)) + kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - new_spte = mark_spte_for_access_track(new_spte); + new_spte = mark_spte_for_access_track(iter->old_spte); + iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, + iter->old_spte, new_spte, + iter->level); } - tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - + trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, + iter->old_spte, new_spte); return true; } @@ -1324,15 +1248,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, * Note, when changing a read-only SPTE, it's not strictly necessary to * zero the SPTE before setting the new PFN, but doing so preserves the * invariant that the PFN of a present * leaf SPTE can never change. - * See __handle_changed_spte(). + * See handle_changed_spte(). */ - tdp_mmu_set_spte(kvm, iter, 0); + tdp_mmu_iter_set_spte(kvm, iter, 0); if (!pte_write(range->pte)) { new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, pte_pfn(range->pte)); - tdp_mmu_set_spte(kvm, iter, new_spte); + tdp_mmu_iter_set_spte(kvm, iter, new_spte); } return true; @@ -1349,7 +1273,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) /* * No need to handle the remote TLB flush under RCU protection, the * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). + * shadow page. See the WARN on pfn_changed in handle_changed_spte(). */ return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); } @@ -1607,8 +1531,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { + u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; struct tdp_iter iter; - u64 new_spte; bool spte_set = false; rcu_read_lock(); @@ -1621,19 +1545,13 @@ retry: if (!is_shadow_present_pte(iter.old_spte)) continue; - if (spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); - if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) + if (!(iter.old_spte & dbit)) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; spte_set = true; @@ -1675,8 +1593,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { + u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : + shadow_dirty_mask; struct tdp_iter iter; - u64 new_spte; rcu_read_lock(); @@ -1685,25 +1604,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, if (!mask) break; + MMU_WARN_ON(kvm_ad_enabled() && + spte_ad_need_write_protect(iter.old_spte)); + if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) continue; mask &= ~(1UL << (iter.gfn - gfn)); - if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + if (!(iter.old_spte & dbit)) + continue; + + iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, + iter.old_spte, dbit, + iter.level); - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, + iter.old_spte, + iter.old_spte & ~dbit); + kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); @@ -1821,7 +1741,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, if (new_spte == iter.old_spte) break; - tdp_mmu_set_spte(kvm, &iter, new_spte); + tdp_mmu_iter_set_spte(kvm, &iter, new_spte); spte_set = true; } |