From 345b0fd6fe5f66dfe841bad0b39dd11a5672df68 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:20 +0000
Subject: KVM: Drop unused @gpa param from gfn=>pfn cache's __release_gpc()
 helper

Drop the @pga param from __release_gpc() and rename the helper to make it
more obvious that the cache itself is not being released.  The helper
will be reused by a future commit to release a pfn+khva combination that
is _never_ associated with the cache, at which point the current name
would go from slightly misleading to blatantly wrong.

No functional change intended.

Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/pfncache.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index dd84676615f1..e05a6a1b8eff 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -95,7 +95,7 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
 
-static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva, gpa_t gpa)
+static void gpc_release_pfn_and_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
 {
 	/* Unmap the old page if it was mapped before, and release it */
 	if (!is_error_noslot_pfn(pfn)) {
@@ -146,7 +146,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	unsigned long page_offset = gpa & ~PAGE_MASK;
 	kvm_pfn_t old_pfn, new_pfn;
 	unsigned long old_uhva;
-	gpa_t old_gpa;
 	void *old_khva;
 	bool old_valid;
 	int ret = 0;
@@ -160,7 +159,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 
 	write_lock_irq(&gpc->lock);
 
-	old_gpa = gpc->gpa;
 	old_pfn = gpc->pfn;
 	old_khva = gpc->khva - offset_in_page(gpc->khva);
 	old_uhva = gpc->uhva;
@@ -244,7 +242,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  out:
 	write_unlock_irq(&gpc->lock);
 
-	__release_gpc(kvm, old_pfn, old_khva, old_gpa);
+	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
 
 	return ret;
 }
@@ -254,14 +252,12 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 {
 	void *old_khva;
 	kvm_pfn_t old_pfn;
-	gpa_t old_gpa;
 
 	write_lock_irq(&gpc->lock);
 
 	gpc->valid = false;
 
 	old_khva = gpc->khva - offset_in_page(gpc->khva);
-	old_gpa = gpc->gpa;
 	old_pfn = gpc->pfn;
 
 	/*
@@ -273,7 +269,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 
 	write_unlock_irq(&gpc->lock);
 
-	__release_gpc(kvm, old_pfn, old_khva, old_gpa);
+	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
 
-- 
cgit v1.2.3


From 3dddf65b4f4c451c345d34ae85bdf1791a746e49 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:21 +0000
Subject: KVM: Put the extra pfn reference when reusing a pfn in the gpc cache

Put the struct page reference to pfn acquired by hva_to_pfn() when the
old and new pfns for a gfn=>pfn cache match.  The cache already has a
reference via the old/current pfn, and will only put one reference when
the cache is done with the pfn.

Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-5-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/pfncache.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index e05a6a1b8eff..40cbe90d52e0 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -206,6 +206,14 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 
 		if (gpc->usage & KVM_HOST_USES_PFN) {
 			if (new_pfn == old_pfn) {
+				/*
+				 * Reuse the existing pfn and khva, but put the
+				 * reference acquired hva_to_pfn_retry(); the
+				 * cache still holds a reference to the pfn
+				 * from the previous refresh.
+				 */
+				gpc_release_pfn_and_khva(kvm, new_pfn, NULL);
+
 				new_khva = old_khva;
 				old_pfn = KVM_PFN_ERR_FAULT;
 				old_khva = NULL;
-- 
cgit v1.2.3


From 3ba2c95ea180740b16281fa43a3ee5f47279c0ed Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:22 +0000
Subject: KVM: Do not incorporate page offset into gfn=>pfn cache user address

Don't adjust the userspace address in the gfn=>pfn cache by the page
offset from the gpa.  KVM should never use the user address directly, and
all KVM operations that translate a user address to something else
require the user address to be page aligned.  Ignoring the offset will
allow the cache to reuse a gfn=>hva translation in the unlikely event
that the page offset of the gpa changes, but the gfn does not.  And more
importantly, not having to (un)adjust the user address will simplify a
future bug fix.

Cc: stable@vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-6-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/pfncache.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 40cbe90d52e0..05cb0bcbf662 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -179,8 +179,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 			ret = -EFAULT;
 			goto out;
 		}
-
-		gpc->uhva += page_offset;
 	}
 
 	/*
-- 
cgit v1.2.3


From 93984f19e7bce4c18084a6ef3dacafb155b806ed Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:23 +0000
Subject: KVM: Fully serialize gfn=>pfn cache refresh via mutex

Protect gfn=>pfn cache refresh with a mutex to fully serialize refreshes.
The refresh logic doesn't protect against

- concurrent unmaps, or refreshes with different GPAs (which may or may not
  happen in practice, for example if a cache is only used under vcpu->mutex;
  but it's allowed in the code)

- a false negative on the memslot generation.  If the first refresh sees
  a stale memslot generation, it will refresh the hva and generation before
  moving on to the hva=>pfn translation.  If it then drops gpc->lock, a
  different user of the cache can come along, acquire gpc->lock, see that
  the memslot generation is fresh, and skip the hva=>pfn update due to the
  userspace address also matching (because it too was updated).

The refresh path can already sleep during hva=>pfn resolution, so wrap
the refresh with a mutex to ensure that any given refresh runs to
completion before other callers can start their refresh.

Cc: stable@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-7-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/pfncache.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 05cb0bcbf662..f610d3945b69 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -157,6 +157,13 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	if (page_offset + len > PAGE_SIZE)
 		return -EINVAL;
 
+	/*
+	 * If another task is refreshing the cache, wait for it to complete.
+	 * There is no guarantee that concurrent refreshes will see the same
+	 * gpa, memslots generation, etc..., so they must be fully serialized.
+	 */
+	mutex_lock(&gpc->refresh_lock);
+
 	write_lock_irq(&gpc->lock);
 
 	old_pfn = gpc->pfn;
@@ -248,6 +255,8 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
  out:
 	write_unlock_irq(&gpc->lock);
 
+	mutex_unlock(&gpc->refresh_lock);
+
 	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
 
 	return ret;
@@ -259,6 +268,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	void *old_khva;
 	kvm_pfn_t old_pfn;
 
+	mutex_lock(&gpc->refresh_lock);
 	write_lock_irq(&gpc->lock);
 
 	gpc->valid = false;
@@ -274,6 +284,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	gpc->pfn = KVM_PFN_ERR_FAULT;
 
 	write_unlock_irq(&gpc->lock);
+	mutex_unlock(&gpc->refresh_lock);
 
 	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
 }
@@ -288,6 +299,7 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 
 	if (!gpc->active) {
 		rwlock_init(&gpc->lock);
+		mutex_init(&gpc->refresh_lock);
 
 		gpc->khva = NULL;
 		gpc->pfn = KVM_PFN_ERR_FAULT;
-- 
cgit v1.2.3


From 58cd407ca4c6278cf9f9d09a2e663bf645b0c982 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:24 +0000
Subject: KVM: Fix multiple races in gfn=>pfn cache refresh

Rework the gfn=>pfn cache (gpc) refresh logic to address multiple races
between the cache itself, and between the cache and mmu_notifier events.

The existing refresh code attempts to guard against races with the
mmu_notifier by speculatively marking the cache valid, and then marking
it invalid if a mmu_notifier invalidation occurs.  That handles the case
where an invalidation occurs between dropping and re-acquiring gpc->lock,
but it doesn't handle the scenario where the cache is refreshed after the
cache was invalidated by the notifier, but before the notifier elevates
mmu_notifier_count.  The gpc refresh can't use the "retry" helper as its
invalidation occurs _before_ mmu_notifier_count is elevated and before
mmu_notifier_range_start is set/updated.

  CPU0                                    CPU1
  ----                                    ----

  gfn_to_pfn_cache_invalidate_start()
  |
  -> gpc->valid = false;
                                          kvm_gfn_to_pfn_cache_refresh()
                                          |
                                          |-> gpc->valid = true;

                                          hva_to_pfn_retry()
                                          |
                                          -> acquire kvm->mmu_lock
                                             kvm->mmu_notifier_count == 0
                                             mmu_seq == kvm->mmu_notifier_seq
                                             drop kvm->mmu_lock
                                             return pfn 'X'
  acquire kvm->mmu_lock
  kvm_inc_notifier_count()
  drop kvm->mmu_lock()
  kernel frees pfn 'X'
                                          kvm_gfn_to_pfn_cache_check()
                                          |
                                          |-> gpc->valid == true

                                          caller accesses freed pfn 'X'

Key off of mn_active_invalidate_count to detect that a pfncache refresh
needs to wait for an in-progress mmu_notifier invalidation.  While
mn_active_invalidate_count is not guaranteed to be stable, it is
guaranteed to be elevated prior to an invalidation acquiring gpc->lock,
so either the refresh will see an active invalidation and wait, or the
invalidation will run after the refresh completes.

Speculatively marking the cache valid is itself flawed, as a concurrent
kvm_gfn_to_pfn_cache_check() would see a valid cache with stale pfn/khva
values.  The KVM Xen use case explicitly allows/wants multiple users;
even though the caches are allocated per vCPU, __kvm_xen_has_interrupt()
can read a different vCPU (or vCPUs).  Address this race by invalidating
the cache prior to dropping gpc->lock (this is made possible by fixing
the above mmu_notifier race).

Complicating all of this is the fact that both the hva=>pfn resolution
and mapping of the kernel address can sleep, i.e. must be done outside
of gpc->lock.

Fix the above races in one fell swoop, trying to fix each individual race
is largely pointless and essentially impossible to test, e.g. closing one
hole just shifts the focus to the other hole.

Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support")
Cc: stable@vger.kernel.org
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-8-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c |   9 +++
 virt/kvm/pfncache.c | 193 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 131 insertions(+), 71 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 342043b30125..a65a2369f788 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -724,6 +724,15 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mn_active_invalidate_count++;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
+	/*
+	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
+	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
+	 * each cache's lock.  There are relatively few caches in existence at
+	 * any given time, and the caches themselves can check for hva overlap,
+	 * i.e. don't need to rely on memslot overlap checks for performance.
+	 * Because this runs without holding mmu_lock, the pfn caches must use
+	 * mn_active_invalidate_count (see above) instead of mmu_notifier_count.
+	 */
 	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 					  hva_range.may_block);
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f610d3945b69..b0b678367376 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -112,31 +112,122 @@ static void gpc_release_pfn_and_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
 	}
 }
 
-static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, unsigned long uhva)
+static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
 {
+	/*
+	 * mn_active_invalidate_count acts for all intents and purposes
+	 * like mmu_notifier_count here; but the latter cannot be used
+	 * here because the invalidation of caches in the mmu_notifier
+	 * event occurs _before_ mmu_notifier_count is elevated.
+	 *
+	 * Note, it does not matter that mn_active_invalidate_count
+	 * is not protected by gpc->lock.  It is guaranteed to
+	 * be elevated before the mmu_notifier acquires gpc->lock, and
+	 * isn't dropped until after mmu_notifier_seq is updated.
+	 */
+	if (kvm->mn_active_invalidate_count)
+		return true;
+
+	/*
+	 * Ensure mn_active_invalidate_count is read before
+	 * mmu_notifier_seq.  This pairs with the smp_wmb() in
+	 * mmu_notifier_invalidate_range_end() to guarantee either the
+	 * old (non-zero) value of mn_active_invalidate_count or the
+	 * new (incremented) value of mmu_notifier_seq is observed.
+	 */
+	smp_rmb();
+	return kvm->mmu_notifier_seq != mmu_seq;
+}
+
+static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+{
+	/* Note, the new page offset may be different than the old! */
+	void *old_khva = gpc->khva - offset_in_page(gpc->khva);
+	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
+	void *new_khva = NULL;
 	unsigned long mmu_seq;
-	kvm_pfn_t new_pfn;
-	int retry;
+
+	lockdep_assert_held(&gpc->refresh_lock);
+
+	lockdep_assert_held_write(&gpc->lock);
+
+	/*
+	 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
+	 * assets have already been updated and so a concurrent check() from a
+	 * different task may not fail the gpa/uhva/generation checks.
+	 */
+	gpc->valid = false;
 
 	do {
 		mmu_seq = kvm->mmu_notifier_seq;
 		smp_rmb();
 
+		write_unlock_irq(&gpc->lock);
+
+		/*
+		 * If the previous iteration "failed" due to an mmu_notifier
+		 * event, release the pfn and unmap the kernel virtual address
+		 * from the previous attempt.  Unmapping might sleep, so this
+		 * needs to be done after dropping the lock.  Opportunistically
+		 * check for resched while the lock isn't held.
+		 */
+		if (new_pfn != KVM_PFN_ERR_FAULT) {
+			/*
+			 * Keep the mapping if the previous iteration reused
+			 * the existing mapping and didn't create a new one.
+			 */
+			if (new_khva == old_khva)
+				new_khva = NULL;
+
+			gpc_release_pfn_and_khva(kvm, new_pfn, new_khva);
+
+			cond_resched();
+		}
+
 		/* We always request a writeable mapping */
-		new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
+		new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL);
 		if (is_error_noslot_pfn(new_pfn))
-			break;
+			goto out_error;
+
+		/*
+		 * Obtain a new kernel mapping if KVM itself will access the
+		 * pfn.  Note, kmap() and memremap() can both sleep, so this
+		 * too must be done outside of gpc->lock!
+		 */
+		if (gpc->usage & KVM_HOST_USES_PFN) {
+			if (new_pfn == gpc->pfn) {
+				new_khva = old_khva;
+			} else if (pfn_valid(new_pfn)) {
+				new_khva = kmap(pfn_to_page(new_pfn));
+#ifdef CONFIG_HAS_IOMEM
+			} else {
+				new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
+#endif
+			}
+			if (!new_khva) {
+				kvm_release_pfn_clean(new_pfn);
+				goto out_error;
+			}
+		}
+
+		write_lock_irq(&gpc->lock);
 
-		KVM_MMU_READ_LOCK(kvm);
-		retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva);
-		KVM_MMU_READ_UNLOCK(kvm);
-		if (!retry)
-			break;
+		/*
+		 * Other tasks must wait for _this_ refresh to complete before
+		 * attempting to refresh.
+		 */
+		WARN_ON_ONCE(gpc->valid);
+	} while (mmu_notifier_retry_cache(kvm, mmu_seq));
 
-		cond_resched();
-	} while (1);
+	gpc->valid = true;
+	gpc->pfn = new_pfn;
+	gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK);
+	return 0;
+
+out_error:
+	write_lock_irq(&gpc->lock);
 
-	return new_pfn;
+	return -EFAULT;
 }
 
 int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
@@ -147,7 +238,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	kvm_pfn_t old_pfn, new_pfn;
 	unsigned long old_uhva;
 	void *old_khva;
-	bool old_valid;
 	int ret = 0;
 
 	/*
@@ -169,7 +259,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	old_pfn = gpc->pfn;
 	old_khva = gpc->khva - offset_in_page(gpc->khva);
 	old_uhva = gpc->uhva;
-	old_valid = gpc->valid;
 
 	/* If the userspace HVA is invalid, refresh that first */
 	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
@@ -182,7 +271,6 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 		gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
 
 		if (kvm_is_error_hva(gpc->uhva)) {
-			gpc->pfn = KVM_PFN_ERR_FAULT;
 			ret = -EFAULT;
 			goto out;
 		}
@@ -192,60 +280,8 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	 * If the userspace HVA changed or the PFN was already invalid,
 	 * drop the lock and do the HVA to PFN lookup again.
 	 */
-	if (!old_valid || old_uhva != gpc->uhva) {
-		unsigned long uhva = gpc->uhva;
-		void *new_khva = NULL;
-
-		/* Placeholders for "hva is valid but not yet mapped" */
-		gpc->pfn = KVM_PFN_ERR_FAULT;
-		gpc->khva = NULL;
-		gpc->valid = true;
-
-		write_unlock_irq(&gpc->lock);
-
-		new_pfn = hva_to_pfn_retry(kvm, uhva);
-		if (is_error_noslot_pfn(new_pfn)) {
-			ret = -EFAULT;
-			goto map_done;
-		}
-
-		if (gpc->usage & KVM_HOST_USES_PFN) {
-			if (new_pfn == old_pfn) {
-				/*
-				 * Reuse the existing pfn and khva, but put the
-				 * reference acquired hva_to_pfn_retry(); the
-				 * cache still holds a reference to the pfn
-				 * from the previous refresh.
-				 */
-				gpc_release_pfn_and_khva(kvm, new_pfn, NULL);
-
-				new_khva = old_khva;
-				old_pfn = KVM_PFN_ERR_FAULT;
-				old_khva = NULL;
-			} else if (pfn_valid(new_pfn)) {
-				new_khva = kmap(pfn_to_page(new_pfn));
-#ifdef CONFIG_HAS_IOMEM
-			} else {
-				new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
-#endif
-			}
-			if (new_khva)
-				new_khva += page_offset;
-			else
-				ret = -EFAULT;
-		}
-
-	map_done:
-		write_lock_irq(&gpc->lock);
-		if (ret) {
-			gpc->valid = false;
-			gpc->pfn = KVM_PFN_ERR_FAULT;
-			gpc->khva = NULL;
-		} else {
-			/* At this point, gpc->valid may already have been cleared */
-			gpc->pfn = new_pfn;
-			gpc->khva = new_khva;
-		}
+	if (!gpc->valid || old_uhva != gpc->uhva) {
+		ret = hva_to_pfn_retry(kvm, gpc);
 	} else {
 		/* If the HVA→PFN mapping was already valid, don't unmap it. */
 		old_pfn = KVM_PFN_ERR_FAULT;
@@ -253,11 +289,26 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	}
 
  out:
+	/*
+	 * Invalidate the cache and purge the pfn/khva if the refresh failed.
+	 * Some/all of the uhva, gpa, and memslot generation info may still be
+	 * valid, leave it as is.
+	 */
+	if (ret) {
+		gpc->valid = false;
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->khva = NULL;
+	}
+
+	/* Snapshot the new pfn before dropping the lock! */
+	new_pfn = gpc->pfn;
+
 	write_unlock_irq(&gpc->lock);
 
 	mutex_unlock(&gpc->refresh_lock);
 
-	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
+	if (old_pfn != new_pfn)
+		gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 85165781c5d900d97052be1d2723f6929d56768d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 21:00:25 +0000
Subject: KVM: Do not pin pages tracked by gfn=>pfn caches

Put the reference to any struct page mapped/tracked by a gfn=>pfn cache
upon inserting the pfn into its associated cache, as opposed to putting
the reference only when the cache is done using the pfn.  In other words,
don't pin pages while they're in the cache.  One of the major roles of
the gfn=>pfn cache is to play nicely with invalidation events, i.e. it
exists in large part so that KVM doesn't rely on pinning pages.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429210025.3293691-9-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/pfncache.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index b0b678367376..ab519f72f2cd 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -95,20 +95,16 @@ bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
 
-static void gpc_release_pfn_and_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
+static void gpc_unmap_khva(struct kvm *kvm, kvm_pfn_t pfn, void *khva)
 {
-	/* Unmap the old page if it was mapped before, and release it */
-	if (!is_error_noslot_pfn(pfn)) {
-		if (khva) {
-			if (pfn_valid(pfn))
-				kunmap(pfn_to_page(pfn));
+	/* Unmap the old pfn/page if it was mapped before. */
+	if (!is_error_noslot_pfn(pfn) && khva) {
+		if (pfn_valid(pfn))
+			kunmap(pfn_to_page(pfn));
 #ifdef CONFIG_HAS_IOMEM
-			else
-				memunmap(khva);
+		else
+			memunmap(khva);
 #endif
-		}
-
-		kvm_release_pfn(pfn, false);
 	}
 }
 
@@ -176,10 +172,10 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 			 * Keep the mapping if the previous iteration reused
 			 * the existing mapping and didn't create a new one.
 			 */
-			if (new_khva == old_khva)
-				new_khva = NULL;
+			if (new_khva != old_khva)
+				gpc_unmap_khva(kvm, new_pfn, new_khva);
 
-			gpc_release_pfn_and_khva(kvm, new_pfn, new_khva);
+			kvm_release_pfn_clean(new_pfn);
 
 			cond_resched();
 		}
@@ -222,6 +218,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	gpc->valid = true;
 	gpc->pfn = new_pfn;
 	gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK);
+
+	/*
+	 * Put the reference to the _new_ pfn.  The pfn is now tracked by the
+	 * cache and can be safely migrated, swapped, etc... as the cache will
+	 * invalidate any mappings in response to relevant mmu_notifier events.
+	 */
+	kvm_release_pfn_clean(new_pfn);
+
 	return 0;
 
 out_error:
@@ -308,7 +312,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	mutex_unlock(&gpc->refresh_lock);
 
 	if (old_pfn != new_pfn)
-		gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
+		gpc_unmap_khva(kvm, old_pfn, old_khva);
 
 	return ret;
 }
@@ -337,7 +341,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
 	write_unlock_irq(&gpc->lock);
 	mutex_unlock(&gpc->refresh_lock);
 
-	gpc_release_pfn_and_khva(kvm, old_pfn, old_khva);
+	gpc_unmap_khva(kvm, old_pfn, old_khva);
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
 
-- 
cgit v1.2.3


From 1d5e740d518e02cea46325b3d37135bf9c08982a Mon Sep 17 00:00:00 2001
From: Zeng Guang <guang.zeng@intel.com>
Date: Tue, 19 Apr 2022 23:44:09 +0800
Subject: KVM: Move kvm_arch_vcpu_precreate() under kvm->lock

kvm_arch_vcpu_precreate() targets to handle arch specific VM resource
to be prepared prior to the actual creation of vCPU. For example, x86
platform may need do per-VM allocation based on max_vcpu_ids at the
first vCPU creation. It probably leads to concurrency control on this
allocation as multiple vCPU creation could happen simultaneously. From
the architectual point of view, it's necessary to execute
kvm_arch_vcpu_precreate() under protect of kvm->lock.

Currently only arm64, x86 and s390 have non-nop implementations at the
stage of vCPU pre-creation. Remove the lock acquiring in s390's design
and make sure all architecture can run kvm_arch_vcpu_precreate() safely
under kvm->lock without recrusive lock issue.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Zeng Guang <guang.zeng@intel.com>
Message-Id: <20220419154409.11842-1-guang.zeng@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7f79abdbd68d..6dbdcbda0291 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3768,13 +3768,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		return -EINVAL;
 	}
 
+	r = kvm_arch_vcpu_precreate(kvm, id);
+	if (r) {
+		mutex_unlock(&kvm->lock);
+		return r;
+	}
+
 	kvm->created_vcpus++;
 	mutex_unlock(&kvm->lock);
 
-	r = kvm_arch_vcpu_precreate(kvm, id);
-	if (r)
-		goto vcpu_decrement;
-
 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
 	if (!vcpu) {
 		r = -ENOMEM;
-- 
cgit v1.2.3


From f24b44e48d267092dd79dba1288dc73ac414447e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Date: Sun, 5 Jun 2022 14:34:15 +0800
Subject: KVM: Rename ack_flush() to ack_kick()

Make it use the same verb as in kvm_kick_many_cpus().

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Message-Id: <20220605063417.308311-5-jiangshanlai@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a67e996cbf7f..b13acbaa6d2a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -239,7 +239,7 @@ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 	return mode == IN_GUEST_MODE;
 }
 
-static void ack_flush(void *_completed)
+static void ack_kick(void *_completed)
 {
 }
 
@@ -248,7 +248,7 @@ static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 	if (cpumask_empty(cpus))
 		return false;
 
-	smp_call_function_many(cpus, ack_flush, NULL, wait);
+	smp_call_function_many(cpus, ack_kick, NULL, wait);
 	return true;
 }
 
-- 
cgit v1.2.3


From 28b85ae06f64bf1c1adea68a2fbb31dc40cc060e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:08 +0000
Subject: KVM: Drop bogus "pfn != 0" guard from kvm_release_pfn()

Remove a check from kvm_release_pfn() to bail if the provided @pfn is
zero.  Zero is a perfectly valid pfn on most architectures, and should
not be used to indicate an error or an invalid pfn.  The bogus check was
added by commit 917248144db5 ("x86/kvm: Cache gfn to pfn translation"),
which also did the bad thing of zeroing the pfn and gfn to mark a cache
invalid.  Thankfully, that bad behavior was axed by commit 357a18ad230f
("KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache").

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b13acbaa6d2a..8f475d174e3d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2745,9 +2745,6 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
 {
-	if (pfn == 0)
-		return;
-
 	if (dirty)
 		kvm_release_pfn_dirty(pfn);
 	else
-- 
cgit v1.2.3


From a1040b0d42acf69bb4f6dbdc54c2dcd78eea1de5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:09 +0000
Subject: KVM: Don't set Accessed/Dirty bits for ZERO_PAGE

Don't set Accessed/Dirty bits for a struct page with PG_reserved set,
i.e. don't set A/D bits for the ZERO_PAGE.  The ZERO_PAGE (or pages
depending on the architecture) should obviously never be written, and
similarly there's no point in marking it accessed as the page will never
be swapped out or reclaimed.  The comment in page-flags.h is quite clear
that PG_reserved pages should be managed only by their owner, and
strictly following that mandate also simplifies KVM's logic.

Fixes: 7df003c85218 ("KVM: fix overflow of zero page refcount with ksm running")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8f475d174e3d..351cbd121cf5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2850,16 +2850,28 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
+static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
+{
+	if (!pfn_valid(pfn))
+		return false;
+
+	/*
+	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
+	 * touched (e.g. set dirty) except by its owner".
+	 */
+	return !PageReserved(pfn_to_page(pfn));
+}
+
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
+	if (kvm_is_ad_tracked_pfn(pfn))
 		SetPageDirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
+	if (kvm_is_ad_tracked_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
-- 
cgit v1.2.3


From 8e1c69149f27189cff93a0cfe9402e576d89ce29 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:10 +0000
Subject: KVM: Avoid pfn_to_page() and vice versa when releasing pages

Invert the order of KVM's page/pfn release helpers so that the "inner"
helper operates on a page instead of a pfn.  As pointed out by Linus[*],
converting between struct page and a pfn isn't necessarily cheap, and
that's not even counting the overhead of is_error_noslot_pfn() and
kvm_is_reserved_pfn().  Even if the checks were dirt cheap, there's no
reason to convert from a page to a pfn and back to a page, just to mark
the page dirty/accessed or to put a reference to the page.

Opportunistically drop a stale declaration of kvm_set_page_accessed()
from kvm_host.h (there was no implementation).

No functional change intended.

[*] https://lore.kernel.org/all/CAHk-=wifQimj2d6npq-wCi5onYPjzQg4vyO4tFcPJJZr268cRw@mail.gmail.com

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-5-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 64 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 21 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 351cbd121cf5..4732a99935f9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2820,18 +2820,40 @@ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
 
+static bool kvm_is_ad_tracked_page(struct page *page)
+{
+	/*
+	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
+	 * touched (e.g. set dirty) except by its owner".
+	 */
+	return !PageReserved(page);
+}
+
+static void kvm_set_page_dirty(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		SetPageDirty(page);
+}
+
+static void kvm_set_page_accessed(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		mark_page_accessed(page);
+}
+
 void kvm_release_page_clean(struct page *page)
 {
 	WARN_ON(is_error_page(page));
 
-	kvm_release_pfn_clean(page_to_pfn(page));
+	kvm_set_page_accessed(page);
+	put_page(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn)
 {
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		put_page(pfn_to_page(pfn));
+		kvm_release_page_clean(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
@@ -2839,40 +2861,40 @@ void kvm_release_page_dirty(struct page *page)
 {
 	WARN_ON(is_error_page(page));
 
-	kvm_release_pfn_dirty(page_to_pfn(page));
+	kvm_set_page_dirty(page);
+	kvm_release_page_clean(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
-	kvm_set_pfn_dirty(pfn);
-	kvm_release_pfn_clean(pfn);
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
+		kvm_release_page_dirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
-static bool kvm_is_ad_tracked_pfn(kvm_pfn_t pfn)
-{
-	if (!pfn_valid(pfn))
-		return false;
-
-	/*
-	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
-	 * touched (e.g. set dirty) except by its owner".
-	 */
-	return !PageReserved(pfn_to_page(pfn));
-}
-
+/*
+ * Note, checking for an error/noslot pfn is the caller's responsibility when
+ * directly marking a page dirty/accessed.  Unlike the "release" helpers, the
+ * "set" helpers are not to be used when the pfn might point at garbage.
+ */
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (kvm_is_ad_tracked_pfn(pfn))
-		SetPageDirty(pfn_to_page(pfn));
+	if (WARN_ON(is_error_noslot_pfn(pfn)))
+		return;
+
+	if (pfn_valid(pfn))
+		kvm_set_page_dirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-	if (kvm_is_ad_tracked_pfn(pfn))
-		mark_page_accessed(pfn_to_page(pfn));
+	if (WARN_ON(is_error_noslot_pfn(pfn)))
+		return;
+
+	if (pfn_valid(pfn))
+		kvm_set_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
-- 
cgit v1.2.3


From 6573a6910ce46ece35c1aa4bd38b70884553cd21 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:12 +0000
Subject: KVM: Don't WARN if kvm_pfn_to_page() encounters a "reserved" pfn

Drop a WARN_ON() if kvm_pfn_to_page() encounters a "reserved" pfn, which
in this context means a struct page that has PG_reserved but is not a/the
ZERO_PAGE and is not a ZONE_DEVICE page.  The usage, via gfn_to_page(),
in x86 is safe as gfn_to_page() is used only to retrieve a page from
KVM-controlled memslot, but the usage in PPC and s390 operates on
arbitrary gfns and thus memslots that can be backed by incompatible
memory.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-7-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4732a99935f9..78d280e119b3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2725,10 +2725,8 @@ static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	if (kvm_is_reserved_pfn(pfn)) {
-		WARN_ON(1);
+	if (kvm_is_reserved_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
-	}
 
 	return pfn_to_page(pfn);
 }
-- 
cgit v1.2.3


From b1624f99aa8fedafcabf1b92fa51ed88dde14acb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:13 +0000
Subject: KVM: Remove kvm_vcpu_gfn_to_page() and kvm_vcpu_gpa_to_page()

Drop helpers to convert a gfn/gpa to a 'struct page' in the context of a
vCPU.  KVM doesn't require that guests be backed by 'struct page' memory,
thus any use of helpers that assume 'struct page' is bound to be flawed,
as was the case for the recently removed last user in x86's nested VMX.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-8-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 78d280e119b3..7cd0e3d67f9f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2720,8 +2720,18 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
-static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
+/*
+ * Do not use this helper unless you are absolutely certain the gfn _must_ be
+ * backed by 'struct page'.  A valid example is if the backing memslot is
+ * controlled by KVM.  Note, if the returned page is valid, it's refcount has
+ * been elevated by gfn_to_pfn().
+ */
+struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
+	kvm_pfn_t pfn;
+
+	pfn = gfn_to_pfn(kvm, gfn);
+
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
@@ -2730,15 +2740,6 @@ static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
 
 	return pfn_to_page(pfn);
 }
-
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
-	kvm_pfn_t pfn;
-
-	pfn = gfn_to_pfn(kvm, gfn);
-
-	return kvm_pfn_to_page(pfn);
-}
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
@@ -2808,16 +2809,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
-struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	kvm_pfn_t pfn;
-
-	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
-
-	return kvm_pfn_to_page(pfn);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
-
 static bool kvm_is_ad_tracked_page(struct page *page)
 {
 	/*
-- 
cgit v1.2.3


From 284dc49307738d2a897fd375431f741213cd0f27 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:14 +0000
Subject: KVM: Take a 'struct page', not a pfn in kvm_is_zone_device_page()

Operate on a 'struct page' instead of a pfn when checking if a page is a
ZONE_DEVICE page, and rename the helper accordingly.  Generally speaking,
KVM doesn't actually care about ZONE_DEVICE memory, i.e. shouldn't do
anything special for ZONE_DEVICE memory.  Rather, KVM wants to treat
ZONE_DEVICE memory like regular memory, and the need to identify
ZONE_DEVICE memory only arises as an exception to PG_reserved pages. In
other words, KVM should only ever check for ZONE_DEVICE memory after KVM
has already verified that there is a struct page associated with the pfn.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-9-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7cd0e3d67f9f..aa1bcd5f140d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,7 +168,7 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
 }
 
-bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+bool kvm_is_zone_device_page(struct page *page)
 {
 	/*
 	 * The metadata used by is_zone_device_page() to determine whether or
@@ -176,10 +176,10 @@ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
 	 * page_count() is zero to help detect bad usage of this helper.
 	 */
-	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+	if (WARN_ON_ONCE(!page_count(page)))
 		return false;
 
-	return is_zone_device_page(pfn_to_page(pfn));
+	return is_zone_device_page(page);
 }
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
@@ -192,7 +192,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 	if (pfn_valid(pfn))
 		return PageReserved(pfn_to_page(pfn)) &&
 		       !is_zero_pfn(pfn) &&
-		       !kvm_is_zone_device_pfn(pfn);
+		       !kvm_is_zone_device_page(pfn_to_page(pfn));
 
 	return true;
 }
-- 
cgit v1.2.3


From b14b2690c50e02145bb867dfcde8845eb17aa8a4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:15 +0000
Subject: KVM: Rename/refactor kvm_is_reserved_pfn() to
 kvm_pfn_to_refcounted_page()

Rename and refactor kvm_is_reserved_pfn() to kvm_pfn_to_refcounted_page()
to better reflect what KVM is actually checking, and to eliminate extra
pfn_to_page() lookups.  The kvm_release_pfn_*() an kvm_try_get_pfn()
helpers in particular benefit from "refouncted" nomenclature, as it's not
all that obvious why KVM needs to get/put refcounts for some PG_reserved
pages (ZERO_PAGE and ZONE_DEVICE).

Add a comment to call out that the list of exceptions to PG_reserved is
all but guaranteed to be incomplete.  The list has mostly been compiled
by people throwing noodles at KVM and finding out they stick a little too
well, e.g. the ZERO_PAGE's refcount overflowed and ZONE_DEVICE pages
didn't get freed.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-10-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 66 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 14 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aa1bcd5f140d..04196096f9e4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -182,19 +182,36 @@ bool kvm_is_zone_device_page(struct page *page)
 	return is_zone_device_page(page);
 }
 
-bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+/*
+ * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
+ * page, NULL otherwise.  Note, the list of refcounted PG_reserved page types
+ * is likely incomplete, it has been compiled purely through people wanting to
+ * back guest with a certain type of memory and encountering issues.
+ */
+struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
 {
+	struct page *page;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+	if (!PageReserved(page))
+		return page;
+
+	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
+	if (is_zero_pfn(pfn))
+		return page;
+
 	/*
 	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 	 * perspective they are "normal" pages, albeit with slightly different
 	 * usage rules.
 	 */
-	if (pfn_valid(pfn))
-		return PageReserved(pfn_to_page(pfn)) &&
-		       !is_zero_pfn(pfn) &&
-		       !kvm_is_zone_device_page(pfn_to_page(pfn));
+	if (kvm_is_zone_device_page(page))
+		return page;
 
-	return true;
+	return NULL;
 }
 
 /*
@@ -2501,9 +2518,12 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 
 static int kvm_try_get_pfn(kvm_pfn_t pfn)
 {
-	if (kvm_is_reserved_pfn(pfn))
+	struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+	if (!page)
 		return 1;
-	return get_page_unless_zero(pfn_to_page(pfn));
+
+	return get_page_unless_zero(page);
 }
 
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
@@ -2728,6 +2748,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
  */
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
+	struct page *page;
 	kvm_pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
@@ -2735,10 +2756,11 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	if (kvm_is_reserved_pfn(pfn))
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
 		return KVM_ERR_PTR_BAD_PAGE;
 
-	return pfn_to_page(pfn);
+	return page;
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
@@ -2841,8 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		kvm_release_page_clean(pfn_to_page(pfn));
+	struct page *page;
+
+	if (is_error_noslot_pfn(pfn))
+		return;
+
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		return;
+
+	kvm_release_page_clean(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
@@ -2857,8 +2887,16 @@ EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
 
 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		kvm_release_page_dirty(pfn_to_page(pfn));
+	struct page *page;
+
+	if (is_error_noslot_pfn(pfn))
+		return;
+
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		return;
+
+	kvm_release_page_dirty(page);
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
-- 
cgit v1.2.3


From 943dfea8f166d62657057170dbe8667ec96247ca Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 29 Apr 2022 01:04:07 +0000
Subject: KVM: Do not zero initialize 'pfn' in hva_to_pfn()

Drop the unnecessary initialization of the local 'pfn' variable in
hva_to_pfn().  First and foremost, '0' is not an invalid pfn, it's a
perfectly valid pfn on most architectures.  I.e. if hva_to_pfn() were to
return an "uninitializd" pfn, it would actually be interpeted as a legal
pfn by most callers.

Second, hva_to_pfn() can't return an uninitialized pfn as hva_to_pfn()
explicitly sets pfn to an error value (or returns an error value directly)
if a helper returns failure, and all helpers set the pfn on success.

The zeroing of 'pfn' was introduced by commit 2fc843117d64 ("KVM:
reorganize hva_to_pfn"), probably to avoid "uninitialized variable"
warnings on statements that return pfn.  However, no compiler seems
to produce them, making the initialization unnecessary.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220429010416.2788472-2-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 04196096f9e4..5b8ae83e09d7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2609,7 +2609,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		     bool write_fault, bool *writable)
 {
 	struct vm_area_struct *vma;
-	kvm_pfn_t pfn = 0;
+	kvm_pfn_t pfn;
 	int npages, r;
 
 	/* we can do it either atomically or asynchronously, not both */
-- 
cgit v1.2.3


From 837f66c71207542283831d0762c5dca3db5b397a Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Wed, 22 Jun 2022 15:27:08 -0400
Subject: KVM: Allow for different capacities in kvm_mmu_memory_cache structs

Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
declaration time rather than being fixed for all declarations. This will
be used in a follow-up commit to declare an cache in x86 with a capacity
of 512+ objects without having to increase the capacity of all caches in
KVM.

This change requires each cache now specify its capacity at runtime,
since the cache struct itself no longer has a fixed capacity known at
compile time. To protect against someone accidentally defining a
kvm_mmu_memory_cache struct directly (without the extra storage), this
commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().

In order to support different capacities, this commit changes the
objects pointer array to be dynamically allocated the first time the
cache is topped-up.

While here, opportunistically clean up the stack-allocated
kvm_mmu_memory_cache structs in riscv and arm64 to use designated
initializers.

No functional change intended.

Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220516232138.1783324-22-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b8ae83e09d7..45188d11812c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -396,14 +396,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 		return (void *)__get_free_page(gfp_flags);
 }
 
-int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 {
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 	void *obj;
 
 	if (mc->nobjs >= min)
 		return 0;
-	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
-		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+
+	if (unlikely(!mc->objects)) {
+		if (WARN_ON_ONCE(!capacity))
+			return -EIO;
+
+		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+		if (!mc->objects)
+			return -ENOMEM;
+
+		mc->capacity = capacity;
+	}
+
+	/* It is illegal to request a different capacity across topups. */
+	if (WARN_ON_ONCE(mc->capacity != capacity))
+		return -EIO;
+
+	while (mc->nobjs < mc->capacity) {
+		obj = mmu_memory_cache_alloc_obj(mc, gfp);
 		if (!obj)
 			return mc->nobjs >= min ? 0 : -ENOMEM;
 		mc->objects[mc->nobjs++] = obj;
@@ -411,6 +428,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 	return 0;
 }
 
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 {
 	return mc->nobjs;
@@ -424,6 +446,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 		else
 			free_page((unsigned long)mc->objects[--mc->nobjs]);
 	}
+
+	kvfree(mc->objects);
+
+	mc->objects = NULL;
+	mc->capacity = 0;
 }
 
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-- 
cgit v1.2.3


From e36de87d34a7f2f26b2e2129c6dc18a0024663eb Mon Sep 17 00:00:00 2001
From: Vineeth Pillai <vineeth@bitbyteword.org>
Date: Mon, 23 May 2022 15:03:27 -0400
Subject: KVM: debugfs: expose pid of vcpu threads

Add a new debugfs file to expose the pid of each vcpu threads. This
is very helpful for userland tools to get the vcpu pids without
worrying about thread naming conventions of the VMM.

Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
Message-Id: <20220523190327.2658-1-vineeth@bitbyteword.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 45188d11812c..da263c370d00 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3822,9 +3822,18 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+static int vcpu_get_pid(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = pid_nr(rcu_access_pointer(vcpu->pid));
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
+
 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
-#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
 	struct dentry *debugfs_dentry;
 	char dir_name[ITOA_MAX_LEN * 2];
 
@@ -3834,10 +3843,12 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
 	debugfs_dentry = debugfs_create_dir(dir_name,
 					    vcpu->kvm->debugfs_dentry);
+	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
+			    &vcpu_get_pid_fops);
 
 	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
-#endif
 }
+#endif
 
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
-- 
cgit v1.2.3