From d56f5136b01020155b6b0a29f69d924687529bee Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 4 Jun 2020 15:16:52 +0200
Subject: KVM: let kvm_destroy_vm_debugfs clean up vCPU debugfs directories

After commit 63d0434 ("KVM: x86: move kvm_create_vcpu_debugfs after
last failure point") we are creating the pre-vCPU debugfs files
after the creation of the vCPU file descriptor.  This makes it
possible for userspace to reach kvm_vcpu_release before
kvm_create_vcpu_debugfs has finished.  The vcpu->debugfs_dentry
then does not have any associated inode anymore, and this causes
a NULL-pointer dereference in debugfs_create_file.

The solution is simply to avoid removing the files; they are
cleaned up when the VM file descriptor is closed (and that must be
after KVM_CREATE_VCPU returns).  We can stop storing the dentry
in struct kvm_vcpu too, because it is not needed anywhere after
kvm_create_vcpu_debugfs returns.

Reported-by: syzbot+705f4401d5a93a59b87d@syzkaller.appspotmail.com
Fixes: 63d04348371b ("KVM: x86: move kvm_create_vcpu_debugfs after last failure point")
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7fa1e38e1659..3577eb84eac0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2973,7 +2973,6 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 
-	debugfs_remove_recursive(vcpu->debugfs_dentry);
 	kvm_put_kvm(vcpu->kvm);
 	return 0;
 }
@@ -3000,16 +2999,17 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+	struct dentry *debugfs_dentry;
 	char dir_name[ITOA_MAX_LEN * 2];
 
 	if (!debugfs_initialized())
 		return;
 
 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
-	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
-						  vcpu->kvm->debugfs_dentry);
+	debugfs_dentry = debugfs_create_dir(dir_name,
+					    vcpu->kvm->debugfs_dentry);
 
-	kvm_arch_create_vcpu_debugfs(vcpu);
+	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
 #endif
 }
 
-- 
cgit v1.2.3


From 7ec28e264f2e52089c14c6f8eba1ce7b6501e59b Mon Sep 17 00:00:00 2001
From: Denis Efremov <efremov@linux.com>
Date: Wed, 3 Jun 2020 13:11:31 +0300
Subject: KVM: Use vmemdup_user()

Replace opencoded alloc and copy with vmemdup_user().

Signed-off-by: Denis Efremov <efremov@linux.com>
Message-Id: <20200603101131.2107303-1-efremov@linux.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3577eb84eac0..4db151f6101e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3746,21 +3746,18 @@ static long kvm_vm_ioctl(struct file *filp,
 		if (routing.flags)
 			goto out;
 		if (routing.nr) {
-			r = -ENOMEM;
-			entries = vmalloc(array_size(sizeof(*entries),
-						     routing.nr));
-			if (!entries)
-				goto out;
-			r = -EFAULT;
 			urouting = argp;
-			if (copy_from_user(entries, urouting->entries,
-					   routing.nr * sizeof(*entries)))
-				goto out_free_irq_routing;
+			entries = vmemdup_user(urouting->entries,
+					       array_size(sizeof(*entries),
+							  routing.nr));
+			if (IS_ERR(entries)) {
+				r = PTR_ERR(entries);
+				goto out;
+			}
 		}
 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
 					routing.flags);
-out_free_irq_routing:
-		vfree(entries);
+		kvfree(entries);
 		break;
 	}
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
-- 
cgit v1.2.3


From e649b3f0188f8fd34dd0dde8d43fd3312b902fb2 Mon Sep 17 00:00:00 2001
From: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
Date: Sat, 6 Jun 2020 13:26:27 +0900
Subject: KVM: x86: Fix APIC page invalidation race

Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried
to fix inappropriate APIC page invalidation by re-introducing arch
specific kvm_arch_mmu_notifier_invalidate_range() and calling it from
kvm_mmu_notifier_invalidate_range_start. However, the patch left a
possible race where the VMCS APIC address cache is updated *before*
it is unmapped:

  (Invalidator) kvm_mmu_notifier_invalidate_range_start()
  (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD)
  (KVM VCPU) vcpu_enter_guest()
  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
  (Invalidator) actually unmap page

Because of the above race, there can be a mismatch between the
host physical address stored in the APIC_ACCESS_PAGE VMCS field and
the host physical address stored in the EPT entry for the APIC GPA
(0xfee0000).  When this happens, the processor will not trap APIC
accesses, and will instead show the raw contents of the APIC-access page.
Because Windows OS periodically checks for unexpected modifications to
the LAPIC register, this will show up as a BSOD crash with BugCheck
CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in
https://bugzilla.redhat.com/show_bug.cgi?id=1751017.

The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range()
cannot guarantee that no additional references are taken to the pages in
the range before kvm_mmu_notifier_invalidate_range_end().  Fortunately,
this case is supported by the MMU notifier API, as documented in
include/linux/mmu_notifier.h:

	 * If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().

The fix therefore is to reload the APIC-access page field in the VMCS
from kvm_mmu_notifier_invalidate_range() instead of ..._range_start().

Cc: stable@vger.kernel.org
Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation")
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=197951
Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
Message-Id: <20200606042627.61070-1-eiichi.tsukata@nutanix.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4db151f6101e..7b6013f2ba19 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
-__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end, bool blockable)
+__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+						   unsigned long start, unsigned long end)
 {
-	return 0;
 }
 
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
@@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 	return container_of(mn, struct kvm, mmu_notifier);
 }
 
+static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long start, unsigned long end)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 					struct mm_struct *mm,
 					unsigned long address,
@@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int need_tlb_flush = 0, idx;
-	int ret;
 
 	idx = srcu_read_lock(&kvm->srcu);
 	spin_lock(&kvm->mmu_lock);
@@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		kvm_flush_remote_tlbs(kvm);
 
 	spin_unlock(&kvm->mmu_lock);
-
-	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
-					range->end,
-					mmu_notifier_range_blockable(range));
-
 	srcu_read_unlock(&kvm->srcu, idx);
 
-	return ret;
+	return 0;
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
-- 
cgit v1.2.3


From 7863e346e1089b40cac1c7d9098314c405e2e1e3 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 10 Jun 2020 19:55:31 +0200
Subject: KVM: async_pf: Cleanup kvm_setup_async_pf()

schedule_work() returns 'false' only when the work is already on the queue
and this can't happen as kvm_setup_async_pf() always allocates a new one.
Also, to avoid potential race, it makes sense to to schedule_work() at the
very end after we've added it to the queue.

While on it, do some minor cleanup. gfn_to_pfn_async() mentioned in a
comment does not currently exist and, moreover, we can check
kvm_is_error_hva() at the very beginning, before we try to allocate work so
'retry_sync' label can go away completely.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200610175532.779793-1-vkuznets@redhat.com>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/async_pf.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index f1e07fae84e9..ba080088da76 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -164,7 +164,9 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
 		return 0;
 
-	/* setup delayed work */
+	/* Arch specific code should not do async PF in this case */
+	if (unlikely(kvm_is_error_hva(hva)))
+		return 0;
 
 	/*
 	 * do alloc nowait since if we are going to sleep anyway we
@@ -183,24 +185,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	mmget(work->mm);
 	kvm_get_kvm(work->vcpu->kvm);
 
-	/* this can't really happen otherwise gfn_to_pfn_async
-	   would succeed */
-	if (unlikely(kvm_is_error_hva(work->addr)))
-		goto retry_sync;
-
 	INIT_WORK(&work->work, async_pf_execute);
-	if (!schedule_work(&work->work))
-		goto retry_sync;
 
 	list_add_tail(&work->queue, &vcpu->async_pf.queue);
 	vcpu->async_pf.queued++;
 	kvm_arch_async_page_not_present(vcpu, work);
+
+	schedule_work(&work->work);
+
 	return 1;
-retry_sync:
-	kvm_put_kvm(work->vcpu->kvm);
-	mmput(work->mm);
-	kmem_cache_free(async_pf_cache, work);
-	return 0;
 }
 
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 2a18b7e7cd8882f626316c340c6f2fca49b5fa12 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 10 Jun 2020 19:55:32 +0200
Subject: KVM: async_pf: Inject 'page ready' event only if 'page not present'
 was previously injected

'Page not present' event may or may not get injected depending on
guest's state. If the event wasn't injected, there is no need to
inject the corresponding 'page ready' event as the guest may get
confused. E.g. Linux thinks that the corresponding 'page not present'
event wasn't delivered *yet* and allocates a 'dummy entry' for it.
This entry is never freed.

Note, 'wakeup all' events have no corresponding 'page not present'
event and always get injected.

s390 seems to always be able to inject 'page not present', the
change is effectively a nop.

Suggested-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Message-Id: <20200610175532.779793-2-vkuznets@redhat.com>
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=208081
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/async_pf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index ba080088da76..a36828fbf40a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -189,7 +189,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
 	list_add_tail(&work->queue, &vcpu->async_pf.queue);
 	vcpu->async_pf.queued++;
-	kvm_arch_async_page_not_present(vcpu, work);
+	work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work);
 
 	schedule_work(&work->work);
 
-- 
cgit v1.2.3