From d56f5136b01020155b6b0a29f69d924687529bee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jun 2020 15:16:52 +0200 Subject: KVM: let kvm_destroy_vm_debugfs clean up vCPU debugfs directories After commit 63d0434 ("KVM: x86: move kvm_create_vcpu_debugfs after last failure point") we are creating the pre-vCPU debugfs files after the creation of the vCPU file descriptor. This makes it possible for userspace to reach kvm_vcpu_release before kvm_create_vcpu_debugfs has finished. The vcpu->debugfs_dentry then does not have any associated inode anymore, and this causes a NULL-pointer dereference in debugfs_create_file. The solution is simply to avoid removing the files; they are cleaned up when the VM file descriptor is closed (and that must be after KVM_CREATE_VCPU returns). We can stop storing the dentry in struct kvm_vcpu too, because it is not needed anywhere after kvm_create_vcpu_debugfs returns. Reported-by: syzbot+705f4401d5a93a59b87d@syzkaller.appspotmail.com Fixes: 63d04348371b ("KVM: x86: move kvm_create_vcpu_debugfs after last failure point") Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..3577eb84eac0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2973,7 +2973,6 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; - debugfs_remove_recursive(vcpu->debugfs_dentry); kvm_put_kvm(vcpu->kvm); return 0; } @@ -3000,16 +2999,17 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS + struct dentry *debugfs_dentry; char dir_name[ITOA_MAX_LEN * 2]; if (!debugfs_initialized()) return; snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); - vcpu->debugfs_dentry = debugfs_create_dir(dir_name, - vcpu->kvm->debugfs_dentry); + debugfs_dentry = debugfs_create_dir(dir_name, + vcpu->kvm->debugfs_dentry); - kvm_arch_create_vcpu_debugfs(vcpu); + kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); #endif } -- cgit v1.2.3 From 7ec28e264f2e52089c14c6f8eba1ce7b6501e59b Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 3 Jun 2020 13:11:31 +0300 Subject: KVM: Use vmemdup_user() Replace opencoded alloc and copy with vmemdup_user(). Signed-off-by: Denis Efremov Message-Id: <20200603101131.2107303-1-efremov@linux.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3577eb84eac0..4db151f6101e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3746,21 +3746,18 @@ static long kvm_vm_ioctl(struct file *filp, if (routing.flags) goto out; if (routing.nr) { - r = -ENOMEM; - entries = vmalloc(array_size(sizeof(*entries), - routing.nr)); - if (!entries) - goto out; - r = -EFAULT; urouting = argp; - if (copy_from_user(entries, urouting->entries, - routing.nr * sizeof(*entries))) - goto out_free_irq_routing; + entries = vmemdup_user(urouting->entries, + array_size(sizeof(*entries), + routing.nr)); + if (IS_ERR(entries)) { + r = PTR_ERR(entries); + goto out; + } } r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); -out_free_irq_routing: - vfree(entries); + kvfree(entries); break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ -- cgit v1.2.3 From e649b3f0188f8fd34dd0dde8d43fd3312b902fb2 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Sat, 6 Jun 2020 13:26:27 +0900 Subject: KVM: x86: Fix APIC page invalidation race Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried to fix inappropriate APIC page invalidation by re-introducing arch specific kvm_arch_mmu_notifier_invalidate_range() and calling it from kvm_mmu_notifier_invalidate_range_start. However, the patch left a possible race where the VMCS APIC address cache is updated *before* it is unmapped: (Invalidator) kvm_mmu_notifier_invalidate_range_start() (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD) (KVM VCPU) vcpu_enter_guest() (KVM VCPU) kvm_vcpu_reload_apic_access_page() (Invalidator) actually unmap page Because of the above race, there can be a mismatch between the host physical address stored in the APIC_ACCESS_PAGE VMCS field and the host physical address stored in the EPT entry for the APIC GPA (0xfee0000). When this happens, the processor will not trap APIC accesses, and will instead show the raw contents of the APIC-access page. Because Windows OS periodically checks for unexpected modifications to the LAPIC register, this will show up as a BSOD crash with BugCheck CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in https://bugzilla.redhat.com/show_bug.cgi?id=1751017. The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range() cannot guarantee that no additional references are taken to the pages in the range before kvm_mmu_notifier_invalidate_range_end(). Fortunately, this case is supported by the MMU notifier API, as documented in include/linux/mmu_notifier.h: * If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). The fix therefore is to reload the APIC-access page field in the VMCS from kvm_mmu_notifier_invalidate_range() instead of ..._range_start(). Cc: stable@vger.kernel.org Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation") Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=197951 Signed-off-by: Eiichi Tsukata Message-Id: <20200606042627.61070-1-eiichi.tsukata@nutanix.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4db151f6101e..7b6013f2ba19 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -155,10 +155,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, bool blockable) +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { - return 0; } bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) @@ -384,6 +383,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + srcu_read_unlock(&kvm->srcu, idx); +} + static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -408,7 +419,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; - int ret; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -425,14 +435,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); - - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, - range->end, - mmu_notifier_range_blockable(range)); - srcu_read_unlock(&kvm->srcu, idx); - return ret; + return 0; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -538,6 +543,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_range = kvm_mmu_notifier_invalidate_range, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From 7863e346e1089b40cac1c7d9098314c405e2e1e3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 10 Jun 2020 19:55:31 +0200 Subject: KVM: async_pf: Cleanup kvm_setup_async_pf() schedule_work() returns 'false' only when the work is already on the queue and this can't happen as kvm_setup_async_pf() always allocates a new one. Also, to avoid potential race, it makes sense to to schedule_work() at the very end after we've added it to the queue. While on it, do some minor cleanup. gfn_to_pfn_async() mentioned in a comment does not currently exist and, moreover, we can check kvm_is_error_hva() at the very beginning, before we try to allocate work so 'retry_sync' label can go away completely. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200610175532.779793-1-vkuznets@redhat.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index f1e07fae84e9..ba080088da76 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -164,7 +164,9 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) return 0; - /* setup delayed work */ + /* Arch specific code should not do async PF in this case */ + if (unlikely(kvm_is_error_hva(hva))) + return 0; /* * do alloc nowait since if we are going to sleep anyway we @@ -183,24 +185,15 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, mmget(work->mm); kvm_get_kvm(work->vcpu->kvm); - /* this can't really happen otherwise gfn_to_pfn_async - would succeed */ - if (unlikely(kvm_is_error_hva(work->addr))) - goto retry_sync; - INIT_WORK(&work->work, async_pf_execute); - if (!schedule_work(&work->work)) - goto retry_sync; list_add_tail(&work->queue, &vcpu->async_pf.queue); vcpu->async_pf.queued++; kvm_arch_async_page_not_present(vcpu, work); + + schedule_work(&work->work); + return 1; -retry_sync: - kvm_put_kvm(work->vcpu->kvm); - mmput(work->mm); - kmem_cache_free(async_pf_cache, work); - return 0; } int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 2a18b7e7cd8882f626316c340c6f2fca49b5fa12 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 10 Jun 2020 19:55:32 +0200 Subject: KVM: async_pf: Inject 'page ready' event only if 'page not present' was previously injected 'Page not present' event may or may not get injected depending on guest's state. If the event wasn't injected, there is no need to inject the corresponding 'page ready' event as the guest may get confused. E.g. Linux thinks that the corresponding 'page not present' event wasn't delivered *yet* and allocates a 'dummy entry' for it. This entry is never freed. Note, 'wakeup all' events have no corresponding 'page not present' event and always get injected. s390 seems to always be able to inject 'page not present', the change is effectively a nop. Suggested-by: Vivek Goyal Signed-off-by: Vitaly Kuznetsov Message-Id: <20200610175532.779793-2-vkuznets@redhat.com> Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=208081 Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index ba080088da76..a36828fbf40a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -189,7 +189,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, list_add_tail(&work->queue, &vcpu->async_pf.queue); vcpu->async_pf.queued++; - kvm_arch_async_page_not_present(vcpu, work); + work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work); schedule_work(&work->work); -- cgit v1.2.3