Diffstat (limited to 'virt')
-rw-r--r--	virt/kvm/Kconfig    |  7
-rw-r--r--	virt/kvm/async_pf.c | 73
-rw-r--r--	virt/kvm/kvm_main.c | 45
3 files changed, 87 insertions, 38 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 184dab4ee871..29b73eedfe74 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -1,9 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # KVM common configuration items and defaults
 
-config HAVE_KVM
-	bool
-
 config KVM_COMMON
 	bool
 	select EVENTFD
@@ -55,6 +52,9 @@ config KVM_ASYNC_PF_SYNC
 config HAVE_KVM_MSI
 	bool
 
+config HAVE_KVM_READONLY_MEM
+	bool
+
 config HAVE_KVM_CPU_RELAX_INTERCEPT
 	bool
 
@@ -73,6 +73,7 @@ config KVM_COMPAT
 
 config HAVE_KVM_IRQ_BYPASS
 	bool
+	select IRQ_BYPASS_MANAGER
 
 config HAVE_KVM_VCPU_ASYNC_IOCTL
 	bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index e033c79d528e..99a63bad0306 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -46,8 +46,8 @@ static void async_pf_execute(struct work_struct *work)
 {
 	struct kvm_async_pf *apf =
 		container_of(work, struct kvm_async_pf, work);
-	struct mm_struct *mm = apf->mm;
 	struct kvm_vcpu *vcpu = apf->vcpu;
+	struct mm_struct *mm = vcpu->kvm->mm;
 	unsigned long addr = apf->addr;
 	gpa_t cr2_or_gpa = apf->cr2_or_gpa;
 	int locked = 1;
@@ -56,15 +56,24 @@ static void async_pf_execute(struct work_struct *work)
 	might_sleep();
 
 	/*
-	 * This work is run asynchronously to the task which owns
-	 * mm and might be done in another context, so we must
-	 * access remotely.
+	 * Attempt to pin the VM's host address space, and simply skip gup() if
+	 * acquiring a pin fail, i.e. if the process is exiting.  Note, KVM
+	 * holds a reference to its associated mm_struct until the very end of
+	 * kvm_destroy_vm(), i.e. the struct itself won't be freed before this
+	 * work item is fully processed.
 	 */
-	mmap_read_lock(mm);
-	get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
-	if (locked)
-		mmap_read_unlock(mm);
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+		get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		mmput(mm);
+	}
 
+	/*
+	 * Notify and kick the vCPU even if faulting in the page failed, e.g.
+	 * so that the vCPU can retry the fault synchronously.
+	 */
 	if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
 		kvm_arch_async_page_present(vcpu, apf);
 
@@ -74,20 +83,39 @@
 	apf->vcpu = NULL;
 	spin_unlock(&vcpu->async_pf.lock);
 
-	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
-		kvm_arch_async_page_present_queued(vcpu);
-
 	/*
-	 * apf may be freed by kvm_check_async_pf_completion() after
-	 * this point
+	 * The apf struct may be freed by kvm_check_async_pf_completion() as
+	 * soon as the lock is dropped.  Nullify it to prevent improper usage.
 	 */
+	apf = NULL;
+
+	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+		kvm_arch_async_page_present_queued(vcpu);
 
 	trace_kvm_async_pf_completed(addr, cr2_or_gpa);
 
 	__kvm_vcpu_wake_up(vcpu);
+}
 
-	mmput(mm);
-	kvm_put_kvm(vcpu->kvm);
+static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
+{
+	/*
+	 * The async #PF is "done", but KVM must wait for the work item itself,
+	 * i.e. async_pf_execute(), to run to completion.  If KVM is a module,
+	 * KVM must ensure *no* code owned by the KVM (the module) can be run
+	 * after the last call to module_put().  Note, flushing the work item
+	 * is always required when the item is taken off the completion queue.
+	 * E.g. even if the vCPU handles the item in the "normal" path, the VM
+	 * could be terminated before async_pf_execute() completes.
+	 *
+	 * Wake all events skip the queue and go straight done, i.e. don't
+	 * need to be flushed (but sanity check that the work wasn't queued).
+	 */
+	if (work->wakeup_all)
+		WARN_ON_ONCE(work->work.func);
+	else
+		flush_work(&work->work);
+	kmem_cache_free(async_pf_cache, work);
 }
 
 void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
@@ -112,11 +140,8 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_KVM_ASYNC_PF_SYNC
 		flush_work(&work->work);
 #else
-		if (cancel_work_sync(&work->work)) {
-			mmput(work->mm);
-			kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
+		if (cancel_work_sync(&work->work))
 			kmem_cache_free(async_pf_cache, work);
-		}
 #endif
 		spin_lock(&vcpu->async_pf.lock);
 	}
@@ -126,7 +151,10 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 			list_first_entry(&vcpu->async_pf.done,
 					 typeof(*work), link);
 		list_del(&work->link);
-		kmem_cache_free(async_pf_cache, work);
+
+		spin_unlock(&vcpu->async_pf.lock);
+		kvm_flush_and_free_async_pf_work(work);
+		spin_lock(&vcpu->async_pf.lock);
 	}
 	spin_unlock(&vcpu->async_pf.lock);
 
@@ -151,7 +179,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 
 		list_del(&work->queue);
 		vcpu->async_pf.queued--;
-		kmem_cache_free(async_pf_cache, work);
+		kvm_flush_and_free_async_pf_work(work);
 	}
 }
 
@@ -184,9 +212,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	work->cr2_or_gpa = cr2_or_gpa;
 	work->addr = hva;
 	work->arch = *arch;
-	work->mm = current->mm;
-	mmget(work->mm);
-	kvm_get_kvm(work->vcpu->kvm);
 
 	INIT_WORK(&work->work, async_pf_execute);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 10bfc88a69f7..fb49c2a60200 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -421,7 +421,7 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
 	if (WARN_ON_ONCE(!capacity))
 		return -EIO;
 
-	mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+	mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
 	if (!mc->objects)
 		return -ENOMEM;
 
@@ -890,7 +890,9 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
-	wake = (--kvm->mn_active_invalidate_count == 0);
+	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
+		--kvm->mn_active_invalidate_count;
+	wake = !kvm->mn_active_invalidate_count;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -1150,10 +1152,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
 				    &stat_fops_per_vm);
 	}
 
-	ret = kvm_arch_create_vm_debugfs(kvm);
-	if (ret)
-		goto out_err;
-
+	kvm_arch_create_vm_debugfs(kvm);
 	return 0;
 out_err:
 	kvm_destroy_vm_debugfs(kvm);
@@ -1183,9 +1182,8 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
  * a per-arch destroy interface is not needed.
  */
-int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
 {
-	return 0;
 }
 
 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
@@ -1614,8 +1612,14 @@ static int check_memory_region_flags(struct kvm *kvm,
 	if (mem->flags & KVM_MEM_GUEST_MEMFD)
 		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef __KVM_HAVE_READONLY_MEM
-	valid_flags |= KVM_MEM_READONLY;
+#ifdef CONFIG_HAVE_KVM_READONLY_MEM
+	/*
+	 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
+	 * read-only memslots have emulated MMIO, not page fault, semantics,
+	 * and KVM doesn't allow emulated MMIO for private memory.
+	 */
+	if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
+		valid_flags |= KVM_MEM_READONLY;
 #endif
 
 	if (mem->flags & ~valid_flags)
@@ -4042,6 +4046,18 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
 	return false;
 }
 
+/*
+ * By default, simply query the target vCPU's current mode when checking if a
+ * vCPU was preempted in kernel mode.  All architectures except x86 (or more
+ * specifically, except VMX) allow querying whether or not a vCPU is in kernel
+ * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
+ * directly for cross-vCPU checks is functionally correct and accurate.
+ */
+bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_in_kernel(vcpu);
+}
+
 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
 	return false;
@@ -4078,9 +4094,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
 				continue;
+
+			/*
+			 * Treat the target vCPU as being in-kernel if it has a
+			 * pending interrupt, as the vCPU trying to yield may
+			 * be spinning waiting on IPI delivery, i.e. the target
+			 * vCPU is in-kernel for the purposes of directed yield.
+			 */
 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
 			    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
-			    !kvm_arch_vcpu_in_kernel(vcpu))
+			    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
 				continue;
 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
 				continue;
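
For context on the async_pf.c hunks above: kvm_flush_and_free_async_pf_work() exists because a work item's backing memory must not be freed while its work function may still be running (and, for a module, before the last module_put()). The following is a minimal sketch of that general workqueue lifetime rule, assuming a kernel-module context; the demo_* names are hypothetical illustrations, not part of the patch or of KVM.

#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical work item; stands in for struct kvm_async_pf. */
struct demo_item {
	struct work_struct work;
	int payload;
};

static void demo_fn(struct work_struct *work)
{
	struct demo_item *item = container_of(work, struct demo_item, work);

	/* The teardown path owns freeing, so only read the payload here. */
	pr_info("demo payload = %d\n", item->payload);
}

/* Allocate and queue an item, mirroring the INIT_WORK() in kvm_setup_async_pf(). */
static struct demo_item *demo_queue(int payload)
{
	struct demo_item *item = kzalloc(sizeof(*item), GFP_KERNEL);

	if (!item)
		return NULL;
	item->payload = payload;
	INIT_WORK(&item->work, demo_fn);
	schedule_work(&item->work);
	return item;
}

/*
 * Free the item only once demo_fn() can no longer run: flush_work() waits for
 * any in-flight execution to finish, which is the same rule the patch applies
 * via kvm_flush_and_free_async_pf_work() before kmem_cache_free().
 */
static void demo_free(struct demo_item *item)
{
	flush_work(&item->work);
	kfree(item);
}

The #else branch in kvm_clear_async_pf_completion_queue() shows the complementary primitive: cancel_work_sync() returns true only when the item was still pending, so the caller may free it immediately in that case and otherwise leaves the free to the completion path.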