diff options
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/Kconfig | 11 | ||||
-rw-r--r-- | virt/kvm/async_pf.c | 13 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 13 | ||||
-rw-r--r-- | virt/kvm/guest_memfd.c | 275 | ||||
-rw-r--r-- | virt/kvm/irqchip.c | 24 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 160 | ||||
-rw-r--r-- | virt/kvm/pfncache.c | 3 |
7 files changed, 387 insertions, 112 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 29b73eedfe74..fd6a3010afa8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_PRE_FAULT_MEMORY + bool + config KVM_COMPAT def_bool y depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) @@ -109,3 +112,11 @@ config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_PRIVATE_MEM bool + +config HAVE_KVM_ARCH_GMEM_PREPARE + bool + depends on KVM_PRIVATE_MEM + +config HAVE_KVM_ARCH_GMEM_INVALIDATE + bool + depends on KVM_PRIVATE_MEM diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 99a63bad0306..0ee4816b079a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,7 +80,6 @@ static void async_pf_execute(struct work_struct *work) spin_lock(&vcpu->async_pf.lock); first = list_empty(&vcpu->async_pf.done); list_add_tail(&apf->link, &vcpu->async_pf.done); - apf->vcpu = NULL; spin_unlock(&vcpu->async_pf.lock); /* @@ -120,8 +119,6 @@ static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work) void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) { - spin_lock(&vcpu->async_pf.lock); - /* cancel outstanding work queue item */ while (!list_empty(&vcpu->async_pf.queue)) { struct kvm_async_pf *work = @@ -129,23 +126,15 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) typeof(*work), queue); list_del(&work->queue); - /* - * We know it's present in vcpu->async_pf.done, do - * nothing here. - */ - if (!work->vcpu) - continue; - - spin_unlock(&vcpu->async_pf.lock); #ifdef CONFIG_KVM_ASYNC_PF_SYNC flush_work(&work->work); #else if (cancel_work_sync(&work->work)) kmem_cache_free(async_pf_cache, work); #endif - spin_lock(&vcpu->async_pf.lock); } + spin_lock(&vcpu->async_pf.lock); while (!list_empty(&vcpu->async_pf.done)) { struct kvm_async_pf *work = list_first_entry(&vcpu->async_pf.done, diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 229570059a1b..992f9beb3e7d 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -97,18 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); - synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { list_del_rcu(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); /* - * synchronize_srcu(&kvm->irq_srcu) already called + * synchronize_srcu_expedited(&kvm->irq_srcu) already called * in kvm_unregister_irq_ack_notifier(). */ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); kfree(resampler); + } else { + synchronize_srcu_expedited(&kvm->irq_srcu); } mutex_unlock(&kvm->irqfds.resampler_lock); @@ -126,7 +127,7 @@ irqfd_shutdown(struct work_struct *work) u64 cnt; /* Make sure irqfd has been initialized in assign path. */ - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); /* * Synchronize with the wait-queue and unhook ourselves to prevent @@ -384,7 +385,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } @@ -523,7 +524,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); - synchronize_srcu(&kvm->irq_srcu); + synchronize_srcu_expedited(&kvm->irq_srcu); kvm_arch_post_irq_ack_notifier_list_update(kvm); } @@ -608,7 +609,7 @@ kvm_irqfd_release(struct kvm *kvm) /* * Take note of a change in irq routing. - * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. + * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards. */ void kvm_irq_routing_update(struct kvm *kvm) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 747fe251e445..8f079a61a56d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -13,39 +13,93 @@ struct kvm_gmem { struct list_head entry; }; -static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +/** + * folio_file_pfn - like folio_file_page, but return a pfn. + * @folio: The folio which contains this index. + * @index: The index we want to look up. + * + * Return: The pfn for this index. + */ +static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) { - struct folio *folio; + return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); +} - /* TODO: Support huge pages. */ - folio = filemap_grab_folio(inode->i_mapping, index); - if (IS_ERR_OR_NULL(folio)) - return NULL; +static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, + pgoff_t index, struct folio *folio) +{ +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE + kvm_pfn_t pfn = folio_file_pfn(folio, index); + gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; + int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); + if (rc) { + pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", + index, gfn, pfn, rc); + return rc; + } +#endif - /* - * Use the up-to-date flag to track whether or not the memory has been - * zeroed before being handed off to the guest. There is no backing - * storage for the memory, so the folio will remain up-to-date until - * it's removed. - * - * TODO: Skip clearing pages when trusted firmware will do it when - * assigning memory to the guest. - */ - if (!folio_test_uptodate(folio)) { - unsigned long nr_pages = folio_nr_pages(folio); - unsigned long i; + return 0; +} + +static inline void kvm_gmem_mark_prepared(struct folio *folio) +{ + folio_mark_uptodate(folio); +} - for (i = 0; i < nr_pages; i++) - clear_highpage(folio_page(folio, i)); +/* + * Process @folio, which contains @gfn, so that the guest can use it. + * The folio must be locked and the gfn must be contained in @slot. + * On successful return the guest sees a zero page so as to avoid + * leaking host data and the up-to-date flag is set. + */ +static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, struct folio *folio) +{ + unsigned long nr_pages, i; + pgoff_t index; + int r; - folio_mark_uptodate(folio); - } + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); /* - * Ignore accessed, referenced, and dirty flags. The memory is - * unevictable and there is no storage to write back to. + * Preparing huge folios should always be safe, since it should + * be possible to split them later if needed. + * + * Right now the folio order is always going to be zero, but the + * code is ready for huge folios. The only assumption is that + * the base pgoff of memslots is naturally aligned with the + * requested page order, ensuring that huge folios can also use + * huge page table entries for GPA->HPA mapping. + * + * The order will be passed when creating the guest_memfd, and + * checked when creating memslots. */ - return folio; + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); + index = gfn - slot->base_gfn + slot->gmem.pgoff; + index = ALIGN_DOWN(index, 1 << folio_order(folio)); + r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); + if (!r) + kvm_gmem_mark_prepared(folio); + + return r; +} + +/* + * Returns a locked folio on success. The caller is responsible for + * setting the up-to-date flag before the memory is mapped into the guest. + * There is no backing storage for the memory, so the folio will remain + * up-to-date until it's removed. + * + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + /* TODO: Support huge pages. */ + return filemap_grab_folio(inode->i_mapping, index); } static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, @@ -146,8 +200,8 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) } folio = kvm_gmem_get_folio(inode, index); - if (!folio) { - r = -ENOMEM; + if (IS_ERR(folio)) { + r = PTR_ERR(folio); break; } @@ -298,10 +352,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol return MF_DELAYED; } +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE +static void kvm_gmem_free_folio(struct folio *folio) +{ + struct page *page = folio_page(folio, 0); + kvm_pfn_t pfn = page_to_pfn(page); + int order = folio_order(folio); + + kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); +} +#endif + static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE + .free_folio = kvm_gmem_free_folio, +#endif }; static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, @@ -360,7 +428,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) inode->i_mode |= S_IFREG; inode->i_size = size; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); - mapping_set_unmovable(inode->i_mapping); + mapping_set_inaccessible(inode->i_mapping); /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); @@ -482,52 +550,153 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) fput(file); } -int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +/* Returns a locked folio on success. */ +static struct folio * +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, + int *max_order) { pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; - struct kvm_gmem *gmem; + struct kvm_gmem *gmem = file->private_data; struct folio *folio; - struct page *page; - struct file *file; - int r; - file = kvm_gmem_get_file(slot); - if (!file) - return -EFAULT; + if (file != slot->gmem.file) { + WARN_ON_ONCE(slot->gmem.file); + return ERR_PTR(-EFAULT); + } gmem = file->private_data; - - if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { - r = -EIO; - goto out_fput; + if (xa_load(&gmem->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + return ERR_PTR(-EIO); } folio = kvm_gmem_get_folio(file_inode(file), index); - if (!folio) { - r = -ENOMEM; - goto out_fput; - } + if (IS_ERR(folio)) + return folio; if (folio_test_hwpoison(folio)) { folio_unlock(folio); folio_put(folio); - r = -EHWPOISON; - goto out_fput; + return ERR_PTR(-EHWPOISON); } - page = folio_file_page(folio, index); - - *pfn = page_to_pfn(page); + *pfn = folio_file_pfn(folio, index); if (max_order) *max_order = 0; - r = 0; + *is_prepared = folio_test_uptodate(folio); + return folio; +} + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + struct file *file = kvm_gmem_get_file(slot); + struct folio *folio; + bool is_prepared = false; + int r = 0; + + if (!file) + return -EFAULT; + + folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); + if (IS_ERR(folio)) { + r = PTR_ERR(folio); + goto out; + } + + if (!is_prepared) + r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); -out_fput: - fput(file); + if (r < 0) + folio_put(folio); +out: + fput(file); return r; } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); + +#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, + kvm_gmem_populate_cb post_populate, void *opaque) +{ + struct file *file; + struct kvm_memory_slot *slot; + void __user *p; + + int ret = 0, max_order; + long i; + + lockdep_assert_held(&kvm->slots_lock); + if (npages < 0) + return -EINVAL; + + slot = gfn_to_memslot(kvm, start_gfn); + if (!kvm_slot_can_be_private(slot)) + return -EINVAL; + + file = kvm_gmem_get_file(slot); + if (!file) + return -EFAULT; + + filemap_invalidate_lock(file->f_mapping); + + npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); + for (i = 0; i < npages; i += (1 << max_order)) { + struct folio *folio; + gfn_t gfn = start_gfn + i; + bool is_prepared = false; + kvm_pfn_t pfn; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + + if (is_prepared) { + folio_unlock(folio); + folio_put(folio); + ret = -EEXIST; + break; + } + + folio_unlock(folio); + WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || + (npages - i) < (1 << max_order)); + + ret = -EINVAL; + while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), + KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_ATTRIBUTE_PRIVATE)) { + if (!max_order) + goto put_folio_and_exit; + max_order--; + } + + p = src ? src + i * PAGE_SIZE : NULL; + ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); + if (!ret) + kvm_gmem_mark_prepared(folio); + +put_folio_and_exit: + folio_put(folio); + if (ret) + break; + } + + filemap_invalidate_unlock(file->f_mapping); + + fput(file); + return ret && !i ? ret : i; +} +EXPORT_SYMBOL_GPL(kvm_gmem_populate); +#endif diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 1e567d1f6d3d..162d8ed889f2 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -237,3 +237,27 @@ out: return r; } + +/* + * Allocate empty IRQ routing by default so that additional setup isn't needed + * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing + * is guaranteed to be non-NULL. + */ +int kvm_init_irq_routing(struct kvm *kvm) +{ + struct kvm_irq_routing_table *new; + int chip_size; + + new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->nr_rt_entries = 1; + + chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS; + memset(new->chip, -1, chip_size); + + RCU_INIT_POINTER(kvm->irq_routing, new); + + return 0; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1192942aef91..cb2b78e92910 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. + * Kernel-based Virtual Machine (KVM) Hypervisor * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -74,6 +71,7 @@ #define ITOA_MAX_LEN 12 MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor"); MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ @@ -91,8 +89,8 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); -/* Default resets per-vcpu halt_poll_ns . */ -unsigned int halt_poll_ns_shrink; +/* Default halves per-vcpu halt_poll_ns. */ +unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); @@ -110,8 +108,7 @@ static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); -struct dentry *kvm_debugfs_dir; -EXPORT_SYMBOL_GPL(kvm_debugfs_dir); +static struct dentry *kvm_debugfs_dir; static const struct file_operations stat_fops_per_vm; @@ -1145,8 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; - int r = -ENOMEM; - int i, j; + int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); @@ -1183,12 +1179,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", task_pid_nr(current)); + r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; + r = kvm_init_irq_routing(kvm); + if (r) + goto out_err_no_irq_routing; + refcount_set(&kvm->users_count, 1); + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; @@ -1206,6 +1208,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } + r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); @@ -1267,6 +1270,8 @@ out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); + kvm_free_irq_routing(kvm); +out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -1573,15 +1578,14 @@ static int check_memory_region_flags(struct kvm *kvm, if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -#ifdef CONFIG_HAVE_KVM_READONLY_MEM /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ - if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + if (kvm_arch_has_readonly_mem(kvm) && + !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; -#endif if (mem->flags & ~valid_flags) return -EINVAL; @@ -2393,48 +2397,47 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static u64 kvm_supported_mem_attributes(struct kvm *kvm) +{ + if (!kvm || kvm_arch_has_private_mem(kvm)) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; +} + /* * Returns true if _all_ gfns in the range [@start, @end) have attributes - * matching @attrs. + * such that the bits in @mask match @attrs. */ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attrs) + unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); unsigned long index; - bool has_attrs; void *entry; - rcu_read_lock(); + mask &= kvm_supported_mem_attributes(kvm); + if (attrs & ~mask) + return false; - if (!attrs) { - has_attrs = !xas_find(&xas, end - 1); - goto out; - } + if (end == start + 1) + return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; + + guard(rcu)(); + if (!attrs) + return !xas_find(&xas, end - 1); - has_attrs = true; for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); - if (xas.xa_index != index || xa_to_value(entry) != attrs) { - has_attrs = false; - break; - } + if (xas.xa_index != index || + (xa_to_value(entry) & mask) != attrs) + return false; } -out: - rcu_read_unlock(); - return has_attrs; -} - -static u64 kvm_supported_mem_attributes(struct kvm *kvm) -{ - if (!kvm || kvm_arch_has_private_mem(kvm)) - return KVM_MEMORY_ATTRIBUTE_PRIVATE; - - return 0; + return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, @@ -2529,7 +2532,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range as the desired attributes. */ - if (kvm_range_has_memory_attributes(kvm, start, end, attributes)) + if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* @@ -4202,12 +4205,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) /* * Creates some virtual cpus. Good luck creating more than one. */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) { int r; struct kvm_vcpu *vcpu; struct page *page; + /* + * KVM tracks vCPU IDs as 'int', be kind to userspace and reject + * too-large values instead of silently truncating. + * + * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first + * changing the storage type (at the very least, IDs should be tracked + * as unsigned ints). + */ + BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX); if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; @@ -4375,6 +4387,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) return fd; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (WARN_ON_ONCE(r == 0 || r == -EIO)) + break; + + if (r < 0) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4421,7 +4479,10 @@ static long kvm_vcpu_ioctl(struct file *filp, synchronize_rcu(); put_pid(oldpid); } + vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); r = kvm_arch_vcpu_ioctl_run(vcpu); + vcpu->wants_to_run = false; + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } @@ -4575,6 +4636,20 @@ out_free1: r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. */ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -6287,8 +6362,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) WRITE_ONCE(vcpu->ready, false); __this_cpu_write(kvm_running_vcpu, vcpu); - kvm_arch_sched_in(vcpu, cpu); kvm_arch_vcpu_load(vcpu, cpu); + + WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, @@ -6296,7 +6372,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - if (current->on_rq) { + WRITE_ONCE(vcpu->scheduled_out, true); + + if (current->on_rq && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index e3453e869e92..f0039efb9e1e 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -430,6 +430,9 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len) { + if (!access_ok((void __user *)uhva, len)) + return -EINVAL; + return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len); } |