Diffstat (limited to 'virt/kvm')
-rw-r--r--   virt/kvm/Kconfig          11
-rw-r--r--   virt/kvm/async_pf.c       13
-rw-r--r--   virt/kvm/eventfd.c        13
-rw-r--r--   virt/kvm/guest_memfd.c   275
-rw-r--r--   virt/kvm/irqchip.c        24
-rw-r--r--   virt/kvm/kvm_main.c      160
-rw-r--r--   virt/kvm/pfncache.c        3
7 files changed, 387 insertions, 112 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 29b73eedfe74..fd6a3010afa8 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS
config KVM_GENERIC_DIRTYLOG_READ_PROTECT
bool
+config KVM_GENERIC_PRE_FAULT_MEMORY
+ bool
+
config KVM_COMPAT
def_bool y
depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
@@ -109,3 +112,11 @@ config KVM_GENERIC_PRIVATE_MEM
select KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_PRIVATE_MEM
bool
+
+config HAVE_KVM_ARCH_GMEM_PREPARE
+ bool
+ depends on KVM_PRIVATE_MEM
+
+config HAVE_KVM_ARCH_GMEM_INVALIDATE
+ bool
+ depends on KVM_PRIVATE_MEM
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 99a63bad0306..0ee4816b079a 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -80,7 +80,6 @@ static void async_pf_execute(struct work_struct *work)
spin_lock(&vcpu->async_pf.lock);
first = list_empty(&vcpu->async_pf.done);
list_add_tail(&apf->link, &vcpu->async_pf.done);
- apf->vcpu = NULL;
spin_unlock(&vcpu->async_pf.lock);
/*
@@ -120,8 +119,6 @@ static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
- spin_lock(&vcpu->async_pf.lock);
-
/* cancel outstanding work queue item */
while (!list_empty(&vcpu->async_pf.queue)) {
struct kvm_async_pf *work =
@@ -129,23 +126,15 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
list_del(&work->queue);
- /*
- * We know it's present in vcpu->async_pf.done, do
- * nothing here.
- */
- if (!work->vcpu)
- continue;
-
- spin_unlock(&vcpu->async_pf.lock);
#ifdef CONFIG_KVM_ASYNC_PF_SYNC
flush_work(&work->work);
#else
if (cancel_work_sync(&work->work))
kmem_cache_free(async_pf_cache, work);
#endif
- spin_lock(&vcpu->async_pf.lock);
}
+ spin_lock(&vcpu->async_pf.lock);
while (!list_empty(&vcpu->async_pf.done)) {
struct kvm_async_pf *work =
list_first_entry(&vcpu->async_pf.done,
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 229570059a1b..992f9beb3e7d 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -97,18 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
- synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del_rcu(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
/*
- * synchronize_srcu(&kvm->irq_srcu) already called
+ * synchronize_srcu_expedited(&kvm->irq_srcu) already called
* in kvm_unregister_irq_ack_notifier().
*/
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
+ } else {
+ synchronize_srcu_expedited(&kvm->irq_srcu);
}
mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -126,7 +127,7 @@ irqfd_shutdown(struct work_struct *work)
u64 cnt;
/* Make sure irqfd has been initialized in assign path. */
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
@@ -384,7 +385,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
mutex_unlock(&kvm->irqfds.resampler_lock);
}
@@ -523,7 +524,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
mutex_lock(&kvm->irq_lock);
hlist_del_init_rcu(&kian->link);
mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
@@ -608,7 +609,7 @@ kvm_irqfd_release(struct kvm *kvm)
/*
* Take note of a change in irq routing.
- * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
+ * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm)
{
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 747fe251e445..8f079a61a56d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -13,39 +13,93 @@ struct kvm_gmem {
struct list_head entry;
};
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+/**
+ * folio_file_pfn - like folio_file_page, but return a pfn.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Return: The pfn for this index.
+ */
+static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
- struct folio *folio;
+ return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
+}
- /* TODO: Support huge pages. */
- folio = filemap_grab_folio(inode->i_mapping, index);
- if (IS_ERR_OR_NULL(folio))
- return NULL;
+static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+ pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
+ kvm_pfn_t pfn = folio_file_pfn(folio, index);
+ gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
+ int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
+ if (rc) {
+ pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
+ index, gfn, pfn, rc);
+ return rc;
+ }
+#endif
- /*
- * Use the up-to-date flag to track whether or not the memory has been
- * zeroed before being handed off to the guest. There is no backing
- * storage for the memory, so the folio will remain up-to-date until
- * it's removed.
- *
- * TODO: Skip clearing pages when trusted firmware will do it when
- * assigning memory to the guest.
- */
- if (!folio_test_uptodate(folio)) {
- unsigned long nr_pages = folio_nr_pages(folio);
- unsigned long i;
+ return 0;
+}
+
+static inline void kvm_gmem_mark_prepared(struct folio *folio)
+{
+ folio_mark_uptodate(folio);
+}
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
+/*
+ * Process @folio, which contains @gfn, so that the guest can use it.
+ * The folio must be locked and the gfn must be contained in @slot.
+ * On successful return the guest sees a zero page so as to avoid
+ * leaking host data and the up-to-date flag is set.
+ */
+static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, struct folio *folio)
+{
+ unsigned long nr_pages, i;
+ pgoff_t index;
+ int r;
- folio_mark_uptodate(folio);
- }
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
/*
- * Ignore accessed, referenced, and dirty flags. The memory is
- * unevictable and there is no storage to write back to.
+ * Preparing huge folios should always be safe, since it should
+ * be possible to split them later if needed.
+ *
+ * Right now the folio order is always going to be zero, but the
+ * code is ready for huge folios. The only assumption is that
+ * the base pgoff of memslots is naturally aligned with the
+ * requested page order, ensuring that huge folios can also use
+ * huge page table entries for GPA->HPA mapping.
+ *
+ * The order will be passed when creating the guest_memfd, and
+ * checked when creating memslots.
*/
- return folio;
+ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
+ index = gfn - slot->base_gfn + slot->gmem.pgoff;
+ index = ALIGN_DOWN(index, 1 << folio_order(folio));
+ r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
+ if (!r)
+ kvm_gmem_mark_prepared(folio);
+
+ return r;
+}
+
+/*
+ * Returns a locked folio on success. The caller is responsible for
+ * setting the up-to-date flag before the memory is mapped into the guest.
+ * There is no backing storage for the memory, so the folio will remain
+ * up-to-date until it's removed.
+ *
+ * Ignore accessed, referenced, and dirty flags. The memory is
+ * unevictable and there is no storage to write back to.
+ */
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+ /* TODO: Support huge pages. */
+ return filemap_grab_folio(inode->i_mapping, index);
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -146,8 +200,8 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
}
folio = kvm_gmem_get_folio(inode, index);
- if (!folio) {
- r = -ENOMEM;
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
break;
}
@@ -298,10 +352,24 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+static void kvm_gmem_free_folio(struct folio *folio)
+{
+ struct page *page = folio_page(folio, 0);
+ kvm_pfn_t pfn = page_to_pfn(page);
+ int order = folio_order(folio);
+
+ kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+}
+#endif
+
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+ .free_folio = kvm_gmem_free_folio,
+#endif
};
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -360,7 +428,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
inode->i_mode |= S_IFREG;
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
- mapping_set_unmovable(inode->i_mapping);
+ mapping_set_inaccessible(inode->i_mapping);
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
@@ -482,52 +550,153 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
fput(file);
}
-int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+/* Returns a locked folio on success. */
+static struct folio *
+__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared,
+ int *max_order)
{
pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
- struct kvm_gmem *gmem;
+ struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
- struct page *page;
- struct file *file;
- int r;
- file = kvm_gmem_get_file(slot);
- if (!file)
- return -EFAULT;
+ if (file != slot->gmem.file) {
+ WARN_ON_ONCE(slot->gmem.file);
+ return ERR_PTR(-EFAULT);
+ }
gmem = file->private_data;
-
- if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
- r = -EIO;
- goto out_fput;
+ if (xa_load(&gmem->bindings, index) != slot) {
+ WARN_ON_ONCE(xa_load(&gmem->bindings, index));
+ return ERR_PTR(-EIO);
}
folio = kvm_gmem_get_folio(file_inode(file), index);
- if (!folio) {
- r = -ENOMEM;
- goto out_fput;
- }
+ if (IS_ERR(folio))
+ return folio;
if (folio_test_hwpoison(folio)) {
folio_unlock(folio);
folio_put(folio);
- r = -EHWPOISON;
- goto out_fput;
+ return ERR_PTR(-EHWPOISON);
}
- page = folio_file_page(folio, index);
-
- *pfn = page_to_pfn(page);
+ *pfn = folio_file_pfn(folio, index);
if (max_order)
*max_order = 0;
- r = 0;
+ *is_prepared = folio_test_uptodate(folio);
+ return folio;
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ struct file *file = kvm_gmem_get_file(slot);
+ struct folio *folio;
+ bool is_prepared = false;
+ int r = 0;
+
+ if (!file)
+ return -EFAULT;
+
+ folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order);
+ if (IS_ERR(folio)) {
+ r = PTR_ERR(folio);
+ goto out;
+ }
+
+ if (!is_prepared)
+ r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
folio_unlock(folio);
-out_fput:
- fput(file);
+ if (r < 0)
+ folio_put(folio);
+out:
+ fput(file);
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+ kvm_gmem_populate_cb post_populate, void *opaque)
+{
+ struct file *file;
+ struct kvm_memory_slot *slot;
+ void __user *p;
+
+ int ret = 0, max_order;
+ long i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+ if (npages < 0)
+ return -EINVAL;
+
+ slot = gfn_to_memslot(kvm, start_gfn);
+ if (!kvm_slot_can_be_private(slot))
+ return -EINVAL;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return -EFAULT;
+
+ filemap_invalidate_lock(file->f_mapping);
+
+ npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
+ for (i = 0; i < npages; i += (1 << max_order)) {
+ struct folio *folio;
+ gfn_t gfn = start_gfn + i;
+ bool is_prepared = false;
+ kvm_pfn_t pfn;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ if (is_prepared) {
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = -EEXIST;
+ break;
+ }
+
+ folio_unlock(folio);
+ WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
+ (npages - i) < (1 << max_order));
+
+ ret = -EINVAL;
+ while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
+ KVM_MEMORY_ATTRIBUTE_PRIVATE,
+ KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+ if (!max_order)
+ goto put_folio_and_exit;
+ max_order--;
+ }
+
+ p = src ? src + i * PAGE_SIZE : NULL;
+ ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+ if (!ret)
+ kvm_gmem_mark_prepared(folio);
+
+put_folio_and_exit:
+ folio_put(folio);
+ if (ret)
+ break;
+ }
+
+ filemap_invalidate_unlock(file->f_mapping);
+
+ fput(file);
+ return ret && !i ? ret : i;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+#endif
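
Aside (not part of the patch): kvm_gmem_populate() expects the architecture to supply a callback with the signature used by the post_populate() call above. The following is a minimal sketch under that assumption, with a hypothetical example_post_populate() that only copies the optional userspace buffer into the freshly allocated private page.

/*
 * Hypothetical arch-side callback for kvm_gmem_populate(); a minimal
 * sketch, not taken from this series.  It only copies the optional
 * @src buffer into the just-allocated private page; a real
 * implementation would go on to encrypt/measure the page.  @order is
 * currently always 0 (see kvm_gmem_get_folio()), i.e. a single page.
 */
static int example_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                 void __user *src, int order, void *opaque)
{
        void *vaddr;
        int ret = 0;

        if (!src)
                return 0;       /* no payload, the page stays zeroed */

        vaddr = kmap_local_pfn(pfn);
        if (copy_from_user(vaddr, src, PAGE_SIZE))
                ret = -EFAULT;
        kunmap_local(vaddr);

        return ret;
}

/*
 * Caller sketch, with kvm->slots_lock held:
 *
 *      long n = kvm_gmem_populate(kvm, gfn, uaddr, npages,
 *                                 example_post_populate, NULL);
 */
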
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1e567d1f6d3d..162d8ed889f2 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -237,3 +237,27 @@ out:
return r;
}
+
+/*
+ * Allocate empty IRQ routing by default so that additional setup isn't needed
+ * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
+ * is guaranteed to be non-NULL.
+ */
+int kvm_init_irq_routing(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *new;
+ int chip_size;
+
+ new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->nr_rt_entries = 1;
+
+ chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
+ memset(new->chip, -1, chip_size);
+
+ RCU_INIT_POINTER(kvm->irq_routing, new);
+
+ return 0;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1192942aef91..cb2b78e92910 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1,9 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * Kernel-based Virtual Machine (KVM) Hypervisor
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
@@ -74,6 +71,7 @@
#define ITOA_MAX_LEN 12
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");
/* Architectures should define their poll value according to the halt latency */
@@ -91,8 +89,8 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
-/* Default resets per-vcpu halt_poll_ns . */
-unsigned int halt_poll_ns_shrink;
+/* Default halves per-vcpu halt_poll_ns. */
+unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
@@ -110,8 +108,7 @@ static struct kmem_cache *kvm_vcpu_cache;
static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
-struct dentry *kvm_debugfs_dir;
-EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
+static struct dentry *kvm_debugfs_dir;
static const struct file_operations stat_fops_per_vm;
@@ -1145,8 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
struct kvm *kvm = kvm_arch_alloc_vm();
struct kvm_memslots *slots;
- int r = -ENOMEM;
- int i, j;
+ int r, i, j;
if (!kvm)
return ERR_PTR(-ENOMEM);
@@ -1183,12 +1179,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
task_pid_nr(current));
+ r = -ENOMEM;
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
if (init_srcu_struct(&kvm->irq_srcu))
goto out_err_no_irq_srcu;
+ r = kvm_init_irq_routing(kvm);
+ if (r)
+ goto out_err_no_irq_routing;
+
refcount_set(&kvm->users_count, 1);
+
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
for (j = 0; j < 2; j++) {
slots = &kvm->__memslots[i][j];
@@ -1206,6 +1208,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
}
+ r = -ENOMEM;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
@@ -1267,6 +1270,8 @@ out_err_no_arch_destroy_vm:
WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
for (i = 0; i < KVM_NR_BUSES; i++)
kfree(kvm_get_bus(kvm, i));
+ kvm_free_irq_routing(kvm);
+out_err_no_irq_routing:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -1573,15 +1578,14 @@ static int check_memory_region_flags(struct kvm *kvm,
if (mem->flags & KVM_MEM_GUEST_MEMFD)
valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-#ifdef CONFIG_HAVE_KVM_READONLY_MEM
/*
* GUEST_MEMFD is incompatible with read-only memslots, as writes to
* read-only memslots have emulated MMIO, not page fault, semantics,
* and KVM doesn't allow emulated MMIO for private memory.
*/
- if (!(mem->flags & KVM_MEM_GUEST_MEMFD))
+ if (kvm_arch_has_readonly_mem(kvm) &&
+ !(mem->flags & KVM_MEM_GUEST_MEMFD))
valid_flags |= KVM_MEM_READONLY;
-#endif
if (mem->flags & ~valid_flags)
return -EINVAL;
@@ -2393,48 +2397,47 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
+{
+ if (!kvm || kvm_arch_has_private_mem(kvm))
+ return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ return 0;
+}
+
/*
* Returns true if _all_ gfns in the range [@start, @end) have attributes
- * matching @attrs.
+ * such that the bits in @mask match @attrs.
*/
bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
- unsigned long attrs)
+ unsigned long mask, unsigned long attrs)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
unsigned long index;
- bool has_attrs;
void *entry;
- rcu_read_lock();
+ mask &= kvm_supported_mem_attributes(kvm);
+ if (attrs & ~mask)
+ return false;
- if (!attrs) {
- has_attrs = !xas_find(&xas, end - 1);
- goto out;
- }
+ if (end == start + 1)
+ return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
+
+ guard(rcu)();
+ if (!attrs)
+ return !xas_find(&xas, end - 1);
- has_attrs = true;
for (index = start; index < end; index++) {
do {
entry = xas_next(&xas);
} while (xas_retry(&xas, entry));
- if (xas.xa_index != index || xa_to_value(entry) != attrs) {
- has_attrs = false;
- break;
- }
+ if (xas.xa_index != index ||
+ (xa_to_value(entry) & mask) != attrs)
+ return false;
}
-out:
- rcu_read_unlock();
- return has_attrs;
-}
-
-static u64 kvm_supported_mem_attributes(struct kvm *kvm)
-{
- if (!kvm || kvm_arch_has_private_mem(kvm))
- return KVM_MEMORY_ATTRIBUTE_PRIVATE;
-
- return 0;
+ return true;
}
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
@@ -2529,7 +2532,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_lock(&kvm->slots_lock);
/* Nothing to do if the entire range as the desired attributes. */
- if (kvm_range_has_memory_attributes(kvm, start, end, attributes))
+ if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
goto out_unlock;
/*
@@ -4202,12 +4205,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
int r;
struct kvm_vcpu *vcpu;
struct page *page;
+ /*
+ * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
+ * too-large values instead of silently truncating.
+ *
+ * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
+ * changing the storage type (at the very least, IDs should be tracked
+ * as unsigned ints).
+ */
+ BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
if (id >= KVM_MAX_VCPU_IDS)
return -EINVAL;
@@ -4375,6 +4387,52 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
return fd;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ int idx;
+ long r;
+ u64 full_size;
+
+ if (range->flags)
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(range->gpa) ||
+ !PAGE_ALIGNED(range->size) ||
+ range->gpa + range->size <= range->gpa)
+ return -EINVAL;
+
+ vcpu_load(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ full_size = range->size;
+ do {
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+ if (WARN_ON_ONCE(r == 0 || r == -EIO))
+ break;
+
+ if (r < 0)
+ break;
+
+ range->size -= r;
+ range->gpa += r;
+ cond_resched();
+ } while (range->size);
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ vcpu_put(vcpu);
+
+ /* Return success if at least one page was mapped successfully. */
+ return full_size == range->size ? r : 0;
+}
+#endif
+
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4421,7 +4479,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
synchronize_rcu();
put_pid(oldpid);
}
+ vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
r = kvm_arch_vcpu_ioctl_run(vcpu);
+ vcpu->wants_to_run = false;
+
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
@@ -4575,6 +4636,20 @@ out_free1:
r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
break;
}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+ case KVM_PRE_FAULT_MEMORY: {
+ struct kvm_pre_fault_memory range;
+
+ r = -EFAULT;
+ if (copy_from_user(&range, argp, sizeof(range)))
+ break;
+ r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+ /* Pass back leftover range. */
+ if (copy_to_user(argp, &range, sizeof(range)))
+ r = -EFAULT;
+ break;
+ }
+#endif
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
@@ -6287,8 +6362,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
WRITE_ONCE(vcpu->ready, false);
__this_cpu_write(kvm_running_vcpu, vcpu);
- kvm_arch_sched_in(vcpu, cpu);
kvm_arch_vcpu_load(vcpu, cpu);
+
+ WRITE_ONCE(vcpu->scheduled_out, false);
}
static void kvm_sched_out(struct preempt_notifier *pn,
@@ -6296,7 +6372,9 @@ static void kvm_sched_out(struct preempt_notifier *pn,
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
- if (current->on_rq) {
+ WRITE_ONCE(vcpu->scheduled_out, true);
+
+ if (current->on_rq && vcpu->wants_to_run) {
WRITE_ONCE(vcpu->preempted, true);
WRITE_ONCE(vcpu->ready, true);
}
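
For reference, the intended userspace flow for the KVM_PRE_FAULT_MEMORY vCPU ioctl added above is to retry with the written-back leftover range until it is empty or a non-EINTR error is returned. The snippet below is an illustrative sketch, not part of the patch; it assumes a uapi struct kvm_pre_fault_memory with gpa/size/flags fields and a hypothetical pre_fault_range() helper.

/*
 * Hypothetical userspace helper driving KVM_PRE_FAULT_MEMORY; a sketch
 * only.  Like any vCPU ioctl, it must be issued from the thread that
 * runs the vCPU.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pre_fault_range(int vcpu_fd, __u64 gpa, __u64 size)
{
        struct kvm_pre_fault_memory range = {
                .gpa   = gpa,
                .size  = size,
                .flags = 0,
        };

        /* The kernel writes the leftover range back into @range. */
        while (range.size) {
                if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0 &&
                    errno != EINTR)
                        return -errno;
        }

        return 0;
}
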
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index e3453e869e92..f0039efb9e1e 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -430,6 +430,9 @@ int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
{
+ if (!access_ok((void __user *)uhva, len))
+ return -EINVAL;
+
return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
}