Diffstat (limited to 'virt')
-rw-r--r--   virt/kvm/Kconfig       |   3
-rw-r--r--   virt/kvm/Makefile.kvm  |   1
-rw-r--r--   virt/kvm/dirty_ring.c  |   2
-rw-r--r--   virt/kvm/kvm_main.c    |  12
-rw-r--r--   virt/kvm/kvm_mm.h      |  44
-rw-r--r--   virt/kvm/mmu_lock.h    |  23
-rw-r--r--   virt/kvm/pfncache.c    | 337
7 files changed, 395 insertions, 27 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 97cf5413ac25..f4834c20e4a6 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -4,6 +4,9 @@
 config HAVE_KVM
        bool
 
+config HAVE_KVM_PFNCACHE
+       bool
+
 config HAVE_KVM_IRQCHIP
        bool
 
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index ffdcad3cc97a..2c27d5d0c367 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -11,3 +11,4 @@ kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
+kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 8e9874760fb3..222ecc81d7df 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -9,7 +9,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kvm_dirty_ring.h>
 #include <trace/events/kvm.h>
-#include "mmu_lock.h"
+#include "kvm_mm.h"
 
 int __weak kvm_cpu_dirty_log_size(void)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index af5b4427b139..6e8e9d36f382 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -59,7 +59,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
-#include "mmu_lock.h"
+#include "kvm_mm.h"
 #include "vfio.h"
 
 #define CREATE_TRACE_POINTS
@@ -711,6 +711,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         kvm->mn_active_invalidate_count++;
         spin_unlock(&kvm->mn_invalidate_lock);
 
+        gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
+                                          hva_range.may_block);
+
         __kvm_handle_hva_range(kvm, &hva_range);
 
         return 0;
@@ -1071,6 +1074,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
         xa_init(&kvm->vcpu_array);
 
+        INIT_LIST_HEAD(&kvm->gpc_list);
+        spin_lock_init(&kvm->gpc_lock);
+
         INIT_LIST_HEAD(&kvm->devices);
 
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -2539,8 +2545,8 @@ out:
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-                        bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+                     bool write_fault, bool *writable)
 {
         struct vm_area_struct *vma;
         kvm_pfn_t pfn = 0;
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
new file mode 100644
index 000000000000..34ca40823260
--- /dev/null
+++ b/virt/kvm/kvm_mm.h
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef __KVM_MM_H__
+#define __KVM_MM_H__ 1
+
+/*
+ * Architectures can choose whether to use an rwlock or spinlock
+ * for the mmu_lock.  These macros, for use in common code
+ * only, avoids using #ifdefs in places that must deal with
+ * multiple architectures.
+ */
+
+#ifdef KVM_HAVE_MMU_RWLOCK
+#define KVM_MMU_LOCK_INIT(kvm)          rwlock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)               write_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)             write_unlock(&(kvm)->mmu_lock)
+#define KVM_MMU_READ_LOCK(kvm)          read_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_READ_UNLOCK(kvm)        read_unlock(&(kvm)->mmu_lock)
+#else
+#define KVM_MMU_LOCK_INIT(kvm)          spin_lock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)               spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)             spin_unlock(&(kvm)->mmu_lock)
+#define KVM_MMU_READ_LOCK(kvm)          spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_READ_UNLOCK(kvm)        spin_unlock(&(kvm)->mmu_lock)
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+                     bool write_fault, bool *writable);
+
+#ifdef CONFIG_HAVE_KVM_PFNCACHE
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+                                       unsigned long start,
+                                       unsigned long end,
+                                       bool may_block);
+#else
+static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+                                                     unsigned long start,
+                                                     unsigned long end,
+                                                     bool may_block)
+{
+}
+#endif /* HAVE_KVM_PFNCACHE */
+
+#endif /* __KVM_MM_H__ */
diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h
deleted file mode 100644
index 9e1308f9734c..000000000000
--- a/virt/kvm/mmu_lock.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#ifndef KVM_MMU_LOCK_H
-#define KVM_MMU_LOCK_H 1
-
-/*
- * Architectures can choose whether to use an rwlock or spinlock
- * for the mmu_lock.  These macros, for use in common code
- * only, avoids using #ifdefs in places that must deal with
- * multiple architectures.
- */
-
-#ifdef KVM_HAVE_MMU_RWLOCK
-#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock)
-#define KVM_MMU_LOCK(kvm)      write_lock(&(kvm)->mmu_lock)
-#define KVM_MMU_UNLOCK(kvm)    write_unlock(&(kvm)->mmu_lock)
-#else
-#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock)
-#define KVM_MMU_LOCK(kvm)      spin_lock(&(kvm)->mmu_lock)
-#define KVM_MMU_UNLOCK(kvm)    spin_unlock(&(kvm)->mmu_lock)
-#endif /* KVM_HAVE_MMU_RWLOCK */
-
-#endif
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
new file mode 100644
index 000000000000..ce878f4be4da
--- /dev/null
+++ b/virt/kvm/pfncache.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables kernel and guest-mode vCPU access to guest physical
+ * memory with suitable invalidation mechanisms.
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Authors:
+ *   David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+
+#include "kvm_mm.h"
+
+/*
+ * MMU notifier 'invalidate_range_start' hook.
+ */
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
+                                       unsigned long end, bool may_block)
+{
+        DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+        struct gfn_to_pfn_cache *gpc;
+        bool wake_vcpus = false;
+
+        spin_lock(&kvm->gpc_lock);
+        list_for_each_entry(gpc, &kvm->gpc_list, list) {
+                write_lock_irq(&gpc->lock);
+
+                /* Only a single page so no need to care about length */
+                if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+                    gpc->uhva >= start && gpc->uhva < end) {
+                        gpc->valid = false;
+
+                        /*
+                         * If a guest vCPU could be using the physical address,
+                         * it needs to be woken.
+                         */
+                        if (gpc->guest_uses_pa) {
+                                if (!wake_vcpus) {
+                                        wake_vcpus = true;
+                                        bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+                                }
+                                __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
+                        }
+
+                        /*
+                         * We cannot call mark_page_dirty() from here because
+                         * this physical CPU might not have an active vCPU
+                         * with which to do the KVM dirty tracking.
+                         *
+                         * Neither is there any point in telling the kernel MM
+                         * that the underlying page is dirty. A vCPU in guest
+                         * mode might still be writing to it up to the point
+                         * where we wake them a few lines further down anyway.
+                         *
+                         * So all the dirty marking happens on the unmap.
+                         */
+                }
+                write_unlock_irq(&gpc->lock);
+        }
+        spin_unlock(&kvm->gpc_lock);
+
+        if (wake_vcpus) {
+                unsigned int req = KVM_REQ_GPC_INVALIDATE;
+                bool called;
+
+                /*
+                 * If the OOM reaper is active, then all vCPUs should have
+                 * been stopped already, so perform the request without
+                 * KVM_REQUEST_WAIT and be sad if any needed to be woken.
+                 */
+                if (!may_block)
+                        req &= ~KVM_REQUEST_WAIT;
+
+                called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+
+                WARN_ON_ONCE(called && !may_block);
+        }
+}
+
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+                                gpa_t gpa, unsigned long len)
+{
+        struct kvm_memslots *slots = kvm_memslots(kvm);
+
+        if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+                return false;
+
+        if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+            kvm_is_error_hva(gpc->uhva))
+                return false;
+
+        if (!gpc->valid)
+                return false;
+
+        return true;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
+
+static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva,
+                          gpa_t gpa, bool dirty)
+{
+        /* Unmap the old page if it was mapped before, and release it */
+        if (!is_error_noslot_pfn(pfn)) {
+                if (khva) {
+                        if (pfn_valid(pfn))
+                                kunmap(pfn_to_page(pfn));
+#ifdef CONFIG_HAS_IOMEM
+                        else
+                                memunmap(khva);
+#endif
+                }
+
+                kvm_release_pfn(pfn, dirty);
+                if (dirty)
+                        mark_page_dirty(kvm, gpa);
+        }
+}
+
+static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, unsigned long uhva)
+{
+        unsigned long mmu_seq;
+        kvm_pfn_t new_pfn;
+        int retry;
+
+        do {
+                mmu_seq = kvm->mmu_notifier_seq;
+                smp_rmb();
+
+                /* We always request a writeable mapping */
+                new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
+                if (is_error_noslot_pfn(new_pfn))
+                        break;
+
+                KVM_MMU_READ_LOCK(kvm);
+                retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva);
+                KVM_MMU_READ_UNLOCK(kvm);
+                if (!retry)
+                        break;
+
+                cond_resched();
+        } while (1);
+
+        return new_pfn;
+}
+
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+                                 gpa_t gpa, unsigned long len, bool dirty)
+{
+        struct kvm_memslots *slots = kvm_memslots(kvm);
+        unsigned long page_offset = gpa & ~PAGE_MASK;
+        kvm_pfn_t old_pfn, new_pfn;
+        unsigned long old_uhva;
+        gpa_t old_gpa;
+        void *old_khva;
+        bool old_valid, old_dirty;
+        int ret = 0;
+
+        /*
+         * If must fit within a single page. The 'len' argument is
+         * only to enforce that.
+         */
+        if (page_offset + len > PAGE_SIZE)
+                return -EINVAL;
+
+        write_lock_irq(&gpc->lock);
+
+        old_gpa = gpc->gpa;
+        old_pfn = gpc->pfn;
+        old_khva = gpc->khva - offset_in_page(gpc->khva);
+        old_uhva = gpc->uhva;
+        old_valid = gpc->valid;
+        old_dirty = gpc->dirty;
+
+        /* If the userspace HVA is invalid, refresh that first */
+        if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+            kvm_is_error_hva(gpc->uhva)) {
+                gfn_t gfn = gpa_to_gfn(gpa);
+
+                gpc->dirty = false;
+                gpc->gpa = gpa;
+                gpc->generation = slots->generation;
+                gpc->memslot = __gfn_to_memslot(slots, gfn);
+                gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+                if (kvm_is_error_hva(gpc->uhva)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+
+                gpc->uhva += page_offset;
+        }
+
+        /*
+         * If the userspace HVA changed or the PFN was already invalid,
+         * drop the lock and do the HVA to PFN lookup again.
+         */
+        if (!old_valid || old_uhva != gpc->uhva) {
+                unsigned long uhva = gpc->uhva;
+                void *new_khva = NULL;
+
+                /* Placeholders for "hva is valid but not yet mapped" */
+                gpc->pfn = KVM_PFN_ERR_FAULT;
+                gpc->khva = NULL;
+                gpc->valid = true;
+
+                write_unlock_irq(&gpc->lock);
+
+                new_pfn = hva_to_pfn_retry(kvm, uhva);
+                if (is_error_noslot_pfn(new_pfn)) {
+                        ret = -EFAULT;
+                        goto map_done;
+                }
+
+                if (gpc->kernel_map) {
+                        if (new_pfn == old_pfn) {
+                                new_khva = old_khva;
+                                old_pfn = KVM_PFN_ERR_FAULT;
+                                old_khva = NULL;
+                        } else if (pfn_valid(new_pfn)) {
+                                new_khva = kmap(pfn_to_page(new_pfn));
+#ifdef CONFIG_HAS_IOMEM
+                        } else {
+                                new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
+#endif
+                        }
+                        if (new_khva)
+                                new_khva += page_offset;
+                        else
+                                ret = -EFAULT;
+                }
+
+        map_done:
+                write_lock_irq(&gpc->lock);
+                if (ret) {
+                        gpc->valid = false;
+                        gpc->pfn = KVM_PFN_ERR_FAULT;
+                        gpc->khva = NULL;
+                } else {
+                        /* At this point, gpc->valid may already have been cleared */
+                        gpc->pfn = new_pfn;
+                        gpc->khva = new_khva;
+                }
+        } else {
+                /* If the HVA→PFN mapping was already valid, don't unmap it. */
+                old_pfn = KVM_PFN_ERR_FAULT;
+                old_khva = NULL;
+        }
+
+ out:
+        if (ret)
+                gpc->dirty = false;
+        else
+                gpc->dirty = dirty;
+
+        write_unlock_irq(&gpc->lock);
+
+        __release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty);
+
+        return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_refresh);
+
+void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+{
+        void *old_khva;
+        kvm_pfn_t old_pfn;
+        bool old_dirty;
+        gpa_t old_gpa;
+
+        write_lock_irq(&gpc->lock);
+
+        gpc->valid = false;
+
+        old_khva = gpc->khva - offset_in_page(gpc->khva);
+        old_dirty = gpc->dirty;
+        old_gpa = gpc->gpa;
+        old_pfn = gpc->pfn;
+
+        /*
+         * We can leave the GPA → uHVA map cache intact but the PFN
+         * lookup will need to be redone even for the same page.
+         */
+        gpc->khva = NULL;
+        gpc->pfn = KVM_PFN_ERR_FAULT;
+
+        write_unlock_irq(&gpc->lock);
+
+        __release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty);
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap);
+
+
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+                              struct kvm_vcpu *vcpu, bool guest_uses_pa,
+                              bool kernel_map, gpa_t gpa, unsigned long len,
+                              bool dirty)
+{
+        if (!gpc->active) {
+                rwlock_init(&gpc->lock);
+
+                gpc->khva = NULL;
+                gpc->pfn = KVM_PFN_ERR_FAULT;
+                gpc->uhva = KVM_HVA_ERR_BAD;
+                gpc->vcpu = vcpu;
+                gpc->kernel_map = kernel_map;
+                gpc->guest_uses_pa = guest_uses_pa;
+                gpc->valid = false;
+                gpc->active = true;
+
+                spin_lock(&kvm->gpc_lock);
+                list_add(&gpc->list, &kvm->gpc_list);
+                spin_unlock(&kvm->gpc_lock);
+        }
+        return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len, dirty);
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
+
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+{
+        if (gpc->active) {
+                spin_lock(&kvm->gpc_lock);
+                list_del(&gpc->list);
+                spin_unlock(&kvm->gpc_lock);
+
+                kvm_gfn_to_pfn_cache_unmap(kvm, gpc);
+                gpc->active = false;
+        }
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
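
Editor's note: the sketch below is illustrative only and is not part of the patch. It shows the caller-side pattern the new API appears designed for: initialise the cache once, then take gpc->lock for read around kvm_gfn_to_pfn_cache_check() and the gpc->khva access, dropping the lock to call kvm_gfn_to_pfn_cache_refresh() when the mapping has gone stale; the MMU-notifier path above takes the same lock for write when it clears gpc->valid. The helper name example_write_guest_u64, the choice of flags (guest_uses_pa = false, kernel_map = true, dirty = true), and the assumption that gpa and sizeof(u64) stay within one page are all mine, not the patch author's; only the five kvm_gfn_to_pfn_cache_* calls and the rwlock come from the code above.

/*
 * Illustrative usage sketch -- not part of this patch. Assumes
 * <linux/kvm_host.h> declarations and a gfn_to_pfn_cache embedded in
 * caller-owned state; the locking pattern is inferred from the
 * write_lock_irq() invalidation in pfncache.c above.
 */
static int example_write_guest_u64(struct kvm *kvm, struct kvm_vcpu *vcpu,
                                   struct gfn_to_pfn_cache *gpc,
                                   gpa_t gpa, u64 val)
{
        int ret;

        /*
         * One-time setup: request a kernel mapping (kernel_map = true),
         * no guest-mode physical-address use, mark the page dirty on
         * unmap. 'len' must not cross a page boundary.
         */
        ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, vcpu, false, true,
                                        gpa, sizeof(val), true);
        if (ret)
                goto out;

        /*
         * Hold gpc->lock for read so the MMU notifier's write-side
         * invalidation cannot pull the mapping out from under the
         * khva dereference; refresh outside the lock if it went stale.
         */
        read_lock(&gpc->lock);
        while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpa, sizeof(val))) {
                read_unlock(&gpc->lock);

                ret = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa,
                                                   sizeof(val), true);
                if (ret)
                        goto out;

                read_lock(&gpc->lock);
        }

        *(u64 *)gpc->khva = val;
        read_unlock(&gpc->lock);
        ret = 0;

out:
        /* Tear down the cache; dirty marking happens on the unmap */
        kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
        return ret;
}

Keeping the refresh outside the read-side critical section is what lets hva_to_pfn_retry() sleep and fault the page in, while the invalidation hook stays callable from the notifier's non-blocking context.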