From e97b39c5c4362dc1cbc37a563ddac313b96c84f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:43 -0700 Subject: KVM: Tweak kvm_hva_range and hva_handler_t to allow reusing for gfn ranges Rework and rename "struct kvm_hva_range" into "kvm_mmu_notifier_range" so that the structure can be used to handle notifications that operate on gfn context, i.e. that aren't tied to a host virtual address. Rename the handler typedef too (arguably it should always have been gfn_handler_t). Practically speaking, this is a nop for 64-bit kernels as the only meaningful change is to store start+end as u64s instead of unsigned longs. Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-2-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 486800a7024b..0524933856d4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -541,18 +541,22 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } -typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); +typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, unsigned long end); typedef void (*on_unlock_fn_t)(struct kvm *kvm); -struct kvm_hva_range { - unsigned long start; - unsigned long end; +struct kvm_mmu_notifier_range { + /* + * 64-bit addresses, as KVM notifiers can operate on host virtual + * addresses (unsigned long) and guest physical addresses (64-bit). + */ + u64 start; + u64 end; union kvm_mmu_notifier_arg arg; - hva_handler_t handler; + gfn_handler_t handler; on_lock_fn_t on_lock; on_unlock_fn_t on_unlock; bool flush_on_ret; @@ -581,7 +585,7 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; node = interval_tree_iter_next(node, start, last)) \ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_hva_range *range) + const struct kvm_mmu_notifier_range *range) { bool ret = false, locked = false; struct kvm_gfn_range gfn_range; @@ -608,9 +612,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, unsigned long hva_start, hva_end; slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); - hva_start = max(range->start, slot->userspace_addr); - hva_end = min(range->end, slot->userspace_addr + - (slot->npages << PAGE_SHIFT)); + hva_start = max_t(unsigned long, range->start, slot->userspace_addr); + hva_end = min_t(unsigned long, range->end, + slot->userspace_addr + (slot->npages << PAGE_SHIFT)); /* * To optimize for the likely case where the address @@ -660,10 +664,10 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, union kvm_mmu_notifier_arg arg, - hva_handler_t handler) + gfn_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range range = { + const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .arg = arg, @@ -680,10 +684,10 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, unsigned long start, unsigned long end, - hva_handler_t handler) + gfn_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range range = { + const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .handler = handler, @@ -771,7 +775,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range hva_range = { + const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = kvm_unmap_gfn_range, @@ -835,7 +839,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range hva_range = { + const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = (void *)kvm_null_fn, -- cgit v1.2.3 From c0db19232c1ed6bd7fcb825c28b014c52732c19e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:44 -0700 Subject: KVM: Assert that mmu_invalidate_in_progress *never* goes negative Move the assertion on the in-progress invalidation count from the primary MMU's notifier path to KVM's common notification path, i.e. assert that the count doesn't go negative even when the invalidation is coming from KVM itself. Opportunistically convert the assertion to a KVM_BUG_ON(), i.e. kill only the affected VM, not the entire kernel. A corrupted count is fatal to the VM, e.g. the non-zero (negative) count will cause mmu_invalidate_retry() to block any and all attempts to install new mappings. But it's far from guaranteed that an end() without a start() is fatal or even problematic to anything other than the target VM, e.g. the underlying bug could simply be a duplicate call to end(). And it's much more likely that a missed invalidation, i.e. a potential use-after-free, would manifest as no notification whatsoever, not an end() without a start(). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-3-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0524933856d4..5a97e6c7d9c2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -833,6 +833,7 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ kvm->mmu_invalidate_in_progress--; + KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -863,8 +864,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, */ if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); - - BUG_ON(kvm->mmu_invalidate_in_progress < 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, -- cgit v1.2.3 From 8569992d64b8f750e34b7858eac5d7daaf0f80fd Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:45 -0700 Subject: KVM: Use gfn instead of hva for mmu_notifier_retry Currently in mmu_notifier invalidate path, hva range is recorded and then checked against by mmu_invalidate_retry_hva() in the page fault handling path. However, for the soon-to-be-introduced private memory, a page fault may not have a hva associated, checking gfn(gpa) makes more sense. For existing hva based shared memory, gfn is expected to also work. The only downside is when aliasing multiple gfns to a single hva, the current algorithm of checking multiple ranges could result in a much larger range being rejected. Such aliasing should be uncommon, so the impact is expected small. Suggested-by: Sean Christopherson Cc: Xu Yilun Signed-off-by: Chao Peng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba [sean: convert vmx_set_apic_access_page_addr() to gfn-based API] Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Xu Yilun Message-Id: <20231027182217.3615211-4-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a97e6c7d9c2..9cc57b23ec81 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -543,9 +543,7 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); -typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, - unsigned long end); - +typedef void (*on_lock_fn_t)(struct kvm *kvm); typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { @@ -637,7 +635,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, locked = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) - range->on_lock(kvm, range->start, range->end); + range->on_lock(kvm); + if (IS_KVM_NULL_FN(range->handler)) break; } @@ -742,16 +741,29 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn); } -void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_begin(struct kvm *kvm) { + lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ kvm->mmu_invalidate_in_progress++; + if (likely(kvm->mmu_invalidate_in_progress == 1)) { + kvm->mmu_invalidate_range_start = INVALID_GPA; + kvm->mmu_invalidate_range_end = INVALID_GPA; + } +} + +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); + + if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { kvm->mmu_invalidate_range_start = start; kvm->mmu_invalidate_range_end = end; } else { @@ -771,6 +783,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, } } +static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + return kvm_unmap_gfn_range(kvm, range); +} + static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { @@ -778,7 +796,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, - .handler = kvm_unmap_gfn_range, + .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, @@ -817,9 +835,10 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } -void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_end(struct kvm *kvm) { + lockdep_assert_held_write(&kvm->mmu_lock); + /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -834,6 +853,12 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, */ kvm->mmu_invalidate_in_progress--; KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); + + /* + * Assert that at least one range was added between start() and end(). + * Not adding a range isn't fatal, but it is a KVM bug. + */ + WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- cgit v1.2.3 From d497a0fab8b8457214fcc9b1a39530920ea7e95e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:46 -0700 Subject: KVM: WARN if there are dangling MMU invalidations at VM destruction Add an assertion that there are no in-progress MMU invalidations when a VM is being destroyed, with the exception of the scenario where KVM unregisters its MMU notifier between an .invalidate_range_start() call and the corresponding .invalidate_range_end(). KVM can't detect unpaired calls from the mmu_notifier due to the above exception waiver, but the assertion can detect KVM bugs, e.g. such as the bug that *almost* escaped initial guest_memfd development. Link: https://lore.kernel.org/all/e397d30c-c6af-e68f-d18e-b4e3739c5389@linux.intel.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9cc57b23ec81..5422ce20dcba 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1358,9 +1358,16 @@ static void kvm_destroy_vm(struct kvm *kvm) * No threads can be waiting in kvm_swap_active_memslots() as the * last reference on KVM has been dropped, but freeing * memslots would deadlock without this manual intervention. + * + * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU + * notifier between a start() and end(), then there shouldn't be any + * in-progress invalidations. */ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); - kvm->mn_active_invalidate_count = 0; + if (kvm->mn_active_invalidate_count) + kvm->mn_active_invalidate_count = 0; + else + WARN_ON(kvm->mmu_invalidate_in_progress); #else kvm_flush_shadow_all(kvm); #endif -- cgit v1.2.3 From f128cf8cfbecccf95e891ae90d9c917df5117c7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:49 -0700 Subject: KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to CONFIG_KVM_GENERIC_MMU_NOTIFIER Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where appropriate to effectively maintain existing behavior. Using a proper Kconfig will simplify building more functionality on top of KVM's mmu_notifier infrastructure. Add a forward declaration of kvm_gfn_range to kvm_types.h so that including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't generate warnings due to kvm_gfn_range being undeclared. PPC defines hooks for PR vs. HV without guarding them via #ifdeffery, e.g. bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); Alternatively, PPC could forward declare kvm_gfn_range, but there's no good reason not to define it in common KVM. Acked-by: Anup Patel Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 4 ++++ virt/kvm/kvm_main.c | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 484d0873061c..ecae2914c97e 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,7 @@ config HAVE_KVM_PM_NOTIFIER config KVM_GENERIC_HARDWARE_ENABLING bool + +config KVM_GENERIC_MMU_NOTIFIER + select MMU_NOTIFIER + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5422ce20dcba..dc81279ea385 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,7 +535,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); @@ -962,14 +962,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ +#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ +#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, @@ -1289,7 +1289,7 @@ out_err: out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif @@ -1349,7 +1349,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() -- cgit v1.2.3 From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:50 -0700 Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2 Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dc81279ea385..756b94ecd511 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1580,7 +1580,15 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; @@ -1982,7 +1990,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * Must be called holding kvm->slots_lock for write. */ int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2086,7 +2094,7 @@ int __kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { int r; @@ -2098,7 +2106,7 @@ int kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -4568,6 +4576,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: + case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: @@ -4823,6 +4832,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) return fd; } +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ + offsetof(struct kvm_userspace_memory_region2, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region2, field)); \ +} while (0) + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4845,15 +4862,39 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } + case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_userspace_memory_region2 mem; + unsigned long size; + + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); + size = sizeof(struct kvm_userspace_memory_region); + } else { + size = sizeof(struct kvm_userspace_memory_region2); + } + + /* Ensure the common parts of the two structs are identical. */ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { -- cgit v1.2.3 From cec29eef0a815386d520d61c2cbe16d537931639 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:52 -0700 Subject: KVM: Add a dedicated mmu_notifier flag for reclaiming freed memory Handle AMD SEV's kvm_arch_guest_memory_reclaimed() hook by having __kvm_handle_hva_range() return whether or not an overlapping memslot was found, i.e. mmu_lock was acquired. Using the .on_unlock() hook works, but kvm_arch_guest_memory_reclaimed() needs to run after dropping mmu_lock, which makes .on_lock() and .on_unlock() asymmetrical. Use a small struct to return the tuple of the notifier-specific return, plus whether or not overlap was found. Because the iteration helpers are __always_inlined, practically speaking, the struct will never actually be returned from a function call (not to mention the size of the struct will be two bytes in practice). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 53 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 756b94ecd511..e18a7f152c0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -561,6 +561,19 @@ struct kvm_mmu_notifier_range { bool may_block; }; +/* + * The inner-most helper returns a tuple containing the return value from the + * arch- and action-specific handler, plus a flag indicating whether or not at + * least one memslot was found, i.e. if the handler found guest memory. + * + * Note, most notifiers are averse to booleans, so even though KVM tracks the + * return from arch code as a bool, outer helpers will cast it to an int. :-( + */ +typedef struct kvm_mmu_notifier_return { + bool ret; + bool found_memslot; +} kvm_mn_ret_t; + /* * Use a dedicated stub instead of NULL to indicate that there is no callback * function/handler. The compiler technically can't guarantee that a real @@ -582,22 +595,25 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; node; \ node = interval_tree_iter_next(node, start, last)) \ -static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_mmu_notifier_range *range) +static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) { - bool ret = false, locked = false; + struct kvm_mmu_notifier_return r = { + .ret = false, + .found_memslot = false, + }; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; int i, idx; if (WARN_ON_ONCE(range->end <= range->start)) - return 0; + return r; /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) - return 0; + return r; idx = srcu_read_lock(&kvm->srcu); @@ -631,8 +647,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; - if (!locked) { - locked = true; + if (!r.found_memslot) { + r.found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); @@ -640,14 +656,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (IS_KVM_NULL_FN(range->handler)) break; } - ret |= range->handler(kvm, &gfn_range); + r.ret |= range->handler(kvm, &gfn_range); } } - if (range->flush_on_ret && ret) + if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - if (locked) { + if (r.found_memslot) { KVM_MMU_UNLOCK(kvm); if (!IS_KVM_NULL_FN(range->on_unlock)) range->on_unlock(kvm); @@ -655,8 +671,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, srcu_read_unlock(&kvm->srcu, idx); - /* The notifiers are averse to booleans. :-( */ - return (int)ret; + return r; } static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, @@ -677,7 +692,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .may_block = false, }; - return __kvm_handle_hva_range(kvm, &range); + return __kvm_handle_hva_range(kvm, &range).ret; } static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, @@ -696,7 +711,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .may_block = false, }; - return __kvm_handle_hva_range(kvm, &range); + return __kvm_handle_hva_range(kvm, &range).ret; } static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -798,7 +813,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, - .on_unlock = kvm_arch_guest_memory_reclaimed, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -830,7 +845,13 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, hva_range.may_block); - __kvm_handle_hva_range(kvm, &hva_range); + /* + * If one or more memslots were found and thus zapped, notify arch code + * that guest memory has been reclaimed. This needs to be done *after* + * dropping mmu_lock, as x86's reclaim path is slooooow. + */ + if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) + kvm_arch_guest_memory_reclaimed(kvm); return 0; } -- cgit v1.2.3 From 193bbfaacc84f9ee9c281ec0a8dd2ec8e4821e57 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:53 -0700 Subject: KVM: Drop .on_unlock() mmu_notifier hook Drop the .on_unlock() mmu_notifer hook now that it's no longer used for notifying arch code that memory has been reclaimed. Adding .on_unlock() and invoking it *after* dropping mmu_lock was a terrible idea, as doing so resulted in .on_lock() and .on_unlock() having divergent and asymmetric behavior, and set future developers up for failure, i.e. all but asked for bugs where KVM relied on using .on_unlock() to try to run a callback while holding mmu_lock. Opportunistically add a lockdep assertion in kvm_mmu_invalidate_end() to guard against future bugs of this nature. Reported-by: Isaku Yamahata Link: https://lore.kernel.org/all/20230802203119.GB2021422@ls.amr.corp.intel.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e18a7f152c0b..7f3291dec7a6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -544,7 +544,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm); -typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { /* @@ -556,7 +555,6 @@ struct kvm_mmu_notifier_range { union kvm_mmu_notifier_arg arg; gfn_handler_t handler; on_lock_fn_t on_lock; - on_unlock_fn_t on_unlock; bool flush_on_ret; bool may_block; }; @@ -663,11 +661,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - if (r.found_memslot) { + if (r.found_memslot) KVM_MMU_UNLOCK(kvm); - if (!IS_KVM_NULL_FN(range->on_unlock)) - range->on_unlock(kvm); - } srcu_read_unlock(&kvm->srcu, idx); @@ -687,7 +682,6 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .arg = arg, .handler = handler, .on_lock = (void *)kvm_null_fn, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; @@ -706,7 +700,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; @@ -813,7 +806,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -891,7 +883,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .end = range->end, .handler = (void *)kvm_null_fn, .on_lock = kvm_mmu_invalidate_end, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; -- cgit v1.2.3 From 5a475554db1e476a14216e742ea2bdb77362d5d5 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:55 -0700 Subject: KVM: Introduce per-page memory attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In confidential computing usages, whether a page is private or shared is necessary information for KVM to perform operations like page fault handling, page zapping etc. There are other potential use cases for per-page memory attributes, e.g. to make memory read-only (or no-exec, or exec-only, etc.) without having to modify memslots. Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory attributes to a guest memory range. Use an xarray to store the per-page attributes internally, with a naive, not fully optimized implementation, i.e. prioritize correctness over performance for the initial implementation. Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX attributes/protections in the future, e.g. to give userspace fine-grained control over read, write, and execute protections for guest memory. Provide arch hooks for handling attribute changes before and after common code sets the new attributes, e.g. x86 will use the "pre" hook to zap all relevant mappings, and the "post" hook to track whether or not hugepages can be used to map the range. To simplify the implementation wrap the entire sequence with kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly guaranteed to be an invalidation. For the initial use case, x86 *will* always invalidate memory, and preventing arch code from creating new mappings while the attributes are in flux makes it much easier to reason about the correctness of consuming attributes. It's possible that future usages may not require an invalidation, e.g. if KVM ends up supporting RWX protections and userspace grants _more_ protections, but again opt for simplicity and punt optimizations to if/when they are needed. Suggested-by: Sean Christopherson Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com Cc: Fuad Tabba Cc: Xu Yilun Cc: Mickaël Salaün Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 4 + virt/kvm/kvm_main.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+) (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index ecae2914c97e..5bd7fcaf9089 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -96,3 +96,7 @@ config KVM_GENERIC_HARDWARE_ENABLING config KVM_GENERIC_MMU_NOTIFIER select MMU_NOTIFIER bool + +config KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_GENERIC_MMU_NOTIFIER + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7f3291dec7a6..f1a575d39b3b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1211,6 +1211,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); xa_init(&kvm->vcpu_array); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_init(&kvm->mem_attr_array); +#endif INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); @@ -1391,6 +1394,9 @@ static void kvm_destroy_vm(struct kvm *kvm) } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_destroy(&kvm->mem_attr_array); +#endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); hardware_disable_all(); @@ -2397,6 +2403,200 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +/* + * Returns true if _all_ gfns in the range [@start, @end) have attributes + * matching @attrs. + */ +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attrs) +{ + XA_STATE(xas, &kvm->mem_attr_array, start); + unsigned long index; + bool has_attrs; + void *entry; + + rcu_read_lock(); + + if (!attrs) { + has_attrs = !xas_find(&xas, end - 1); + goto out; + } + + has_attrs = true; + for (index = start; index < end; index++) { + do { + entry = xas_next(&xas); + } while (xas_retry(&xas, entry)); + + if (xas.xa_index != index || xa_to_value(entry) != attrs) { + has_attrs = false; + break; + } + } + +out: + rcu_read_unlock(); + return has_attrs; +} + +static u64 kvm_supported_mem_attributes(struct kvm *kvm) +{ + if (!kvm) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; +} + +static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_mmu_notifier_range *range) +{ + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + struct kvm_memslot_iter iter; + bool found_memslot = false; + bool ret = false; + int i; + + gfn_range.arg = range->arg; + gfn_range.may_block = range->may_block; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { + slot = iter.slot; + gfn_range.slot = slot; + + gfn_range.start = max(range->start, slot->base_gfn); + gfn_range.end = min(range->end, slot->base_gfn + slot->npages); + if (gfn_range.start >= gfn_range.end) + continue; + + if (!found_memslot) { + found_memslot = true; + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + } + + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && ret) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static bool kvm_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + /* + * Unconditionally add the range to the invalidation set, regardless of + * whether or not the arch callback actually needs to zap SPTEs. E.g. + * if KVM supports RWX attributes in the future and the attributes are + * going from R=>RW, zapping isn't strictly necessary. Unconditionally + * adding the range allows KVM to require that MMU invalidations add at + * least one range between begin() and end(), e.g. allows KVM to detect + * bugs where the add() is missed. Relaxing the rule *might* be safe, + * but it's not obvious that allowing new mappings while the attributes + * are in flux is desirable or worth the complexity. + */ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + + return kvm_arch_pre_set_memory_attributes(kvm, range); +} + +/* Set @attributes for the gfn range [@start, @end). */ +static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attributes) +{ + struct kvm_mmu_notifier_range pre_set_range = { + .start = start, + .end = end, + .handler = kvm_pre_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_begin, + .flush_on_ret = true, + .may_block = true, + }; + struct kvm_mmu_notifier_range post_set_range = { + .start = start, + .end = end, + .arg.attributes = attributes, + .handler = kvm_arch_post_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_end, + .may_block = true, + }; + unsigned long i; + void *entry; + int r = 0; + + entry = attributes ? xa_mk_value(attributes) : NULL; + + mutex_lock(&kvm->slots_lock); + + /* Nothing to do if the entire range as the desired attributes. */ + if (kvm_range_has_memory_attributes(kvm, start, end, attributes)) + goto out_unlock; + + /* + * Reserve memory ahead of time to avoid having to deal with failures + * partway through setting the new attributes. + */ + for (i = start; i < end; i++) { + r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); + if (r) + goto out_unlock; + } + + kvm_handle_gfn_range(kvm, &pre_set_range); + + for (i = start; i < end; i++) { + r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, + GFP_KERNEL_ACCOUNT)); + KVM_BUG_ON(r, kvm); + } + + kvm_handle_gfn_range(kvm, &post_set_range); + +out_unlock: + mutex_unlock(&kvm->slots_lock); + + return r; +} +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, + struct kvm_memory_attributes *attrs) +{ + gfn_t start, end; + + /* flags is currently not used. */ + if (attrs->flags) + return -EINVAL; + if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) + return -EINVAL; + if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) + return -EINVAL; + if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) + return -EINVAL; + + start = attrs->address >> PAGE_SHIFT; + end = (attrs->address + attrs->size) >> PAGE_SHIFT; + + /* + * xarray tracks data using "unsigned long", and as a result so does + * KVM. For simplicity, supports generic attributes only on 64-bit + * architectures. + */ + BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); + + return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); +} +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); @@ -4641,6 +4841,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: return 1; +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_CAP_MEMORY_ATTRIBUTES: + return kvm_supported_mem_attributes(kvm); +#endif default: break; } @@ -5034,6 +5238,18 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_SET_MEMORY_ATTRIBUTES: { + struct kvm_memory_attributes attrs; + + r = -EFAULT; + if (copy_from_user(&attrs, argp, sizeof(attrs))) + goto out; + + r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs); + break; + } +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ case KVM_CREATE_DEVICE: { struct kvm_create_device cd; -- cgit v1.2.3 From a7800aa80ea4d5356b8474c2302812e9d4926fa6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Nov 2023 05:42:34 -0500 Subject: KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based memory that is tied to a specific KVM virtual machine and whose primary purpose is to serve guest memory. A guest-first memory subsystem allows for optimizations and enhancements that are kludgy or outright infeasible to implement/support in a generic memory subsystem. With guest_memfd, guest protections and mapping sizes are fully decoupled from host userspace mappings. E.g. KVM currently doesn't support mapping memory as writable in the guest without it also being writable in host userspace, as KVM's ABI uses VMA protections to define the allow guest protection. Userspace can fudge this by establishing two mappings, a writable mapping for the guest and readable one for itself, but that’s suboptimal on multiple fronts. Similarly, KVM currently requires the guest mapping size to be a strict subset of the host userspace mapping size, e.g. KVM doesn’t support creating a 1GiB guest mapping unless userspace also has a 1GiB guest mapping. Decoupling the mappings sizes would allow userspace to precisely map only what is needed without impacting guest performance, e.g. to harden against unintentional accesses to guest memory. Decoupling guest and userspace mappings may also allow for a cleaner alternative to high-granularity mappings for HugeTLB, which has reached a bit of an impasse and is unlikely to ever be merged. A guest-first memory subsystem also provides clearer line of sight to things like a dedicated memory pool (for slice-of-hardware VMs) and elimination of "struct page" (for offload setups where userspace _never_ needs to mmap() guest memory). More immediately, being able to map memory into KVM guests without mapping said memory into the host is critical for Confidential VMs (CoCo VMs), the initial use case for guest_memfd. While AMD's SEV and Intel's TDX prevent untrusted software from reading guest private data by encrypting guest memory with a key that isn't usable by the untrusted host, projects such as Protected KVM (pKVM) provide confidentiality and integrity *without* relying on memory encryption. And with SEV-SNP and TDX, accessing guest private memory can be fatal to the host, i.e. KVM must be prevent host userspace from accessing guest memory irrespective of hardware behavior. Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as being mappable only by KVM (or a similarly enlightened kernel subsystem). That approach was abandoned largely due to it needing to play games with PROT_NONE to prevent userspace from accessing guest memory. Attempt #2 to was to usurp PG_hwpoison to prevent the host from mapping guest private memory into userspace, but that approach failed to meet several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel wouldn't easily be able to enforce a 1:1 page:guest association, let alone a 1:1 pfn:gfn mapping. And using PG_hwpoison does not work for memory that isn't backed by 'struct page', e.g. if devices gain support for exposing encrypted memory regions to guests. Attempt #3 was to extend the memfd() syscall and wrap shmem to provide dedicated file-based guest memory. That approach made it as far as v10 before feedback from Hugh Dickins and Christian Brauner (and others) led to it demise. Hugh's objection was that piggybacking shmem made no sense for KVM's use case as KVM didn't actually *want* the features provided by shmem. I.e. KVM was using memfd() and shmem to avoid having to manage memory directly, not because memfd() and shmem were the optimal solution, e.g. things like read/write/mmap in shmem were dead weight. Christian pointed out flaws with implementing a partial overlay (wrapping only _some_ of shmem), e.g. poking at inode_operations or super_operations would show shmem stuff, but address_space_operations and file_operations would show KVM's overlay. Paraphrashing heavily, Christian suggested KVM stop being lazy and create a proper API. Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org Cc: Fuad Tabba Cc: Vishal Annapurve Cc: Ackerley Tng Cc: Jarkko Sakkinen Cc: Maciej Szmigiero Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Quentin Perret Cc: Michael Roth Cc: Wang Cc: Liam Merwick Cc: Isaku Yamahata Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. Shutemov Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Co-developed-by: Chao Peng Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-17-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 4 + virt/kvm/Makefile.kvm | 1 + virt/kvm/guest_memfd.c | 538 +++++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 59 +++++- virt/kvm/kvm_mm.h | 26 +++ 5 files changed, 623 insertions(+), 5 deletions(-) create mode 100644 virt/kvm/guest_memfd.c (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5bd7fcaf9089..08afef022db9 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -100,3 +100,7 @@ config KVM_GENERIC_MMU_NOTIFIER config KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_GENERIC_MMU_NOTIFIER bool + +config KVM_PRIVATE_MEM + select XARRAY_MULTI + bool diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 2c27d5d0c367..724c89af78af 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c new file mode 100644 index 000000000000..e65f4170425c --- /dev/null +++ b/virt/kvm/guest_memfd.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "kvm_mm.h" + +struct kvm_gmem { + struct kvm *kvm; + struct xarray bindings; + struct list_head entry; +}; + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + /* TODO: Support huge pages. */ + folio = filemap_grab_folio(inode->i_mapping, index); + if (IS_ERR_OR_NULL(folio)) + return NULL; + + /* + * Use the up-to-date flag to track whether or not the memory has been + * zeroed before being handed off to the guest. There is no backing + * storage for the memory, so the folio will remain up-to-date until + * it's removed. + * + * TODO: Skip clearing pages when trusted firmware will do it when + * assigning memory to the guest. + */ + if (!folio_test_uptodate(folio)) { + unsigned long nr_pages = folio_nr_pages(folio); + unsigned long i; + + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + + folio_mark_uptodate(folio); + } + + /* + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ + return folio; +} + +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + bool flush = false, found_memslot = false; + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + pgoff_t pgoff = slot->gmem.pgoff; + + struct kvm_gfn_range gfn_range = { + .start = slot->base_gfn + max(pgoff, start) - pgoff, + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, + .slot = slot, + .may_block = true, + }; + + if (!found_memslot) { + found_memslot = true; + + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_begin(kvm); + } + + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + struct kvm *kvm = gmem->kvm; + + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_end(kvm); + KVM_MMU_UNLOCK(kvm); + } +} + +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct list_head *gmem_list = &inode->i_mapping->private_list; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + struct kvm_gmem *gmem; + + /* + * Bindings must be stable across invalidation to ensure the start+end + * are balanced. + */ + filemap_invalidate_lock(inode->i_mapping); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_begin(gmem, start, end); + + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_end(gmem, start, end); + + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start, index, end; + int r; + + /* Dedicated guest is immutable by default. */ + if (offset + len > i_size_read(inode)) + return -EINVAL; + + filemap_invalidate_lock_shared(mapping); + + start = offset >> PAGE_SHIFT; + end = (offset + len) >> PAGE_SHIFT; + + r = 0; + for (index = start; index < end; ) { + struct folio *folio; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + folio = kvm_gmem_get_folio(inode, index); + if (!folio) { + r = -ENOMEM; + break; + } + + index = folio_next_index(folio); + + folio_unlock(folio); + folio_put(folio); + + /* 64-bit only, wrapping the index should be impossible. */ + if (WARN_ON_ONCE(!index)) + break; + + cond_resched(); + } + + filemap_invalidate_unlock_shared(mapping); + + return r; +} + +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + int ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = kvm_gmem_punch_hole(file_inode(file), offset, len); + else + ret = kvm_gmem_allocate(file_inode(file), offset, len); + + if (!ret) + file_modified(file); + return ret; +} + +static int kvm_gmem_release(struct inode *inode, struct file *file) +{ + struct kvm_gmem *gmem = file->private_data; + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + + /* + * Prevent concurrent attempts to *unbind* a memslot. This is the last + * reference to the file and thus no new bindings can be created, but + * dereferencing the slot for existing bindings needs to be protected + * against memslot updates, specifically so that unbind doesn't race + * and free the memslot (kvm_gmem_get_file() will return NULL). + */ + mutex_lock(&kvm->slots_lock); + + filemap_invalidate_lock(inode->i_mapping); + + xa_for_each(&gmem->bindings, index, slot) + rcu_assign_pointer(slot->gmem.file, NULL); + + synchronize_rcu(); + + /* + * All in-flight operations are gone and new bindings can be created. + * Zap all SPTEs pointed at by this file. Do not free the backing + * memory, as its lifetime is associated with the inode, not the file. + */ + kvm_gmem_invalidate_begin(gmem, 0, -1ul); + kvm_gmem_invalidate_end(gmem, 0, -1ul); + + list_del(&gmem->entry); + + filemap_invalidate_unlock(inode->i_mapping); + + mutex_unlock(&kvm->slots_lock); + + xa_destroy(&gmem->bindings); + kfree(gmem); + + kvm_put_kvm(kvm); + + return 0; +} + +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) +{ + struct file *file; + + rcu_read_lock(); + + file = rcu_dereference(slot->gmem.file); + if (file && !get_file_rcu(file)) + file = NULL; + + rcu_read_unlock(); + + return file; +} + +static struct file_operations kvm_gmem_fops = { + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +}; + +void kvm_gmem_init(struct module *module) +{ + kvm_gmem_fops.owner = module; +} + +static int kvm_gmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) +{ + struct list_head *gmem_list = &mapping->private_list; + struct kvm_gmem *gmem; + pgoff_t start, end; + + filemap_invalidate_lock_shared(mapping); + + start = page->index; + end = start + thp_nr_pages(page); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_begin(gmem, start, end); + + /* + * Do not truncate the range, what action is taken in response to the + * error is userspace's decision (assuming the architecture supports + * gracefully handling memory errors). If/when the guest attempts to + * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, + * at which point KVM can either terminate the VM or propagate the + * error to userspace. + */ + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_end(gmem, start, end); + + filemap_invalidate_unlock_shared(mapping); + + return MF_DELAYED; +} + +static const struct address_space_operations kvm_gmem_aops = { + .dirty_folio = noop_dirty_folio, +#ifdef CONFIG_MIGRATION + .migrate_folio = kvm_gmem_migrate_folio, +#endif + .error_remove_page = kvm_gmem_error_page, +}; + +static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = path->dentry->d_inode; + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EINVAL; +} +static const struct inode_operations kvm_gmem_iops = { + .getattr = kvm_gmem_getattr, + .setattr = kvm_gmem_setattr, +}; + +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) +{ + const char *anon_name = "[kvm-gmem]"; + struct kvm_gmem *gmem; + struct inode *inode; + struct file *file; + int fd, err; + + fd = get_unused_fd_flags(0); + if (fd < 0) + return fd; + + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); + if (!gmem) { + err = -ENOMEM; + goto err_fd; + } + + file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, + O_RDWR, NULL); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_gmem; + } + + file->f_flags |= O_LARGEFILE; + + inode = file->f_inode; + WARN_ON(file->f_mapping != inode->i_mapping); + + inode->i_private = (void *)(unsigned long)flags; + inode->i_op = &kvm_gmem_iops; + inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mode |= S_IFREG; + inode->i_size = size; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unmovable(inode->i_mapping); + /* Unmovable mappings are supposed to be marked unevictable as well. */ + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + + kvm_get_kvm(kvm); + gmem->kvm = kvm; + xa_init(&gmem->bindings); + list_add(&gmem->entry, &inode->i_mapping->private_list); + + fd_install(fd, file); + return fd; + +err_gmem: + kfree(gmem); +err_fd: + put_unused_fd(fd); + return err; +} + +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) +{ + loff_t size = args->size; + u64 flags = args->flags; + u64 valid_flags = 0; + + if (flags & ~valid_flags) + return -EINVAL; + + if (size <= 0 || !PAGE_ALIGNED(size)) + return -EINVAL; + + return __kvm_gmem_create(kvm, size, flags); +} + +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + loff_t size = slot->npages << PAGE_SHIFT; + unsigned long start, end; + struct kvm_gmem *gmem; + struct inode *inode; + struct file *file; + int r = -EINVAL; + + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); + + file = fget(fd); + if (!file) + return -EBADF; + + if (file->f_op != &kvm_gmem_fops) + goto err; + + gmem = file->private_data; + if (gmem->kvm != kvm) + goto err; + + inode = file_inode(file); + + if (offset < 0 || !PAGE_ALIGNED(offset) || + offset + size > i_size_read(inode)) + goto err; + + filemap_invalidate_lock(inode->i_mapping); + + start = offset >> PAGE_SHIFT; + end = start + slot->npages; + + if (!xa_empty(&gmem->bindings) && + xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + filemap_invalidate_unlock(inode->i_mapping); + goto err; + } + + /* + * No synchronize_rcu() needed, any in-flight readers are guaranteed to + * be see either a NULL file or this new file, no need for them to go + * away. + */ + rcu_assign_pointer(slot->gmem.file, file); + slot->gmem.pgoff = start; + + xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + filemap_invalidate_unlock(inode->i_mapping); + + /* + * Drop the reference to the file, even on success. The file pins KVM, + * not the other way 'round. Active bindings are invalidated if the + * file is closed before memslots are destroyed. + */ + r = 0; +err: + fput(file); + return r; +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + unsigned long start = slot->gmem.pgoff; + unsigned long end = start + slot->npages; + struct kvm_gmem *gmem; + struct file *file; + + /* + * Nothing to do if the underlying file was already closed (or is being + * closed right now), kvm_gmem_release() invalidates all bindings. + */ + file = kvm_gmem_get_file(slot); + if (!file) + return; + + gmem = file->private_data; + + filemap_invalidate_lock(file->f_mapping); + xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + rcu_assign_pointer(slot->gmem.file, NULL); + synchronize_rcu(); + filemap_invalidate_unlock(file->f_mapping); + + fput(file); +} + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; + struct kvm_gmem *gmem; + struct folio *folio; + struct page *page; + struct file *file; + int r; + + file = kvm_gmem_get_file(slot); + if (!file) + return -EFAULT; + + gmem = file->private_data; + + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { + r = -EIO; + goto out_fput; + } + + folio = kvm_gmem_get_folio(file_inode(file), index); + if (!folio) { + r = -ENOMEM; + goto out_fput; + } + + if (folio_test_hwpoison(folio)) { + r = -EHWPOISON; + goto out_unlock; + } + + page = folio_file_page(folio, index); + + *pfn = page_to_pfn(page); + if (max_order) + *max_order = 0; + + r = 0; + +out_unlock: + folio_unlock(folio); +out_fput: + fput(file); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f1a575d39b3b..8f46d757a2c5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -791,7 +791,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) } } -static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_unmap_gfn_range(kvm, range); @@ -1027,6 +1027,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + if (slot->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(slot); + kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); @@ -1606,10 +1609,18 @@ static void kvm_replace_memslot(struct kvm *kvm, #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) -static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) +static int check_memory_region_flags(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + if (kvm_arch_has_private_mem(kvm)) + valid_flags |= KVM_MEM_GUEST_MEMFD; + + /* Dirty logging private memory is not currently supported. */ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + #ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif @@ -2018,7 +2029,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; - r = check_memory_region_flags(mem); + r = check_memory_region_flags(kvm, mem); if (r) return r; @@ -2037,6 +2048,10 @@ int __kvm_set_memory_region(struct kvm *kvm, !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; + if (mem->flags & KVM_MEM_GUEST_MEMFD && + (mem->guest_memfd_offset & (PAGE_SIZE - 1) || + mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) + return -EINVAL; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) @@ -2075,6 +2090,9 @@ int __kvm_set_memory_region(struct kvm *kvm, if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ + /* Private memslots are immutable, they can only be deleted. */ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) @@ -2103,10 +2121,23 @@ int __kvm_set_memory_region(struct kvm *kvm, new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; + if (mem->flags & KVM_MEM_GUEST_MEMFD) { + r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); + if (r) + goto out; + } r = kvm_set_memslot(kvm, old, new, change); if (r) - kfree(new); + goto out_unbind; + + return 0; + +out_unbind: + if (mem->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(new); +out: + kfree(new); return r; } EXPORT_SYMBOL_GPL(__kvm_set_memory_region); @@ -2442,7 +2473,7 @@ out: static u64 kvm_supported_mem_attributes(struct kvm *kvm) { - if (!kvm) + if (!kvm || kvm_arch_has_private_mem(kvm)) return KVM_MEMORY_ATTRIBUTE_PRIVATE; return 0; @@ -4844,6 +4875,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); +#endif +#ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); #endif default: break; @@ -5277,6 +5312,18 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; +#ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CREATE_GUEST_MEMFD: { + struct kvm_create_guest_memfd guest_memfd; + + r = -EFAULT; + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd))) + goto out; + + r = kvm_gmem_create(kvm, &guest_memfd); + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -6409,6 +6456,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; + kvm_gmem_init(module); + /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 180f1a09e6ba..ecefc7ec51af 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -37,4 +37,30 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, } #endif /* HAVE_KVM_PFNCACHE */ +#ifdef CONFIG_KVM_PRIVATE_MEM +void kvm_gmem_init(struct module *module); +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset); +void kvm_gmem_unbind(struct kvm_memory_slot *slot); +#else +static inline void kvm_gmem_init(struct module *module) +{ + +} + +static inline int kvm_gmem_bind(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + WARN_ON_ONCE(1); + return -EIO; +} + +static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + WARN_ON_ONCE(1); +} +#endif /* CONFIG_KVM_PRIVATE_MEM */ + #endif /* __KVM_MM_H__ */ -- cgit v1.2.3 From eed52e434bc33603ddb0af62b6c4ef818948489d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:04 -0700 Subject: KVM: Allow arch code to track number of memslot address spaces per VM Let x86 track the number of address spaces on a per-VM basis so that KVM can disallow SMM memslots for confidential VMs. Confidentials VMs are fundamentally incompatible with emulating SMM, which as the name suggests requires being able to read and write guest memory and register state. Disallowing SMM will simplify support for guest private memory, as KVM will not need to worry about tracking memory attributes for multiple address spaces (SMM is the only "non-default" address space across all architectures). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'virt') diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index c1cd7dfe4a90..86d267db87bb 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -58,7 +58,7 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) as_id = slot >> 16; id = (u16)slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return; memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8f46d757a2c5..8758cb799e18 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -615,7 +615,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); @@ -1241,7 +1241,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) goto out_err_no_irq_srcu; refcount_set(&kvm->users_count, 1); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; @@ -1391,7 +1391,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } @@ -1682,7 +1682,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... */ - gen += KVM_ADDRESS_SPACE_NUM; + gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); @@ -2052,7 +2052,7 @@ int __kvm_set_memory_region(struct kvm *kvm, (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; @@ -2188,7 +2188,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2250,7 +2250,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2362,7 +2362,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) @@ -2493,7 +2493,7 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { @@ -4848,9 +4848,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif -#if KVM_ADDRESS_SPACE_NUM > 1 +#if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: - return KVM_ADDRESS_SPACE_NUM; + if (kvm) + return kvm_arch_nr_memslot_as_ids(kvm); + return KVM_MAX_NR_ADDRESS_SPACES; #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; @@ -4958,7 +4960,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) lockdep_assert_held(&kvm->slots_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) return false; } -- cgit v1.2.3 From 89ea60c2c7b5838bf192c50062d5720cd6ab8662 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:05 -0700 Subject: KVM: x86: Add support for "protected VMs" that can utilize private memory Add a new x86 VM type, KVM_X86_SW_PROTECTED_VM, to serve as a development and testing vehicle for Confidential (CoCo) VMs, and potentially to even become a "real" product in the distant future, e.g. a la pKVM. The private memory support in KVM x86 is aimed at AMD's SEV-SNP and Intel's TDX, but those technologies are extremely complex (understatement), difficult to debug, don't support running as nested guests, and require hardware that's isn't universally accessible. I.e. relying SEV-SNP or TDX for maintaining guest private memory isn't a realistic option. At the very least, KVM_X86_SW_PROTECTED_VM will enable a variety of selftests for guest_memfd and private memory support without requiring unique hardware. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-24-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 08afef022db9..2c964586aa14 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -104,3 +104,8 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES config KVM_PRIVATE_MEM select XARRAY_MULTI bool + +config KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_PRIVATE_MEM + bool -- cgit v1.2.3