diff options
Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r-- | arch/arm64/kvm/mmu.c | 452 |
1 files changed, 312 insertions, 140 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index dc04bc767865..754f2fe0cc67 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -15,6 +15,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_pgtable.h> +#include <asm/kvm_pkvm.h> #include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> @@ -29,8 +30,12 @@ static unsigned long __ro_after_init hyp_idmap_start; static unsigned long __ro_after_init hyp_idmap_end; static phys_addr_t __ro_after_init hyp_idmap_vector; +u32 __ro_after_init __hyp_va_bits; + static unsigned long __ro_after_init io_map_base; +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size) { @@ -147,7 +152,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, return -EINVAL; next = __stage2_range_addr_end(addr, end, chunk_size); - ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); if (ret) break; } while (addr = next, addr != end); @@ -168,15 +173,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); return 0; } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - kvm_tlb_flush_vmid_range(&kvm->arch.mmu, - gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); return 0; } @@ -225,7 +238,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) void *pgtable = page_to_virt(page); s8 level = page_private(page); - kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } static void stage2_free_unlinked_table(void *addr, s8 level) @@ -324,13 +337,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block); +} + +void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +{ + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, @@ -339,7 +358,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end); } /** @@ -362,6 +381,8 @@ static void stage2_flush_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_flush_memslot(kvm, memslot); + kvm_nested_s2_flush(kvm); + write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -696,10 +717,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) mutex_lock(&kvm_hyp_pgd_mutex); /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; base = ALIGN_DOWN(io_map_base - size, size); ret = __hyp_alloc_private_va_range(base); @@ -716,12 +737,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr, - PAGE_HYP); + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); if (ret) kvm_err("Cannot map hyp stack\n"); @@ -855,21 +876,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; @@ -896,20 +905,64 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + return 0; +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called in two cases: + * + * - when the VM is created, which can't race against anything + * + * - when secondary kvm_s2_mmu structures are initialised for NV + * guests, and the caller must hold kvm->lock as this is called on a + * per-vcpu basis. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + + /* + * If we already have our page tables in place, and that the + * MMU context is the canonical one, we have a bug somewhere, + * as this is only supposed to ever happen once per VM. + * + * Otherwise, we're building nested page tables, and that's + * probably because userspace called KVM_ARM_VCPU_INIT more + * than once on the same vcpu. Since that's actually legal, + * don't kick a fuss and leave gracefully. + */ if (mmu->pgt != NULL) { + if (kvm_is_nested_s2_mmu(kvm, mmu)) + return 0; + kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; @@ -923,12 +976,15 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; mmu->split_page_cache.gfp_zero = __GFP_ZERO; - mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); + + if (kvm_is_nested_s2_mmu(kvm, mmu)) + kvm_init_nested_s2_mmu(mmu); + return 0; out_destroy_pgtable: - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -976,7 +1032,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -1003,6 +1059,8 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); + kvm_nested_s2_unmap(kvm, true); + write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); srcu_read_unlock(&kvm->srcu, idx); @@ -1023,26 +1081,40 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } -static void hyp_mc_free_fn(void *addr, void *unused) +static void hyp_mc_free_fn(void *addr, void *mc) { + struct kvm_hyp_memcache *memcache = mc; + + if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, -1); + free_page((unsigned long)addr); } -static void *hyp_mc_alloc_fn(void *unused) +static void *hyp_mc_alloc_fn(void *mc) { - return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + struct kvm_hyp_memcache *memcache = mc; + void *addr; + + addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2) + kvm_account_pgtable_pages(addr, 1); + + return addr; } void free_hyp_memcache(struct kvm_hyp_memcache *mc) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) @@ -1050,8 +1122,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) if (!is_protected_kvm_enabled()) return 0; + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, - kvm_host_pa, NULL); + kvm_host_pa, mc); } /** @@ -1088,8 +1166,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, break; write_lock(&kvm->mmu_lock); - ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache, 0); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1102,14 +1180,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } /** - * stage2_wp_range() - write protect stage2 memory region range + * kvm_stage2_wp_range() - write protect stage2 memory region range * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ -static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) +void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** @@ -1138,7 +1216,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_nested_s2_wp(kvm); write_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs_memslot(kvm, memslot); } @@ -1192,7 +1271,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, lockdep_assert_held_write(&kvm->mmu_lock); - stage2_wp_range(&kvm->arch.mmu, start, end); + kvm_stage2_wp_range(&kvm->arch.mmu, start, end); /* * Eager-splitting is done when manual-protect is set. We @@ -1204,6 +1283,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) kvm_mmu_split_huge_pages(kvm, start, end); + + kvm_nested_s2_wp(kvm); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) @@ -1357,10 +1438,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); @@ -1375,6 +1467,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + struct kvm_s2_trans *nested, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) { @@ -1383,16 +1476,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool exec_fault, mte_allowed; bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; + phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1412,8 +1508,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * and a write fault needs to collapse a block entry into a table. */ if (!fault_is_perm || (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) { + memcache = &vcpu->arch.mmu_page_cache; + ret = kvm_mmu_topup_memory_cache(memcache, min_pages); + } else { + memcache = &vcpu->arch.pkvm_memcache; + ret = topup_hyp_memcache(memcache, min_pages); + } if (ret) return ret; } @@ -1434,7 +1537,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. */ - if (logging_active) { + if (logging_active || is_protected_kvm_enabled()) { force_pte = true; vma_shift = PAGE_SHIFT; } else { @@ -1466,10 +1569,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = 1UL << vma_shift; - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + + if (nested) { + unsigned long max_map_size; + + max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; + + ipa = kvm_s2_trans_output(nested); + + /* + * If we're about to create a shadow stage 2 entry, then we + * can only create a block mapping if the guest stage 2 page + * table uses at least as big a mapping. + */ + max_map_size = min(kvm_s2_trans_size(nested), max_map_size); + + /* + * Be careful that if the mapping size falls between + * two host sizes, take the smallest of the two. + */ + if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE) + max_map_size = PMD_SIZE; + else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) + max_map_size = PAGE_SIZE; + + force_pte = (max_map_size == PAGE_SIZE); + vma_pagesize = min(vma_pagesize, (long)max_map_size); + } + + /* + * Both the canonical IPA and fault IPA must be hugepage-aligned to + * ensure we find the right PFN and lay down the mapping in the right + * place. + */ + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { fault_ipa &= ~(vma_pagesize - 1); + ipa &= ~(vma_pagesize - 1); + } - gfn = fault_ipa >> PAGE_SHIFT; + gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; @@ -1479,7 +1617,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -1488,8 +1626,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, - write_fault, &writable, NULL); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1502,7 +1640,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() @@ -1520,10 +1658,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; - read_lock(&kvm->mmu_lock); + /* + * Potentially reduce shadow S2 permissions to match the guest's own + * S2. For exec faults, we'd only reach this point if the guest + * actually allowed it (see kvm_s2_handle_perm_fault). + * + * Also encode the level of the original translation in the SW bits + * of the leaf entry as a proxy for the span of that translation. + * This will be retrieved on TLB invalidation from the guest and + * used to limit the invalidation scope if a TTL hint or a range + * isn't provided. + */ + if (nested) { + writable &= kvm_s2_trans_writable(nested); + if (!kvm_s2_trans_readable(nested)) + prot &= ~KVM_PGTABLE_PROT_R; + + prot |= kvm_encode_nested_level(nested); + } + + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; - if (mmu_invalidate_retry(kvm, mmu_seq)) + if (mmu_invalidate_retry(kvm, mmu_seq)) { + ret = -EAGAIN; goto out_unlock; + } /* * If we are not forced to use page mapping, check if we are @@ -1564,7 +1723,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= KVM_PGTABLE_PROT_NORMAL_NC; else prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) && + (!nested || kvm_s2_trans_executable(nested))) { prot |= KVM_PGTABLE_PROT_X; } @@ -1573,42 +1733,42 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_is_perm && vma_pagesize == fault_granule) - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); - else - ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + if (fault_is_perm && vma_pagesize == fault_granule) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + memcache, flags); + } + +out_unlock: + kvm_release_faultin_page(kvm, page, !!ret, writable); + kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } -out_unlock: - read_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - kvm_pte_t pte; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); - - if (kvm_pte_valid(pte)) - kvm_set_pfn_accessed(kvm_pte_to_pfn(pte)); } /** @@ -1624,17 +1784,38 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { + struct kvm_s2_trans nested_trans, *nested = NULL; unsigned long esr; - phys_addr_t fault_ipa; + phys_addr_t fault_ipa; /* The address we faulted on */ + phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */ struct kvm_memory_slot *memslot; unsigned long hva; bool is_iabt, write_fault, writable; gfn_t gfn; int ret, idx; + /* Synchronous External Abort? */ + if (kvm_vcpu_abt_issea(vcpu)) { + /* + * For RAS the host kernel may handle this abort. + * There is no need to pass the error into the guest. + */ + if (kvm_handle_guest_sea()) + kvm_inject_vabt(vcpu); + + return 1; + } + esr = kvm_vcpu_get_esr(vcpu); - fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + /* + * The fault IPA should be reliable at this point as we're not dealing + * with an SEA. + */ + ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm)) + return -EFAULT; + is_iabt = kvm_vcpu_trap_is_iabt(vcpu); if (esr_fsc_is_translation_fault(esr)) { @@ -1645,7 +1826,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) @@ -1656,18 +1837,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } } - /* Synchronous External Abort? */ - if (kvm_vcpu_abt_issea(vcpu)) { - /* - * For RAS the host kernel may handle this abort. - * There is no need to pass the error into the guest. - */ - if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu))) - kvm_inject_vabt(vcpu); - - return 1; - } - trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); @@ -1684,7 +1853,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + /* + * We may have faulted on a shadow stage 2 page table if we are + * running a nested guest. In this case, we have to resolve the L2 + * IPA to the L1 IPA first, before knowing what kind of memory should + * back the L1 IPA. + * + * If the shadow stage 2 page table walk faults, then we simply inject + * this to the guest and carry on. + * + * If there are no shadow S2 PTs because S2 is disabled, there is + * nothing to walk and we treat it as a 1:1 before going through the + * canonical translation. + */ + if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && + vcpu->arch.hw_mmu->nested_stage2_enabled) { + u32 esr; + + ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); + if (ret) { + esr = kvm_s2_trans_esr(&nested_trans); + kvm_inject_s2_fault(vcpu, esr); + goto out_unlock; + } + + ipa = kvm_s2_trans_output(&nested_trans); + nested = &nested_trans; + } + + gfn = ipa >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1728,13 +1932,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); + ret = io_mem_abort(vcpu, ipa); goto out_unlock; } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); + VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); @@ -1742,7 +1946,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; @@ -1765,40 +1969,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); - return false; -} - -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - kvm_pfn_t pfn = pte_pfn(range->arg.pte); - - if (!kvm->arch.mmu.pgt) - return false; - - WARN_ON(range->end - range->start != 1); - - /* - * If the page isn't tagged, defer to user_mem_abort() for sanitising - * the MTE tags. The S2 pte should have been unmapped by - * mmu_notifier_invalidate_range_end(). - */ - if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn))) - return false; - - /* - * We've moved a page around, probably through CoW, so let's treat - * it just like a translation fault and the map handler will clean - * the cache to the PoC. - * - * The MMU notifiers will have unmapped a huge PMD before calling - * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and - * therefore we never need to clear out a huge PMD through this - * calling path and a memcache is not required. - */ - kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, - PAGE_SIZE, __pfn_to_phys(pfn), - KVM_PGTABLE_PROT_R, NULL, 0); - + kvm_nested_s2_unmap(kvm, range->may_block); return false; } @@ -1809,9 +1980,13 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); + /* + * TODO: Handle nested_mmu structures here using the reverse mapping in + * a later version of patch series. + */ } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1821,7 +1996,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); } @@ -1931,6 +2106,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: @@ -2054,11 +2230,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_uninit_stage2_mmu(kvm); -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -2066,7 +2237,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } |