author		Paolo Bonzini <pbonzini@redhat.com>	2023-08-31 13:18:53 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2023-08-31 13:18:53 -0400
commit		e0fb12c673e53d2a103b9e0abc92204de0fc325d
tree		3f91fa1ed8efb9e2b80e542075fb93cdbd0c3370	/arch/arm64/kvm/hyp
parent		2dde18cd1d8fac735875f2e4987f11817cc0bc2c
parent		1f66f1246bfa08aaf13db897736de49cbeaf72a1
Merge tag 'kvmarm-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for Linux 6.6
- Add support for TLB range invalidation of Stage-2 page tables,
avoiding unnecessary invalidations; a short chunking sketch follows
this summary. Systems that do not implement range invalidation still
rely on a full invalidation when dealing with large ranges.
- Add infrastructure for forwarding traps taken from an L2 guest to
the L1 guest, with L0 acting as the dispatcher, another baby step
towards full nested virtualization support.
- Simplify the way we deal with the (long deprecated) 'CPU target',
resulting in a much needed cleanup.
- Fix another set of PMU bugs, both on the guest and host sides,
as we seem to never have any shortage of those...
- Relax the alignment requirements of EL2 VA allocations for
non-stack allocations, as we were otherwise wasting a lot of that
precious VA space.
- The usual set of non-functional cleanups, although I note the lack
of spelling fixes...
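
To make the first item above concrete: where FEAT_TLBIRANGE is available, a
large Stage-2 range is invalidated in bounded chunks rather than by dropping
the whole VMID. Below is a minimal userspace sketch of that chunking, loosely
mirroring the kvm_tlb_flush_vmid_range() helper added by this series;
PAGE_SHIFT, MAX_TLBI_RANGE_PAGES and issue_range_tlbi() here are illustrative
stand-ins, not the kernel's definitions.

	/*
	 * Illustrative sketch only -- the constants and issue_range_tlbi()
	 * are stand-ins; only the chunking loop mirrors the series.
	 */
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT		12
	#define MAX_TLBI_RANGE_PAGES	(1UL << 21)	/* per-call limit, made up */

	/* Stand-in for the hypercall that performs one range invalidation. */
	static void issue_range_tlbi(uint64_t addr, unsigned long pages)
	{
		printf("TLBI range: addr=0x%llx pages=%lu\n",
		       (unsigned long long)addr, pages);
	}

	/* Split an arbitrarily large range into chunks the range TLBI can take. */
	static void flush_vmid_range(uint64_t addr, uint64_t size)
	{
		unsigned long pages = size >> PAGE_SHIFT;

		while (pages > 0) {
			unsigned long chunk = pages < MAX_TLBI_RANGE_PAGES ?
					      pages : MAX_TLBI_RANGE_PAGES;

			issue_range_tlbi(addr, chunk);
			addr  += (uint64_t)chunk << PAGE_SHIFT;
			pages -= chunk;
		}
	}

	int main(void)
	{
		flush_vmid_range(0x40000000ULL, 16ULL << 30);	/* 16GiB of IPA space */
		return 0;
	}

On systems without range invalidation the real helper simply falls back to a
full VMID invalidation, as noted in the first item.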
Diffstat (limited to 'arch/arm64/kvm/hyp')
 arch/arm64/kvm/hyp/include/hyp/switch.h | 127
 arch/arm64/kvm/hyp/include/nvhe/mm.h    |   1
 arch/arm64/kvm/hyp/nvhe/hyp-main.c      |  11
 arch/arm64/kvm/hyp/nvhe/mm.c            |  83
 arch/arm64/kvm/hyp/nvhe/setup.c         |  27
 arch/arm64/kvm/hyp/nvhe/switch.c        |   2
 arch/arm64/kvm/hyp/nvhe/tlb.c           |  30
 arch/arm64/kvm/hyp/pgtable.c            |  63
 arch/arm64/kvm/hyp/vhe/tlb.c            |  28
 9 files changed, 287 insertions(+), 85 deletions(-)
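
The largest hunk below reworks the fine-grained trap (FGT) setup around a new
compute_clr_set() macro, which folds the guest hypervisor's view of an FGT
register into clear/set masks layered on top of the host defaults. A minimal
userspace sketch of that folding, with made-up EXAMPLE_* masks standing in
for the per-register RES0/MASK/nMASK definitions (illustration only, not
kernel code):

	#include <stdint.h>
	#include <stdio.h>

	#define EXAMPLE_RES0	0xffffffff00000000ULL	/* reserved-as-zero bits, made up */
	#define EXAMPLE_MASK	0x00000000000000ffULL	/* positive trap bits, made up    */
	#define EXAMPLE_nMASK	0x000000000000ff00ULL	/* negative (nXXX) bits, made up  */

	/* Fold the guest's view of a trap register into host clear/set masks. */
	static void compute_clr_set(uint64_t guest_val, uint64_t *clr, uint64_t *set)
	{
		uint64_t v = guest_val & ~EXAMPLE_RES0;

		*set |= v & EXAMPLE_MASK;	/* positive bits the guest sets stay set    */
		*clr |= ~v & EXAMPLE_nMASK;	/* nXXX bits the guest clears become clears */
	}

	int main(void)
	{
		uint64_t clr = 0, set = 0;

		compute_clr_set(0x0f0fULL, &clr, &set);
		printf("set=0x%llx clr=0x%llx\n",
		       (unsigned long long)set, (unsigned long long)clr);
		return 0;
	}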
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 34f222af6165..9cfe6bd1dbe4 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -70,20 +70,26 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
 	}
 }
 
-static inline bool __hfgxtr_traps_required(void)
-{
-	if (cpus_have_final_cap(ARM64_SME))
-		return true;
-
-	if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
-		return true;
+#define compute_clr_set(vcpu, reg, clr, set)				\
+	do {								\
+		u64 hfg;						\
+		hfg = __vcpu_sys_reg(vcpu, reg) & ~__ ## reg ## _RES0;	\
+		set |= hfg & __ ## reg ## _MASK;			\
+		clr |= ~hfg & __ ## reg ## _nMASK;			\
+	} while(0)
 
-	return false;
-}
 
-static inline void __activate_traps_hfgxtr(void)
+static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
 {
+	struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+	u64 r_val, w_val;
+
+	if (!cpus_have_final_cap(ARM64_HAS_FGT))
+		return;
+
+	ctxt_sys_reg(hctxt, HFGRTR_EL2) = read_sysreg_s(SYS_HFGRTR_EL2);
+	ctxt_sys_reg(hctxt, HFGWTR_EL2) = read_sysreg_s(SYS_HFGWTR_EL2);
 
 	if (cpus_have_final_cap(ARM64_SME)) {
 		tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
@@ -98,26 +104,72 @@ static inline void __activate_traps_hfgxtr(void)
 	if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
 		w_set |= HFGxTR_EL2_TCR_EL1_MASK;
 
-	sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
-	sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+	if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+		compute_clr_set(vcpu, HFGRTR_EL2, r_clr, r_set);
+		compute_clr_set(vcpu, HFGWTR_EL2, w_clr, w_set);
+	}
+
+	/* The default is not to trap anything but ACCDATA_EL1 */
+	r_val = __HFGRTR_EL2_nMASK & ~HFGxTR_EL2_nACCDATA_EL1;
+	r_val |= r_set;
+	r_val &= ~r_clr;
+
+	w_val = __HFGWTR_EL2_nMASK & ~HFGxTR_EL2_nACCDATA_EL1;
+	w_val |= w_set;
+	w_val &= ~w_clr;
+
+	write_sysreg_s(r_val, SYS_HFGRTR_EL2);
+	write_sysreg_s(w_val, SYS_HFGWTR_EL2);
+
+	if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+		return;
+
+	ctxt_sys_reg(hctxt, HFGITR_EL2) = read_sysreg_s(SYS_HFGITR_EL2);
+
+	r_set = r_clr = 0;
+	compute_clr_set(vcpu, HFGITR_EL2, r_clr, r_set);
+	r_val = __HFGITR_EL2_nMASK;
+	r_val |= r_set;
+	r_val &= ~r_clr;
+
+	write_sysreg_s(r_val, SYS_HFGITR_EL2);
+
+	ctxt_sys_reg(hctxt, HDFGRTR_EL2) = read_sysreg_s(SYS_HDFGRTR_EL2);
+	ctxt_sys_reg(hctxt, HDFGWTR_EL2) = read_sysreg_s(SYS_HDFGWTR_EL2);
+
+	r_clr = r_set = w_clr = w_set = 0;
+
+	compute_clr_set(vcpu, HDFGRTR_EL2, r_clr, r_set);
+	compute_clr_set(vcpu, HDFGWTR_EL2, w_clr, w_set);
+
+	r_val = __HDFGRTR_EL2_nMASK;
+	r_val |= r_set;
+	r_val &= ~r_clr;
+
+	w_val = __HDFGWTR_EL2_nMASK;
+	w_val |= w_set;
+	w_val &= ~w_clr;
+
+	write_sysreg_s(r_val, SYS_HDFGRTR_EL2);
+	write_sysreg_s(w_val, SYS_HDFGWTR_EL2);
 }
 
-static inline void __deactivate_traps_hfgxtr(void)
+static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
 {
-	u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+	struct kvm_cpu_context *hctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 
-	if (cpus_have_final_cap(ARM64_SME)) {
-		tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
+	if (!cpus_have_final_cap(ARM64_HAS_FGT))
+		return;
 
-		r_set |= tmp;
-		w_set |= tmp;
-	}
+	write_sysreg_s(ctxt_sys_reg(hctxt, HFGRTR_EL2), SYS_HFGRTR_EL2);
+	write_sysreg_s(ctxt_sys_reg(hctxt, HFGWTR_EL2), SYS_HFGWTR_EL2);
 
-	if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
-		w_clr |= HFGxTR_EL2_TCR_EL1_MASK;
+	if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+		return;
 
-	sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
-	sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+	write_sysreg_s(ctxt_sys_reg(hctxt, HFGITR_EL2), SYS_HFGITR_EL2);
+	write_sysreg_s(ctxt_sys_reg(hctxt, HDFGRTR_EL2), SYS_HDFGRTR_EL2);
+	write_sysreg_s(ctxt_sys_reg(hctxt, HDFGWTR_EL2), SYS_HDFGWTR_EL2);
 }
 
 static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
@@ -145,8 +197,21 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
 	vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
 	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
 
-	if (__hfgxtr_traps_required())
-		__activate_traps_hfgxtr();
+	if (cpus_have_final_cap(ARM64_HAS_HCX)) {
+		u64 hcrx = HCRX_GUEST_FLAGS;
+		if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
+			u64 clr = 0, set = 0;
+
+			compute_clr_set(vcpu, HCRX_EL2, clr, set);
+
+			hcrx |= set;
+			hcrx &= ~clr;
+		}
+
+		write_sysreg_s(hcrx, SYS_HCRX_EL2);
+	}
+
+	__activate_traps_hfgxtr(vcpu);
 }
 
 static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
@@ -162,8 +227,10 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
 		vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
 	}
 
-	if (__hfgxtr_traps_required())
-		__deactivate_traps_hfgxtr();
+	if (cpus_have_final_cap(ARM64_HAS_HCX))
+		write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
+
+	__deactivate_traps_hfgxtr(vcpu);
 }
 
 static inline void ___activate_traps(struct kvm_vcpu *vcpu)
@@ -177,9 +244,6 @@ static inline void ___activate_traps(struct kvm_vcpu *vcpu)
 
 	if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE))
 		write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
-
-	if (cpus_have_final_cap(ARM64_HAS_HCX))
-		write_sysreg_s(HCRX_GUEST_FLAGS, SYS_HCRX_EL2);
 }
 
 static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
@@ -194,9 +258,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
 		vcpu->arch.hcr_el2 &= ~HCR_VSE;
 		vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
 	}
-
-	if (cpus_have_final_cap(ARM64_HAS_HCX))
-		write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
 }
 
 static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index d5ec972b5c1e..230e4f2527de 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -26,6 +26,7 @@ int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot
 int __pkvm_create_private_mapping(phys_addr_t phys, size_t size,
 				  enum kvm_pgtable_prot prot,
 				  unsigned long *haddr);
+int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr);
 int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr);
 
 #endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index a169c619db60..857d9bc04fd4 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -135,6 +135,16 @@ static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctx
 	__kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
 }
 
+static void
+handle___kvm_tlb_flush_vmid_range(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+	DECLARE_REG(phys_addr_t, start, host_ctxt, 2);
+	DECLARE_REG(unsigned long, pages, host_ctxt, 3);
+
+	__kvm_tlb_flush_vmid_range(kern_hyp_va(mmu), start, pages);
+}
+
 static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -327,6 +337,7 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
 	HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
 	HANDLE_FUNC(__kvm_tlb_flush_vmid),
+	HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
 	HANDLE_FUNC(__kvm_flush_cpu_context),
 	HANDLE_FUNC(__kvm_timer_set_cntvoff),
 	HANDLE_FUNC(__vgic_v3_read_vmcr),
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 318298eb3d6b..65a7a186d7b2 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -44,6 +44,27 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
 	return err;
 }
 
+static int __pkvm_alloc_private_va_range(unsigned long start, size_t size)
+{
+	unsigned long cur;
+
+	hyp_assert_lock_held(&pkvm_pgd_lock);
+
+	if (!start || start < __io_map_base)
+		return -EINVAL;
+
+	/* The allocated size is always a multiple of PAGE_SIZE */
+	cur = start + PAGE_ALIGN(size);
+
+	/* Are we overflowing on the vmemmap ? */
+	if (cur > __hyp_vmemmap)
+		return -ENOMEM;
+
+	__io_map_base = cur;
+
+	return 0;
+}
+
 /**
  * pkvm_alloc_private_va_range - Allocates a private VA range.
  * @size:	The size of the VA range to reserve.
@@ -56,27 +77,16 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size,
  */
 int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr)
 {
-	unsigned long base, addr;
-	int ret = 0;
+	unsigned long addr;
+	int ret;
 
 	hyp_spin_lock(&pkvm_pgd_lock);
-
-	/* Align the allocation based on the order of its size */
-	addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size));
-
-	/* The allocated size is always a multiple of PAGE_SIZE */
-	base = addr + PAGE_ALIGN(size);
-
-	/* Are we overflowing on the vmemmap ? */
-	if (!addr || base > __hyp_vmemmap)
-		ret = -ENOMEM;
-	else {
-		__io_map_base = base;
-		*haddr = addr;
-	}
-
+	addr = __io_map_base;
+	ret = __pkvm_alloc_private_va_range(addr, size);
 	hyp_spin_unlock(&pkvm_pgd_lock);
 
+	*haddr = addr;
+
 	return ret;
 }
 
@@ -340,6 +350,45 @@ int hyp_create_idmap(u32 hyp_va_bits)
 	return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC);
 }
 
+int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr)
+{
+	unsigned long addr, prev_base;
+	size_t size;
+	int ret;
+
+	hyp_spin_lock(&pkvm_pgd_lock);
+
+	prev_base = __io_map_base;
+	/*
+	 * Efficient stack verification using the PAGE_SHIFT bit implies
+	 * an alignment of our allocation on the order of the size.
+	 */
+	size = PAGE_SIZE * 2;
+	addr = ALIGN(__io_map_base, size);
+
+	ret = __pkvm_alloc_private_va_range(addr, size);
+	if (!ret) {
+		/*
+		 * Since the stack grows downwards, map the stack to the page
+		 * at the higher address and leave the lower guard page
+		 * unbacked.
+		 *
		 * Any valid stack address now has the PAGE_SHIFT bit as 1
+		 * and addresses corresponding to the guard page have the
+		 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+		 */
+		ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE,
+					  PAGE_SIZE, phys, PAGE_HYP);
+		if (ret)
+			__io_map_base = prev_base;
+	}
+	hyp_spin_unlock(&pkvm_pgd_lock);
+
+	*haddr = addr + size;
+
+	return ret;
+}
+
 static void *admit_host_page(void *arg)
 {
 	struct kvm_hyp_memcache *host_mc = arg;
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index bb98630dfeaf..0d5e0a89ddce 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -113,7 +113,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
 
 	for (i = 0; i < hyp_nr_cpus; i++) {
 		struct kvm_nvhe_init_params *params = per_cpu_ptr(&kvm_init_params, i);
-		unsigned long hyp_addr;
 
 		start = (void *)kern_hyp_va(per_cpu_base[i]);
 		end = start + PAGE_ALIGN(hyp_percpu_size);
@@ -121,33 +120,9 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size,
 		if (ret)
 			return ret;
 
-		/*
-		 * Allocate a contiguous HYP private VA range for the stack
-		 * and guard page. The allocation is also aligned based on
-		 * the order of its size.
-		 */
-		ret = pkvm_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+		ret = pkvm_create_stack(params->stack_pa, &params->stack_hyp_va);
 		if (ret)
 			return ret;
-
-		/*
-		 * Since the stack grows downwards, map the stack to the page
-		 * at the higher address and leave the lower guard page
-		 * unbacked.
-		 *
-		 * Any valid stack address now has the PAGE_SHIFT bit as 1
-		 * and addresses corresponding to the guard page have the
-		 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
-		 */
-		hyp_spin_lock(&pkvm_pgd_lock);
-		ret = kvm_pgtable_hyp_map(&pkvm_pgtable, hyp_addr + PAGE_SIZE,
-					  PAGE_SIZE, params->stack_pa, PAGE_HYP);
-		hyp_spin_unlock(&pkvm_pgd_lock);
-		if (ret)
-			return ret;
-
-		/* Update stack_hyp_va to end of the stack's private VA range */
-		params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
 	}
 
 	/*
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index e89a23153e85..c353a06ee7e6 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -236,7 +236,7 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
 	 * KVM_ARM_VCPU_INIT, however, this is likely not possible for
 	 * protected VMs.
	 */
-	vcpu->arch.target = -1;
+	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
 	*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
 	*exit_code |= ARM_EXCEPTION_IL;
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index b9991bbd8e3f..1b265713d6be 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -182,6 +182,36 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
 	__tlb_switch_to_host(&cxt);
 }
 
+void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+				phys_addr_t start, unsigned long pages)
+{
+	struct tlb_inv_context cxt;
+	unsigned long stride;
+
+	/*
+	 * Since the range of addresses may not be mapped at
+	 * the same level, assume the worst case as PAGE_SIZE
+	 */
+	stride = PAGE_SIZE;
+	start = round_down(start, stride);
+
+	/* Switch to requested VMID */
+	__tlb_switch_to_guest(mmu, &cxt, false);
+
+	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+
+	dsb(ish);
+	__tlbi(vmalle1is);
+	dsb(ish);
+	isb();
+
+	/* See the comment in __kvm_tlb_flush_vmid_ipa() */
+	if (icache_is_vpipt())
+		icache_inval_all_pou();
+
+	__tlb_switch_to_host(&cxt);
+}
+
 void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 {
 	struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f7a93ef29250..f155b8c9e98c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -670,6 +670,26 @@ static bool stage2_has_fwb(struct kvm_pgtable *pgt)
 	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
 }
 
+void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+				phys_addr_t addr, size_t size)
+{
+	unsigned long pages, inval_pages;
+
+	if (!system_supports_tlb_range()) {
+		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+		return;
+	}
+
+	pages = size >> PAGE_SHIFT;
+	while (pages > 0) {
+		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
+		kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);
+
+		addr += inval_pages << PAGE_SHIFT;
+		pages -= inval_pages;
+	}
+}
+
 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
 
 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
@@ -786,7 +806,8 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
	 * evicted pte value (if any).
	 */
 	if (kvm_pte_table(ctx->old, ctx->level))
-		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+		kvm_tlb_flush_vmid_range(mmu, ctx->addr,
+					kvm_granule_size(ctx->level));
 	else if (kvm_pte_valid(ctx->old))
 		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
 			     ctx->addr, ctx->level);
@@ -810,16 +831,36 @@ static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t n
 	smp_store_release(ctx->ptep, new);
 }
 
-static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
-			   struct kvm_pgtable_mm_ops *mm_ops)
+static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
+{
+	/*
+	 * If FEAT_TLBIRANGE is implemented, defer the individual
+	 * TLB invalidations until the entire walk is finished, and
+	 * then use the range-based TLBI instructions to do the
+	 * invalidations. Condition deferred TLB invalidation on the
+	 * system supporting FWB as the optimization is entirely
+	 * pointless when the unmap walker needs to perform CMOs.
+	 */
+	return system_supports_tlb_range() && stage2_has_fwb(pgt);
+}
+
+static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
+				struct kvm_s2_mmu *mmu,
+				struct kvm_pgtable_mm_ops *mm_ops)
 {
+	struct kvm_pgtable *pgt = ctx->arg;
+
 	/*
-	 * Clear the existing PTE, and perform break-before-make with
-	 * TLB maintenance if it was valid.
+	 * Clear the existing PTE, and perform break-before-make if it was
+	 * valid. Depending on the system support, defer the TLB maintenance
+	 * for the same until the entire unmap walk is completed.
	 */
 	if (kvm_pte_valid(ctx->old)) {
 		kvm_clear_pte(ctx->ptep);
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+
+		if (!stage2_unmap_defer_tlb_flush(pgt))
+			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+					ctx->addr, ctx->level);
 	}
 
 	mm_ops->put_page(ctx->ptep);
@@ -1077,7 +1118,7 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
-	stage2_put_pte(ctx, mmu, mm_ops);
+	stage2_unmap_put_pte(ctx, mmu, mm_ops);
 
 	if (need_flush && mm_ops->dcache_clean_inval_poc)
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
@@ -1091,13 +1132,19 @@ static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 
 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
+	int ret;
 	struct kvm_pgtable_walker walker = {
 		.cb	= stage2_unmap_walker,
 		.arg	= pgt,
 		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
 	};
 
-	return kvm_pgtable_walk(pgt, addr, size, &walker);
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	if (stage2_unmap_defer_tlb_flush(pgt))
+		/* Perform the deferred TLB invalidations */
+		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
+
+	return ret;
 }
 
 struct stage2_attr_data {
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index e69da550cdc5..46bd43f61d76 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -143,6 +143,34 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
 	__tlb_switch_to_host(&cxt);
 }
 
+void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
+				phys_addr_t start, unsigned long pages)
+{
+	struct tlb_inv_context cxt;
+	unsigned long stride;
+
+	/*
+	 * Since the range of addresses may not be mapped at
+	 * the same level, assume the worst case as PAGE_SIZE
+	 */
+	stride = PAGE_SIZE;
+	start = round_down(start, stride);
+
+	dsb(ishst);
+
+	/* Switch to requested VMID */
+	__tlb_switch_to_guest(mmu, &cxt);
+
+	__flush_s2_tlb_range_op(ipas2e1is, start, pages, stride, 0);
+
+	dsb(ish);
+	__tlbi(vmalle1is);
+	dsb(ish);
+	isb();
+
+	__tlb_switch_to_host(&cxt);
+}
+
 void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 {
 	struct tlb_inv_context cxt;
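
As a closing illustration of the pkvm_create_stack() change above: the
two-page private VA allocation is aligned to twice PAGE_SIZE and only the
upper page is backed, so a single bit of the stack pointer is enough to tell
a valid stack address from the unbacked guard page. A hedged userspace sketch
of that overflow check, with made-up addresses and a made-up PAGE_SHIFT:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* Valid stack addresses have the PAGE_SHIFT bit set; guard-page ones do not. */
	static bool hyp_sp_overflowed(uint64_t sp)
	{
		return !(sp & PAGE_SIZE);
	}

	int main(void)
	{
		uint64_t base = 0x40000000ULL;			/* VA aligned to 2 * PAGE_SIZE */
		uint64_t stack_hyp_va = base + 2 * PAGE_SIZE;	/* top of the stack            */

		printf("sp=0x%llx overflow=%d\n",
		       (unsigned long long)(stack_hyp_va - 16),
		       hyp_sp_overflowed(stack_hyp_va - 16));	/* inside the stack page   */
		printf("sp=0x%llx overflow=%d\n",
		       (unsigned long long)(base + 16),
		       hyp_sp_overflowed(base + 16));		/* inside the guard page   */
		return 0;
	}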