diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2023-07-01 07:04:29 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2023-07-01 07:04:29 -0400 |
commit | cc744042d90809ccb7cac7f9fb773f5c9cb9f835 (patch) | |
tree | ff26add0ac2a17b15bb7b789f276623aae7e8201 /arch/arm64/kvm/hyp | |
parent | b5396271eab4ec28f0d27ff48e1b151b7b824295 (diff) | |
parent | 192df2aa0113ddddee2a93e453ff46610807b425 (diff) | |
download | linux-cc744042d90809ccb7cac7f9fb773f5c9cb9f835.tar.gz linux-cc744042d90809ccb7cac7f9fb773f5c9cb9f835.tar.bz2 linux-cc744042d90809ccb7cac7f9fb773f5c9cb9f835.zip |
Merge tag 'kvmarm-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for 6.5
- Eager page splitting optimization for dirty logging, optionally
allowing for a VM to avoid the cost of block splitting in the stage-2
fault path.
- Arm FF-A proxy for pKVM, allowing a pKVM host to safely interact with
services that live in the Secure world. pKVM intervenes on FF-A calls
to guarantee the host doesn't misuse memory donated to the hyp or a
pKVM guest.
- Support for running the split hypervisor with VHE enabled, known as
'hVHE' mode. This is extremely useful for testing the split
hypervisor on VHE-only systems, and paves the way for new use cases
that depend on having two TTBRs available at EL2.
- Generalized framework for configurable ID registers from userspace.
KVM/arm64 currently prevents arbitrary CPU feature set configuration
from userspace, but the intent is to relax this limitation and allow
userspace to select a feature set consistent with the CPU.
- Enable the use of Branch Target Identification (FEAT_BTI) in the
hypervisor.
- Use a separate set of pointer authentication keys for the hypervisor
when running in protected mode, as the host is untrusted at runtime.
- Ensure timer IRQs are consistently released in the init failure
paths.
- Avoid trapping CTR_EL0 on systems with Enhanced Virtualization Traps
(FEAT_EVT), as it is a register commonly read from userspace.
- Erratum workaround for the upcoming AmpereOne part, which has broken
hardware A/D state management.
As a consequence of the hVHE series reworking the arm64 software
features framework, the for-next/module-alloc branch from the arm64 tree
comes along for the ride.
Diffstat (limited to 'arch/arm64/kvm/hyp')
-rw-r--r-- | arch/arm64/kvm/hyp/include/hyp/switch.h | 101 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/nvhe/ffa.h | 17 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/Makefile | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/ffa.c | 762 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/host.S | 36 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/hyp-init.S | 32 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/hyp-main.c | 19 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/mem_protect.c | 74 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/pkvm.c | 27 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/setup.c | 11 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/switch.c | 28 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/timer-sr.c | 16 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/tlb.c | 52 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/pgtable.c | 228 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/vhe/switch.c | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/vhe/tlb.c | 32 |
17 files changed, 1370 insertions, 72 deletions
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 4fe217efa218..f35d5abedf9c 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -70,6 +70,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline bool __hfgxtr_traps_required(void) +{ + if (cpus_have_final_cap(ARM64_SME)) + return true; + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + return true; + + return false; +} + +static inline void __activate_traps_hfgxtr(void) +{ + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; + + if (cpus_have_final_cap(ARM64_SME)) { + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; + + r_clr |= tmp; + w_clr |= tmp; + } + + /* + * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + w_set |= HFGxTR_EL2_TCR_EL1_MASK; + + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); +} + +static inline void __deactivate_traps_hfgxtr(void) +{ + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; + + if (cpus_have_final_cap(ARM64_SME)) { + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; + + r_set |= tmp; + w_set |= tmp; + } + + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + w_clr |= HFGxTR_EL2_TCR_EL1_MASK; + + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -95,16 +145,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); - sysreg_clear_set_s(SYS_HFGWTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); - } + if (__hfgxtr_traps_required()) + __activate_traps_hfgxtr(); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -120,14 +162,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - sysreg_clear_set_s(SYS_HFGWTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - } + if (__hfgxtr_traps_required()) + __deactivate_traps_hfgxtr(); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) @@ -203,7 +239,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe()) { + if (has_vhe() || has_hvhe()) { reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; if (sve_guest) reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; @@ -395,12 +431,39 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) return true; } +static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + int rt = kvm_vcpu_sys_get_rt(vcpu); + u64 val = vcpu_get_reg(vcpu, rt); + + if (sysreg != SYS_TCR_EL1) + return false; + + /* + * Affected parts do not advertise support for hardware Access Flag / + * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying + * control bits are still functional. The architecture requires these be + * RES0 on systems that do not implement FEAT_HAFDBS. + * + * Uphold the requirements of the architecture by masking guest writes + * to TCR_EL1.{HA,HD} here. + */ + val &= ~(TCR_HD | TCR_HA); + write_sysreg_el1(val, SYS_TCR); + return true; +} + static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) return true; + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) && + handle_ampere1_tcr(vcpu)) + return true; + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..1becb10ecd80 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include <asm/kvm_host.h> + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0x7F + +int hyp_ffa_init(void *pages); +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b7bdbe63deed..0972faccc2af 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -57,6 +57,7 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, + PKVM_ID_FFA, }; extern unsigned long hyp_nr_cpus; @@ -66,6 +67,8 @@ int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 530347cdebe3..9ddc025e4b86 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 000000000000..58dcd92bf346 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran <qwandor@google.com> + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. + */ + +#include <linux/arm-smccc.h> +#include <linux/arm_ffa.h> +#include <asm/kvm_pkvm.h> + +#include <nvhe/ffa.h> +#include <nvhe/mem_protect.h> +#include <nvhe/memory.h> +#include <nvhe/trap_handler.h> +#include <nvhe/spinlock.h> + +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. + */ +#define HOST_FFA_ID 0 + +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. + */ +static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; + +static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_res) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, + .a2 = prop }; + } else { + ffa_to_smccc_error(res, ret); + } +} + +static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_res *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +static int ffa_map_hyp_buffers(u64 ffa_page_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, + hyp_virt_to_phys(hyp_buffers.tx), + hyp_virt_to_phys(hyp_buffers.rx), + ffa_page_count, + 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static int ffa_unmap_hyp_buffers(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_RXTX_UNMAP, + HOST_FFA_ID, + 0, 0, 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, + handle_lo, handle_hi, fraglen, endpoint_id, + 0, 0, 0, + res); +} + +static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, + handle_lo, handle_hi, fragoff, HOST_FFA_ID, + 0, 0, 0, + res); +} + +static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, + u32 fraglen) +{ + arm_smccc_1_1_smc(func_id, len, fraglen, + 0, 0, 0, 0, 0, + res); +} + +static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_1_smc(FFA_MEM_RECLAIM, + handle_lo, handle_hi, flags, + 0, 0, 0, 0, + res); +} + +static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, + len, len, + 0, 0, 0, 0, 0, + res); +} + +static void do_ffa_rxtx_map(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + /* + * Map our hypervisor buffers into the SPMD before mapping and + * pinning the host buffers in our own address space. + */ + ret = ffa_map_hyp_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + ffa_unmap_hyp_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + ffa_unmap_hyp_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state back to what it was prior to + * transmission of the first fragment. + */ + ffa_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't rollback + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, hence preventing the host from + * sharing/donating them again and may possibly lead to subsequent + * failures, but this will not compromise confidentiality. + */ + return; +} + +static __always_inline void do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + u32 offset, nr_ranges; + int ret = 0; + + BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && + func_id != FFA_FN64_MEM_LEND); + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + offset = buf->ep_mem_access[0].composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); + if (ret) + goto out_unlock; + + ffa_mem_xfer(res, func_id, len, fraglen); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != FFA_SUCCESS) { + goto err_unshare; + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; +} + +static void do_ffa_mem_reclaim(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; + struct ffa_mem_region *buf; + int ret = 0; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + ffa_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + len = res->a1; + fraglen = res->a2; + + offset = buf->ep_mem_access[0].composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. + */ + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + } + + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + reg = (void *)buf + offset; + /* If the SPMD was happy, then we should be too. */ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + +/* + * Is a given FFA function supported, either by forwarding on directly + * or by handling at EL2? + */ +static bool ffa_call_supported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_REQ: + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + return false; + } + + return true; +} + +static bool do_ffa_features(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (!ffa_call_supported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + struct arm_smccc_res res; + + /* + * There's no way we can tell what a non-standard SMC call might + * be up to. Ideally, we would terminate these here and return + * an error to the host, but sadly devices make use of custom + * firmware calls for things like power management, debugging, + * RNG access and crash reporting. + * + * Given that the architecture requires us to trust EL3 anyway, + * we forward unrecognised calls on under the assumption that + * the firmware doesn't expose a mechanism to access arbitrary + * non-secure memory. Short of a per-device table of SMCs, this + * is the best we can do. + */ + if (!is_ffa_call(func_id)) + return false; + + switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; + /* Memory management */ + case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; + case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); + goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; + case FFA_MEM_FRAG_TX: + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; + } + + if (ffa_call_supported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: + ffa_set_retval(host_ctxt, &res); + return true; +} + +int hyp_ffa_init(void *pages) +{ + struct arm_smccc_res res; + size_t min_rxtx_sz; + void *tx, *rx; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + return 0; + + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 == FFA_RET_NOT_SUPPORTED) + return 0; + + if (res.a0 != FFA_VERSION_1_0) + return -EOPNOTSUPP; + + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, + 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = tx, + .rx = rx, + }; + + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index b6c0188c4b35..c87c63133e10 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -10,6 +10,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_ptrauth.h> .text @@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit) /* Save the host context pointer in x29 across the function call */ mov x29, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_save +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + /* Save kernel ptrauth keys. */ + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_save_state x18, x19, x20 + + /* Use hyp keys. */ + adr_this_cpu x18, kvm_hyp_ctxt, x19 + add x18, x18, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 + isb +alternative_else_nop_endif +__skip_pauth_save: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + bl handle_trap - /* Restore host regs x0-x17 */ __host_enter_restore_full: + /* Restore kernel keys. */ +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_restore +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 +alternative_else_nop_endif +__skip_pauth_restore: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + /* Restore host regs x0-x17 */ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index a6d67c2bb5ae..90fade1b032e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -83,9 +83,6 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START_LOCAL(___kvm_hyp_init) - ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] - msr tpidr_el2, x1 - ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] mov sp, x1 @@ -95,6 +92,22 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) ldr x1, [x0, #NVHE_INIT_HCR_EL2] msr hcr_el2, x1 + mov x2, #HCR_E2H + and x2, x1, x2 + cbz x2, 1f + + // hVHE: Replay the EL2 setup to account for the E2H bit + // TPIDR_EL2 is used to preserve x0 across the macro maze... + isb + msr tpidr_el2, x0 + init_el2_state + finalise_el2_state + mrs x0, tpidr_el2 + +1: + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x1 + ldr x1, [x0, #NVHE_INIT_VTTBR] msr vttbr_el2, x1 @@ -128,6 +141,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) orr x0, x0, x1 alternative_else_nop_endif + +#ifdef CONFIG_ARM64_BTI_KERNEL +alternative_if ARM64_BTI + orr x0, x0, #SCTLR_EL2_BT +alternative_else_nop_endif +#endif /* CONFIG_ARM64_BTI_KERNEL */ + msr sctlr_el2, x0 isb @@ -184,6 +204,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 finalise_el2_state + __init_el2_nvhe_prepare_eret /* Enable MMU, set vectors and stack. */ mov x0, x28 @@ -196,6 +217,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) SYM_CODE_END(__kvm_hyp_init_cpu) SYM_CODE_START(__kvm_handle_stub_hvc) + /* + * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so + * we need bti j at beginning. + */ + bti j cmp x0, #HVC_SOFT_RESTART b.ne 1f diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..a169c619db60 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -13,6 +13,7 @@ #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <nvhe/ffa.h> #include <nvhe/mem_protect.h> #include <nvhe/mm.h> #include <nvhe/pkvm.h> @@ -125,6 +126,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); } +static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); +} + static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -315,6 +325,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), @@ -374,6 +385,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) handled = kvm_host_psci_handler(host_ctxt); if (!handled) + handled = kvm_host_ffa_handler(host_ctxt); + if (!handled) default_host_smc_handler(host_ctxt); /* SMC was trapped, move ELR past the current PC. */ @@ -392,7 +405,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_smc(host_ctxt); break; case ESR_ELx_EC_SVE: - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); + if (has_hvhe()) + sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | + CPACR_EL1_ZEN_EL0EN)); + else + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); isb(); sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); break; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a8813b212996..9d703441278b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_removed_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, u32 level) { - kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } static int prepare_s2_pool(void *pgt_pool_base) @@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base) host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, - .free_removed_table = host_s2_free_removed_table, + .free_unlinked_table = host_s2_free_unlinked_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, @@ -842,6 +842,13 @@ static int check_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We only check the host; the secure side will check the other + * end when we forward the FFA call. + */ + ret = 0; + break; default: ret = -EINVAL; } @@ -870,6 +877,13 @@ static int __do_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We're not responsible for any secure page-tables, so there's + * nothing to do here. + */ + ret = 0; + break; default: ret = -EINVAL; } @@ -918,6 +932,10 @@ static int check_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See check_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -946,6 +964,10 @@ static int __do_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See __do_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1235,3 +1257,49 @@ void hyp_unpin_shared_mem(void *from, void *to) hyp_unlock_component(); host_unlock_component(); } + +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_share(&share); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_unshare(&share); + host_unlock_component(); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index a06ece14a6d8..8033ef353a5d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -27,6 +27,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) u64 hcr_set = HCR_RW; u64 hcr_clear = 0; u64 cptr_set = 0; + u64 cptr_clear = 0; /* Protected KVM does not support AArch32 guests. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), @@ -43,6 +44,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), PVM_ID_AA64PFR0_ALLOW)); + if (has_hvhe()) + hcr_set |= HCR_E2H; + /* Trap RAS unless all current versions are supported */ if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < ID_AA64PFR0_EL1_RAS_V1P1) { @@ -57,12 +61,17 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) } /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) - cptr_set |= CPTR_EL2_TZ; + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { + if (has_hvhe()) + cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + else + cptr_set |= CPTR_EL2_TZ; + } vcpu->arch.hcr_el2 |= hcr_set; vcpu->arch.hcr_el2 &= ~hcr_clear; vcpu->arch.cptr_el2 |= cptr_set; + vcpu->arch.cptr_el2 &= ~cptr_clear; } /* @@ -120,8 +129,12 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) mdcr_set |= MDCR_EL2_TTRF; /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) - cptr_set |= CPTR_EL2_TTA; + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { + if (has_hvhe()) + cptr_set |= CPACR_EL1_TTA; + else + cptr_set |= CPTR_EL2_TTA; + } vcpu->arch.mdcr_el2 |= mdcr_set; vcpu->arch.mdcr_el2 &= ~mdcr_clear; @@ -176,8 +189,10 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) /* Clear res0 and set res1 bits to trap potential new features. */ vcpu->arch.hcr_el2 &= ~(HCR_RES0); vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + if (!has_hvhe()) { + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + } } /* diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 110f04627785..bb98630dfeaf 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -11,6 +11,7 @@ #include <asm/kvm_pkvm.h> #include <nvhe/early_alloc.h> +#include <nvhe/ffa.h> #include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> @@ -28,6 +29,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -57,6 +59,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!host_s2_pgt_base) return -ENOMEM; + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + return 0; } @@ -314,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = hyp_ffa_init(ffa_proxy_pages); + if (ret) + goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 77791495c995..0a6271052def 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -44,13 +44,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TTA | CPTR_EL2_TAM; + val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; + if (cpus_have_final_cap(ARM64_SME)) { + if (has_hvhe()) + val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); + else + val |= CPTR_EL2_TSM; + } + if (!guest_owns_fp_regs(vcpu)) { - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + if (has_hvhe()) + val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | + CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); + else + val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + __activate_traps_fpsimd32(vcpu); } - if (cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; write_sysreg(val, cptr_el2); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); @@ -73,7 +84,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 cptr; ___deactivate_traps(vcpu); @@ -98,13 +108,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) - cptr |= CPTR_EL2_TZ; - if (cpus_have_final_cap(ARM64_SME)) - cptr &= ~CPTR_EL2_TSM; - - write_sysreg(cptr, cptr_el2); + kvm_reset_cptr_el2(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index b185ac0dbd47..3aaab20ae5b4 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -17,21 +17,24 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val, shift = 0; + + if (has_hvhe()) + shift = 10; /* Allow physical timer/counter access for the host */ val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; + val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; write_sysreg(val, cnthctl_el2); } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_enable_traps(struct kvm_vcpu *vcpu) @@ -50,5 +53,10 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) else clr |= CNTHCTL_EL1PCTEN; + if (has_hvhe()) { + clr <<= 10; + set <<= 10; + } + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 978179133f4b..b9991bbd8e3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt, true); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + /* + * If the host is running at EL1 and we have a VPIPT I-cache, + * then we must perform I-cache maintenance at EL2 in order for + * it to have an effect on the guest. Since the guest cannot hit + * I-cache lines allocated with a different VMID, we don't need + * to worry about junk out of guest reset (we nuke the I-cache on + * VMID rollover), but we do need to be careful when remapping + * executable pages for the same guest. This can happen when KSM + * takes a CoW fault on an executable page, copies the page into + * a page that was previously mapped in the guest and then needs + * to invalidate the guest view of the I-cache for that page + * from EL1. To solve this, we invalidate the entire I-cache when + * unmapping a page from a guest if we have a VPIPT I-cache but + * the host is running at EL1. As above, we could do better if + * we had the VA. + * + * The moral of this story is: if you have a VPIPT I-cache, then + * you should be running with VHE enabled. + */ + if (icache_is_vpipt()) + icache_inval_all_pou(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 95dae02ccc2e..aa740a974e02 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -21,8 +21,10 @@ #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) @@ -34,7 +36,7 @@ #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) @@ -42,6 +44,8 @@ #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) + #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) @@ -63,6 +67,16 @@ struct kvm_pgtable_walk_data { const u64 end; }; +static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); +} + +static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); +} + static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); @@ -386,6 +400,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (device) return -EINVAL; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; } else { attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } @@ -623,10 +640,18 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally - * on all CPUs. The features is RES0 on CPUs without the support - * and must be ignored by the CPUs. + * on all CPUs. In systems that have asymmetric support for the feature + * this allows KVM to leverage hardware support on the subset of cores + * that implement the feature. + * + * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by + * hardware) on implementations that do not advertise support for the + * feature. As such, setting HA unconditionally is safe, unless you + * happen to be running on a design that has unadvertised support for + * HAFDBS. Here be dragons. */ - vtcr |= VTCR_EL2_HA; + if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ /* Set the vmid bits */ @@ -755,14 +780,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; - /* - * Perform the appropriate TLB invalidation based on the evicted pte - * value (if any). - */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); - else if (kvm_pte_valid(ctx->old)) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { + /* + * Perform the appropriate TLB invalidation based on the + * evicted pte value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + else if (kvm_pte_valid(ctx->old)) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, + ctx->addr, ctx->level); + } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); @@ -869,11 +897,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && + stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); + granule); - if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && + stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); @@ -895,7 +925,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (ret) return ret; - mm_ops->free_removed_table(childp, ctx->level); + mm_ops->free_unlinked_table(childp, ctx->level); return 0; } @@ -940,7 +970,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls - * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. @@ -1209,7 +1239,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); if (!ret) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } @@ -1242,6 +1272,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, u32 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte) +{ + struct stage2_map_data map_data = { + .phys = phys, + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = force_pte, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | + KVM_PGTABLE_WALK_SKIP_CMO, + .arg = &map_data, + }; + /* + * The input address (.addr) is irrelevant for walking an + * unlinked table. Construct an ambiguous IA range to map + * kvm_granule_size(level) worth of memory. + */ + struct kvm_pgtable_walk_data data = { + .walker = &walker, + .addr = 0, + .end = kvm_granule_size(level), + }; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t *pgtable; + int ret; + + if (!IS_ALIGNED(phys, kvm_granule_size(level))) + return ERR_PTR(-EINVAL); + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ERR_PTR(ret); + + pgtable = mm_ops->zalloc_page(mc); + if (!pgtable) + return ERR_PTR(-ENOMEM); + + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, + level + 1); + if (ret) { + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); + mm_ops->put_page(pgtable); + return ERR_PTR(ret); + } + + return pgtable; +} + +/* + * Get the number of page-tables needed to replace a block with a + * fully populated tree up to the PTE entries. Note that @level is + * interpreted as in "level @level entry". + */ +static int stage2_block_get_nr_page_tables(u32 level) +{ + switch (level) { + case 1: + return PTRS_PER_PTE + 1; + case 2: + return 1; + case 3: + return 0; + default: + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || + level >= KVM_PGTABLE_MAX_LEVELS); + return -EINVAL; + }; +} + +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + struct kvm_mmu_memory_cache *mc = ctx->arg; + struct kvm_s2_mmu *mmu; + kvm_pte_t pte = ctx->old, new, *childp; + enum kvm_pgtable_prot prot; + u32 level = ctx->level; + bool force_pte; + int nr_pages; + u64 phys; + + /* No huge-pages exist at the last level */ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return 0; + + /* We only split valid block mappings */ + if (!kvm_pte_valid(pte)) + return 0; + + nr_pages = stage2_block_get_nr_page_tables(level); + if (nr_pages < 0) + return nr_pages; + + if (mc->nobjs >= nr_pages) { + /* Build a tree mapped down to the PTE granularity. */ + force_pte = true; + } else { + /* + * Don't force PTEs, so create_unlinked() below does + * not populate the tree up to the PTE level. The + * consequence is that the call will require a single + * page of level 2 entries at level 1, or a single + * page of PTEs at level 2. If we are at level 1, the + * PTEs will be created recursively. + */ + force_pte = false; + nr_pages = 1; + } + + if (mc->nobjs < nr_pages) + return -ENOMEM; + + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); + phys = kvm_pte_to_phys(pte); + prot = kvm_pgtable_stage2_pte_prot(pte); + + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, + level, prot, mc, force_pte); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + if (!stage2_try_break_pte(ctx, mmu)) { + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); + mm_ops->put_page(childp); + return -EAGAIN; + } + + /* + * Note, the contents of the page table are guaranteed to be made + * visible before the new PTE is assigned because stage2_make_pte() + * writes the PTE using smp_store_release(). + */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + dsb(ishst); + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_split_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = mc, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, @@ -1311,7 +1497,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b37e7c96efea..6537f58b1a8c 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -84,7 +84,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); + kvm_reset_cptr_el2(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 24cef9b87f9e..e69da550cdc5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(nshst); + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; |