summaryrefslogtreecommitdiffstats
path: root/arch/arm64/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r--arch/arm64/kvm/mmu.c452
1 files changed, 312 insertions, 140 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index dc04bc767865..754f2fe0cc67 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -15,6 +15,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
+#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -29,8 +30,12 @@ static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;
+u32 __ro_after_init __hyp_va_bits;
+
static unsigned long __ro_after_init io_map_base;
+#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn)
+
static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
phys_addr_t size)
{
@@ -147,7 +152,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
return -EINVAL;
next = __stage2_range_addr_end(addr, end, chunk_size);
- ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
if (ret)
break;
} while (addr = next, addr != end);
@@ -168,15 +173,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
*/
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
- kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
return 0;
}
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
gfn_t gfn, u64 nr_pages)
{
- kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
- gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+ u64 size = nr_pages << PAGE_SHIFT;
+ u64 addr = gfn << PAGE_SHIFT;
+
+ if (is_protected_kvm_enabled())
+ kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
+ else
+ kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
return 0;
}
@@ -225,7 +238,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
void *pgtable = page_to_virt(page);
s8 level = page_private(page);
- kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
+ KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}
static void stage2_free_unlinked_table(void *addr, s8 level)
@@ -324,13 +337,19 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
may_block));
}
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
+ u64 size, bool may_block)
{
- __unmap_stage2_range(mmu, start, size, true);
+ __unmap_stage2_range(mmu, start, size, may_block);
+}
+
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+{
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}
static void stage2_flush_memslot(struct kvm *kvm,
@@ -339,7 +358,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
+ kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}
/**
@@ -362,6 +381,8 @@ static void stage2_flush_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_flush_memslot(kvm, memslot);
+ kvm_nested_s2_flush(kvm);
+
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -696,10 +717,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
mutex_lock(&kvm_hyp_pgd_mutex);
/*
- * Efficient stack verification using the PAGE_SHIFT bit implies
+ * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
* an alignment of our allocation on the order of the size.
*/
- size = PAGE_SIZE * 2;
+ size = NVHE_STACK_SIZE * 2;
base = ALIGN_DOWN(io_map_base - size, size);
ret = __hyp_alloc_private_va_range(base);
@@ -716,12 +737,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
* at the higher address and leave the lower guard page
* unbacked.
*
- * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
* and addresses corresponding to the guard page have the
- * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
*/
- ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
- PAGE_HYP);
+ ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
+ phys_addr, PAGE_HYP);
if (ret)
kvm_err("Cannot map hyp stack\n");
@@ -855,21 +876,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.icache_inval_pou = invalidate_icache_guest_page,
};
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm: The pointer to the KVM structure
- * @mmu: The pointer to the s2 MMU structure
- * @type: The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
u32 kvm_ipa_limit = get_kvm_ipa_limit();
- int cpu, err;
- struct kvm_pgtable *pgt;
u64 mmfr0, mmfr1;
u32 phys_shift;
@@ -896,20 +905,64 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
+ return 0;
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm: The pointer to the KVM structure
+ * @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ * guests, and the caller must hold kvm->lock as this is called on a
+ * per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+ int cpu, err;
+ struct kvm_pgtable *pgt;
+
+ /*
+ * If we already have our page tables in place, and that the
+ * MMU context is the canonical one, we have a bug somewhere,
+ * as this is only supposed to ever happen once per VM.
+ *
+ * Otherwise, we're building nested page tables, and that's
+ * probably because userspace called KVM_ARM_VCPU_INIT more
+ * than once on the same vcpu. Since that's actually legal,
+ * don't kick a fuss and leave gracefully.
+ */
if (mmu->pgt != NULL) {
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ return 0;
+
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
+ err = kvm_init_ipa_range(mmu, type);
+ if (err)
+ return err;
+
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
if (!pgt)
return -ENOMEM;
mmu->arch = &kvm->arch;
- err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
+ err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
if (err)
goto out_free_pgtable;
+ mmu->pgt = pgt;
+ if (is_protected_kvm_enabled())
+ return 0;
+
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
err = -ENOMEM;
@@ -923,12 +976,15 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
mmu->split_page_cache.gfp_zero = __GFP_ZERO;
- mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
+
+ if (kvm_is_nested_s2_mmu(kvm, mmu))
+ kvm_init_nested_s2_mmu(mmu);
+
return 0;
out_destroy_pgtable:
- kvm_pgtable_stage2_destroy(pgt);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@@ -976,7 +1032,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
if (!(vma->vm_flags & VM_PFNMAP)) {
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
- unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
}
hva = vm_end;
} while (hva < reg_end);
@@ -1003,6 +1059,8 @@ void stage2_unmap_vm(struct kvm *kvm)
kvm_for_each_memslot(memslot, bkt, slots)
stage2_unmap_memslot(kvm, memslot);
+ kvm_nested_s2_unmap(kvm, true);
+
write_unlock(&kvm->mmu_lock);
mmap_read_unlock(current->mm);
srcu_read_unlock(&kvm->srcu, idx);
@@ -1023,26 +1081,40 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
- kvm_pgtable_stage2_destroy(pgt);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
kfree(pgt);
}
}
-static void hyp_mc_free_fn(void *addr, void *unused)
+static void hyp_mc_free_fn(void *addr, void *mc)
{
+ struct kvm_hyp_memcache *memcache = mc;
+
+ if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, -1);
+
free_page((unsigned long)addr);
}
-static void *hyp_mc_alloc_fn(void *unused)
+static void *hyp_mc_alloc_fn(void *mc)
{
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ struct kvm_hyp_memcache *memcache = mc;
+ void *addr;
+
+ addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
+ kvm_account_pgtable_pages(addr, 1);
+
+ return addr;
}
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
- if (is_protected_kvm_enabled())
- __free_hyp_memcache(mc, hyp_mc_free_fn,
- kvm_host_va, NULL);
+ if (!is_protected_kvm_enabled())
+ return;
+
+ kfree(mc->mapping);
+ __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
@@ -1050,8 +1122,14 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
if (!is_protected_kvm_enabled())
return 0;
+ if (!mc->mapping) {
+ mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
+ if (!mc->mapping)
+ return -ENOMEM;
+ }
+
return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
- kvm_host_pa, NULL);
+ kvm_host_pa, mc);
}
/**
@@ -1088,8 +1166,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
break;
write_lock(&kvm->mmu_lock);
- ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache, 0);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
+ pa, prot, &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -1102,14 +1180,14 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
}
/**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
* @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}
/**
@@ -1138,7 +1216,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_nested_s2_wp(kvm);
write_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
@@ -1192,7 +1271,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
lockdep_assert_held_write(&kvm->mmu_lock);
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
/*
* Eager-splitting is done when manual-protect is set. We
@@ -1204,6 +1283,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
*/
if (kvm_dirty_log_manual_protect_and_init_set(kvm))
kvm_mmu_split_huge_pages(kvm, start, end);
+
+ kvm_nested_s2_wp(kvm);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1357,10 +1438,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
if (!kvm_has_mte(kvm))
return;
+ if (folio_test_hugetlb(folio)) {
+ /* Hugetlb has MTE flags set on head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr_pages; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+ return;
+ }
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
@@ -1375,6 +1467,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_s2_trans *nested,
struct kvm_memory_slot *memslot, unsigned long hva,
bool fault_is_perm)
{
@@ -1383,16 +1476,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool exec_fault, mte_allowed;
bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ void *memcache;
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
+ struct page *page;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
if (fault_is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1412,8 +1508,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* and a write fault needs to collapse a block entry into a table.
*/
if (!fault_is_perm || (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
+ int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+ if (!is_protected_kvm_enabled()) {
+ memcache = &vcpu->arch.mmu_page_cache;
+ ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
+ } else {
+ memcache = &vcpu->arch.pkvm_memcache;
+ ret = topup_hyp_memcache(memcache, min_pages);
+ }
if (ret)
return ret;
}
@@ -1434,7 +1537,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* logging_active is guaranteed to never be true for VM_PFNMAP
* memslots.
*/
- if (logging_active) {
+ if (logging_active || is_protected_kvm_enabled()) {
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {
@@ -1466,10 +1569,45 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * Be careful that if the mapping size falls between
+ * two host sizes, take the smallest of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min(vma_pagesize, (long)max_map_size);
+ }
+
+ /*
+ * Both the canonical IPA and fault IPA must be hugepage-aligned to
+ * ensure we find the right PFN and lay down the mapping in the right
+ * place.
+ */
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
fault_ipa &= ~(vma_pagesize - 1);
+ ipa &= ~(vma_pagesize - 1);
+ }
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1479,7 +1617,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/*
* Read mmu_invalidate_seq so that KVM can detect if the results of
- * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+ * vma_lookup() or __kvm_faultin_pfn() become stale prior to
* acquiring kvm->mmu_lock.
*
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
@@ -1488,8 +1626,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
- write_fault, &writable, NULL);
+ pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+ &writable, &page);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
return 0;
@@ -1502,7 +1640,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If the page was identified as device early by looking at
* the VMA flags, vma_pagesize is already representing the
* largest quantity we can map. If instead it was mapped
- * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+ * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
* and must not be upgraded.
*
* In both cases, we don't let transparent_hugepage_adjust()
@@ -1520,10 +1658,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && device)
return -ENOEXEC;
- read_lock(&kvm->mmu_lock);
+ /*
+ * Potentially reduce shadow S2 permissions to match the guest's own
+ * S2. For exec faults, we'd only reach this point if the guest
+ * actually allowed it (see kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits
+ * of the leaf entry as a proxy for the span of that translation.
+ * This will be retrieved on TLB invalidation from the guest and
+ * used to limit the invalidation scope if a TTL hint or a range
+ * isn't provided.
+ */
+ if (nested) {
+ writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_readable(nested))
+ prot &= ~KVM_PGTABLE_PROT_R;
+
+ prot |= kvm_encode_nested_level(nested);
+ }
+
+ kvm_fault_lock(kvm);
pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq))
+ if (mmu_invalidate_retry(kvm, mmu_seq)) {
+ ret = -EAGAIN;
goto out_unlock;
+ }
/*
* If we are not forced to use page mapping, check if we are
@@ -1564,7 +1723,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+ (!nested || kvm_s2_trans_executable(nested))) {
prot |= KVM_PGTABLE_PROT_X;
}
@@ -1573,42 +1733,42 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule)
- ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- else
- ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ if (fault_is_perm && vma_pagesize == fault_granule) {
+ /*
+ * Drop the SW bits in favour of those stored in the
+ * PTE, which will be preserved.
+ */
+ prot &= ~KVM_NV_GUEST_MAP_SZ;
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ } else {
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache,
- KVM_PGTABLE_WALK_HANDLE_FAULT |
- KVM_PGTABLE_WALK_SHARED);
+ memcache, flags);
+ }
+
+out_unlock:
+ kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_fault_unlock(kvm);
/* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret) {
- kvm_set_pfn_dirty(pfn);
+ if (writable && !ret)
mark_page_dirty_in_slot(kvm, memslot, gfn);
- }
-out_unlock:
- read_unlock(&kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
}
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- kvm_pte_t pte;
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+ KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
read_unlock(&vcpu->kvm->mmu_lock);
-
- if (kvm_pte_valid(pte))
- kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}
/**
@@ -1624,17 +1784,38 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
+ struct kvm_s2_trans nested_trans, *nested = NULL;
unsigned long esr;
- phys_addr_t fault_ipa;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
gfn_t gfn;
int ret, idx;
+ /* Synchronous External Abort? */
+ if (kvm_vcpu_abt_issea(vcpu)) {
+ /*
+ * For RAS the host kernel may handle this abort.
+ * There is no need to pass the error into the guest.
+ */
+ if (kvm_handle_guest_sea())
+ kvm_inject_vabt(vcpu);
+
+ return 1;
+ }
+
esr = kvm_vcpu_get_esr(vcpu);
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ /*
+ * The fault IPA should be reliable at this point as we're not dealing
+ * with an SEA.
+ */
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
+ return -EFAULT;
+
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (esr_fsc_is_translation_fault(esr)) {
@@ -1645,7 +1826,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
/* Falls between the IPA range and the PARange? */
- if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
+ if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
if (is_iabt)
@@ -1656,18 +1837,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
}
- /* Synchronous External Abort? */
- if (kvm_vcpu_abt_issea(vcpu)) {
- /*
- * For RAS the host kernel may handle this abort.
- * There is no need to pass the error into the guest.
- */
- if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
- kvm_inject_vabt(vcpu);
-
- return 1;
- }
-
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
@@ -1684,7 +1853,42 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ *
+ * If there are no shadow S2 PTs because S2 is disabled, there is
+ * nothing to walk and we treat it as a 1:1 before going through the
+ * canonical translation.
+ */
+ if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1728,13 +1932,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
@@ -1742,7 +1946,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva,
+ ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
@@ -1765,40 +1969,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
(range->end - range->start) << PAGE_SHIFT,
range->may_block);
- return false;
-}
-
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
- if (!kvm->arch.mmu.pgt)
- return false;
-
- WARN_ON(range->end - range->start != 1);
-
- /*
- * If the page isn't tagged, defer to user_mem_abort() for sanitising
- * the MTE tags. The S2 pte should have been unmapped by
- * mmu_notifier_invalidate_range_end().
- */
- if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
- return false;
-
- /*
- * We've moved a page around, probably through CoW, so let's treat
- * it just like a translation fault and the map handler will clean
- * the cache to the PoC.
- *
- * The MMU notifiers will have unmapped a huge PMD before calling
- * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
- * therefore we never need to clear out a huge PMD through this
- * calling path and a memcache is not required.
- */
- kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
- PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL, 0);
-
+ kvm_nested_s2_unmap(kvm, range->may_block);
return false;
}
@@ -1809,9 +1980,13 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT,
size, true);
+ /*
+ * TODO: Handle nested_mmu structures here using the reverse mapping in
+ * a later version of patch series.
+ */
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1821,7 +1996,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
range->start << PAGE_SHIFT,
size, false);
}
@@ -1931,6 +2106,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits)
goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
+ __hyp_va_bits = *hyp_va_bits;
return 0;
out_destroy_pgtable:
@@ -2054,11 +2230,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_uninit_stage2_mmu(kvm);
-}
-
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
@@ -2066,7 +2237,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
phys_addr_t size = slot->npages << PAGE_SHIFT;
write_lock(&kvm->mmu_lock);
- unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+ kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
+ kvm_nested_s2_unmap(kvm, true);
write_unlock(&kvm->mmu_lock);
}