From c662f77331c98018ed256501557b4dd67133fbd7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Feb 2018 15:16:01 +1100 Subject: KVM: PPC: Fix compile error that occurs when CONFIG_ALTIVEC=n Commit accb757d798c ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run", 2017-12-04) added a "goto out" statement and an "out:" label to kvm_arch_vcpu_ioctl_run(). Since the only "goto out" is inside a CONFIG_VSX block, compiling with CONFIG_VSX=n gives a warning that label "out" is defined but not used, and because arch/powerpc is compiled with -Werror, that becomes a compile error that makes the kernel build fail. Merge commit 1ab03c072feb ("Merge tag 'kvm-ppc-next-4.16-2' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc", 2018-02-09) added a similar block of code inside a #ifdef CONFIG_ALTIVEC, with a "goto out" statement. In order to make the build succeed, this adds a #ifdef around the "out:" label. This is a minimal, ugly fix, to be replaced later by a refactoring of the code. Since CONFIG_VSX depends on CONFIG_ALTIVEC, it is sufficient to use #ifdef CONFIG_ALTIVEC here. Fixes: accb757d798c ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run") Reported-by: Christian Zigotzky Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 403e642c78f5..0083142c2f84 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1608,7 +1608,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +#ifdef CONFIG_ALTIVEC out: +#endif vcpu_put(vcpu); return r; } -- cgit v1.2.3 From 6df3877fc962c2bb3d0438633dfd24a185af6838 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Feb 2018 15:45:21 +1100 Subject: KVM: PPC: Book3S: Fix compile error that occurs with some gcc versions Some versions of gcc generate a warning that the variable "emulated" may be used uninitialized in function kvmppc_handle_load128_by2x64(). It would be used uninitialized if kvmppc_handle_load128_by2x64 was ever called with vcpu->arch.mmio_vmx_copy_nums == 0, but neither of the callers ever do that, so there is no actual bug. When gcc generates a warning, it causes the build to fail because arch/powerpc is compiled with -Werror. This silences the warning by initializing "emulated" to EMULATE_DONE. Fixes: 09f984961c13 ("KVM: PPC: Book3S: Add MMIO emulation for VMX instructions") Reported-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0083142c2f84..52c205373986 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1345,7 +1345,7 @@ static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, int kvmppc_handle_load128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian) { - enum emulation_result emulated; + enum emulation_result emulated = EMULATE_DONE; while (vcpu->arch.mmio_vmx_copy_nums) { emulated = __kvmppc_handle_load(run, vcpu, rt, 8, -- cgit v1.2.3 From c3856aeb29402e94ad9b3879030165cc6a4fdc56 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 23 Feb 2018 21:21:12 +1100 Subject: KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler This fixes several bugs in the radix page fault handler relating to the way large pages in the memory backing the guest were handled. First, the check for large pages only checked for explicit huge pages and missed transparent huge pages. Then the check that the addresses (host virtual vs. guest physical) had appropriate alignment was wrong, meaning that the code never put a large page in the partition scoped radix tree; it was always demoted to a small page. Fixing this exposed bugs in kvmppc_create_pte(). We were never invalidating a 2MB PTE, which meant that if a page was initially faulted in without write permission and the guest then attempted to store to it, we would never update the PTE to have write permission. If we find a valid 2MB PTE in the PMD, we need to clear it and do a TLB invalidation before installing either the new 2MB PTE or a pointer to a page table page. This also corrects an assumption that get_user_pages_fast would set the _PAGE_DIRTY bit if we are writing, which is not true. Instead we mark the page dirty explicitly with set_page_dirty_lock(). This also means we don't need the dirty bit set on the host PTE when providing write access on a read fault. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 69 +++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 0c854816e653..5cb4e4687107 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -195,6 +195,12 @@ static void kvmppc_pte_free(pte_t *ptep) kmem_cache_free(kvm_pte_cache, ptep); } +/* Like pmd_huge() and pmd_large(), but works regardless of config options */ +static inline int pmd_is_leaf(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, unsigned int level, unsigned long mmu_seq) { @@ -219,7 +225,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, else new_pmd = pmd_alloc_one(kvm->mm, gpa); - if (level == 0 && !(pmd && pmd_present(*pmd))) + if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) new_ptep = kvmppc_pte_alloc(); /* Check if we might have been invalidated; let the guest retry if so */ @@ -244,12 +250,30 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pmd = NULL; } pmd = pmd_offset(pud, gpa); - if (pmd_large(*pmd)) { - /* Someone else has instantiated a large page here; retry */ - ret = -EAGAIN; - goto out_unlock; - } - if (level == 1 && !pmd_none(*pmd)) { + if (pmd_is_leaf(*pmd)) { + unsigned long lgpa = gpa & PMD_MASK; + + /* + * If we raced with another CPU which has just put + * a 2MB pte in after we saw a pte page, try again. + */ + if (level == 0 && !new_ptep) { + ret = -EAGAIN; + goto out_unlock; + } + /* Valid 2MB page here already, remove it */ + old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), + ~0UL, 0, lgpa, PMD_SHIFT); + kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT); + if (old & _PAGE_DIRTY) { + unsigned long gfn = lgpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, + gfn, PMD_SIZE); + } + } else if (level == 1 && !pmd_none(*pmd)) { /* * There's a page table page here, but we wanted * to install a large page. Tell the caller and let @@ -412,28 +436,24 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { page = pages[0]; pfn = page_to_pfn(page); - if (PageHuge(page)) { - page = compound_head(page); - pte_size <<= compound_order(page); + if (PageCompound(page)) { + pte_size <<= compound_order(compound_head(page)); /* See if we can insert a 2MB large-page PTE here */ if (pte_size >= PMD_SIZE && - (gpa & PMD_MASK & PAGE_MASK) == - (hva & PMD_MASK & PAGE_MASK)) { + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { level = 1; pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); } } /* See if we can provide write access */ if (writing) { - /* - * We assume gup_fast has set dirty on the host PTE. - */ pgflags |= _PAGE_WRITE; } else { local_irq_save(flags); ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); - if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + if (ptep && pte_write(*ptep)) pgflags |= _PAGE_WRITE; local_irq_restore(flags); } @@ -459,18 +479,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte = pfn_pte(pfn, __pgprot(pgflags)); ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); } - if (ret == 0 || ret == -EAGAIN) - ret = RESUME_GUEST; if (page) { - /* - * We drop pages[0] here, not page because page might - * have been set to the head page of a compound, but - * we have to drop the reference on the correct tail - * page to match the get inside gup() - */ - put_page(pages[0]); + if (!ret && (pgflags & _PAGE_WRITE)) + set_page_dirty_lock(page); + put_page(page); } + + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; return ret; } @@ -644,7 +661,7 @@ void kvmppc_free_radix(struct kvm *kvm) continue; pmd = pmd_offset(pud, 0); for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { pmd_clear(pmd); continue; } -- cgit v1.2.3 From debd574f4195e205ba505b25e19b2b797f4bcd94 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 2 Mar 2018 15:38:04 +1100 Subject: KVM: PPC: Book3S HV: Fix VRMA initialization with 2MB or 1GB memory backing The current code for initializing the VRMA (virtual real memory area) for HPT guests requires the page size of the backing memory to be one of 4kB, 64kB or 16MB. With a radix host we have the possibility that the backing memory page size can be 2MB or 1GB. In these cases, if the guest switches to HPT mode, KVM will not initialize the VRMA and the guest will fail to run. In fact it is not necessary that the VRMA page size is the same as the backing memory page size; any VRMA page size less than or equal to the backing memory page size is acceptable. Therefore we now choose the largest page size out of the set {4k, 64k, 16M} which is not larger than the backing memory page size. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 89707354c2ef..b4a538b29da5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3656,15 +3656,17 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto up_out; psize = vma_kernel_pagesize(vma); - porder = __ilog2(psize); up_read(¤t->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ - err = -EINVAL; - if (!(psize == 0x1000 || psize == 0x10000 || - psize == 0x1000000)) - goto out_srcu; + if (psize >= 0x1000000) + psize = 0x1000000; + else if (psize >= 0x10000) + psize = 0x10000; + else + psize = 0x1000; + porder = __ilog2(psize); senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | -- cgit v1.2.3 From 61bd0f66ff92d5ce765ff9850fd3cbfec773c560 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Fri, 2 Mar 2018 11:51:56 +0100 Subject: KVM: PPC: Book3S HV: Fix guest time accounting with VIRT_CPU_ACCOUNTING_GEN Since commit 8b24e69fc47e ("KVM: PPC: Book3S HV: Close race with testing for signals on guest entry"), if CONFIG_VIRT_CPU_ACCOUNTING_GEN is set, the guest time is not accounted to guest time and user time, but instead to system time. This is because guest_enter()/guest_exit() are called while interrupts are disabled and the tick counter cannot be updated between them. To fix that, move guest_exit() after local_irq_enable(), and as guest_enter() is called with IRQ disabled, call guest_enter_irqoff() instead. Fixes: 8b24e69fc47e ("KVM: PPC: Book3S HV: Close race with testing for signals on guest entry") Signed-off-by: Laurent Vivier Reviewed-by: Paolo Bonzini Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b4a538b29da5..9cb9448163c4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2885,7 +2885,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ trace_hardirqs_on(); - guest_enter(); + guest_enter_irqoff(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2893,8 +2893,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) srcu_read_unlock(&vc->kvm->srcu, srcu_idx); - guest_exit(); - trace_hardirqs_off(); set_irq_happened(trap); @@ -2937,6 +2935,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_set_host_core(pcpu); local_irq_enable(); + guest_exit(); /* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) { -- cgit v1.2.3 From a8b48a4dccea77e29462e59f1dbf0d5aa1ff167c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Mar 2018 22:17:20 +1100 Subject: KVM: PPC: Book3S HV: Fix trap number return from __kvmppc_vcore_entry This fixes a bug where the trap number that is returned by __kvmppc_vcore_entry gets corrupted. The effect of the corruption is that IPIs get ignored on POWER9 systems when the IPI is sent via a doorbell interrupt to a CPU which is executing in a KVM guest. The effect of the IPI being ignored is often that another CPU locks up inside smp_call_function_many() (and if that CPU is holding a spinlock, other CPUs then lock up inside raw_spin_lock()). The trap number is currently held in register r12 for most of the assembly-language part of the guest exit path. In that path, we call kvmppc_subcore_exit_guest(), which is a C function, without restoring r12 afterwards. Depending on the kernel config and the compiler, it may modify r12 or it may not, so some config/compiler combinations see the bug and others don't. To fix this, we arrange for the trap number to be stored on the stack from the 'guest_bypass:' label until the end of the function, then the trap number is loaded and returned in r12 as before. Cc: stable@vger.kernel.org # v4.8+ Fixes: fd7bacbca47a ("KVM: PPC: Book3S HV: Fix TB corruption in guest exit path on HMI interrupt") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f31f357b8c5a..d33264697a31 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -320,7 +320,6 @@ kvm_novcpu_exit: stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_commence_exit nop - lwz r12, STACK_SLOT_TRAP(r1) b kvmhv_switch_to_host /* @@ -1220,6 +1219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) secondary_too_late: li r12, 0 + stw r12, STACK_SLOT_TRAP(r1) cmpdi r4, 0 beq 11f stw r12, VCPU_TRAP(r4) @@ -1558,12 +1558,12 @@ mc_cont: 3: stw r5,VCPU_SLB_MAX(r9) guest_bypass: + stw r12, STACK_SLOT_TRAP(r1) mr r3, r12 /* Increment exit count, poke other threads to exit */ bl kvmhv_commence_exit nop ld r9, HSTATE_KVM_VCPU(r13) - lwz r12, VCPU_TRAP(r9) /* Stop others sending VCPU interrupts to this physical CPU */ li r0, -1 @@ -1898,6 +1898,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. + * Here STACK_SLOT_TRAP(r1) contains the trap number. */ kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ @@ -1950,12 +1951,12 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* If HMI, call kvmppc_realmode_hmi_handler() */ + lwz r12, STACK_SLOT_TRAP(r1) cmpwi r12, BOOK3S_INTERRUPT_HMI bne 27f bl kvmppc_realmode_hmi_handler nop cmpdi r3, 0 - li r12, BOOK3S_INTERRUPT_HMI /* * At this point kvmppc_realmode_hmi_handler may have resync-ed * the TB, and if it has, we must not subtract the guest timebase @@ -2008,10 +2009,8 @@ BEGIN_FTR_SECTION lwz r8, KVM_SPLIT_DO_RESTORE(r3) cmpwi r8, 0 beq 47f - stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_p9_restore_lpcr nop - lwz r12, STACK_SLOT_TRAP(r1) b 48f 47: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) @@ -2049,6 +2048,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) + lwz r12, STACK_SLOT_TRAP(r1) /* return trap # in r12 */ ld r0, SFS+PPC_LR_STKOFF(r1) addi r1, r1, SFS mtlr r0 -- cgit v1.2.3 From 39c983ea0f96a270d4876c4148e3bb2d9cd3294f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 22 Feb 2018 15:16:54 +1100 Subject: KVM: PPC: Remove unused kvm_unmap_hva callback Since commit fb1522e099f0 ("KVM: update to new mmu_notifier semantic v2", 2017-08-31), the MMU notifier code in KVM no longer calls the kvm_unmap_hva callback. This removes the PPC implementations of kvm_unmap_hva(). Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/include/asm/kvm_ppc.h | 1 - arch/powerpc/kvm/book3s.c | 6 ------ arch/powerpc/kvm/book3s.h | 1 - arch/powerpc/kvm/book3s_64_mmu_hv.c | 9 --------- arch/powerpc/kvm/book3s_64_vio_hv.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 1 - arch/powerpc/kvm/book3s_pr.c | 10 ---------- arch/powerpc/kvm/e500_mmu_host.c | 2 +- arch/powerpc/kvm/trace_pr.h | 15 --------------- 10 files changed, 2 insertions(+), 46 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1f53b562726f..6b69d7999381 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -60,7 +60,6 @@ #define KVM_ARCH_WANT_MMU_NOTIFIER -extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 7765a800ddae..23cfaef9544e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -295,7 +295,6 @@ struct kvmppc_ops { const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, const struct kvm_memory_slot *new); - int (*unmap_hva)(struct kvm *kvm, unsigned long hva); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 234531d1bee1..97d4a112648f 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -819,12 +819,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm->arch.kvm_ops->unmap_hva(kvm, hva); -} -EXPORT_SYMBOL_GPL(kvm_unmap_hva); - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index d2b3ec088b8c..4ad5e287b8bc 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -14,7 +14,6 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); -extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index ef243fed2f2b..a670fa5fbe50 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -877,15 +877,6 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, return 0; } -int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) -{ - hva_handler_fn handler; - - handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; - kvm_handle_hva(kvm, hva, handler); - return 0; -} - int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { hva_handler_fn handler; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index c32e9bfe75b1..6651f736a0b1 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -450,7 +450,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, /* * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.). * While we have the rmap lock, code running on other CPUs * cannot finish unmapping the host real page that backs * this guest real page, so we are OK to access the host diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9cb9448163c4..4863ab81f663 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4351,7 +4351,6 @@ static struct kvmppc_ops kvm_ops_hv = { .flush_memslot = kvmppc_core_flush_memslot_hv, .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, .commit_memory_region = kvmppc_core_commit_memory_region_hv, - .unmap_hva = kvm_unmap_hva_hv, .unmap_hva_range = kvm_unmap_hva_range_hv, .age_hva = kvm_age_hva_hv, .test_age_hva = kvm_test_age_hva_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3ae752314b34..d3f304d06adf 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -277,15 +277,6 @@ static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, } } -static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) -{ - trace_kvm_unmap_hva(hva); - - do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); - - return 0; -} - static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, unsigned long end) { @@ -1773,7 +1764,6 @@ static struct kvmppc_ops kvm_ops_pr = { .flush_memslot = kvmppc_core_flush_memslot_pr, .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, .commit_memory_region = kvmppc_core_commit_memory_region_pr, - .unmap_hva = kvm_unmap_hva_pr, .unmap_hva_range = kvm_unmap_hva_range_pr, .age_hva = kvm_age_hva_pr, .test_age_hva = kvm_test_age_hva_pr, diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 423b21393bc9..c878b4ffb86f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -724,7 +724,7 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, /************* MMU Notifiers *************/ -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +static int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) { trace_kvm_unmap_hva(hva); diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h index 85785a370c0e..2f9a8829552b 100644 --- a/arch/powerpc/kvm/trace_pr.h +++ b/arch/powerpc/kvm/trace_pr.h @@ -254,21 +254,6 @@ TRACE_EVENT(kvm_exit, ) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("unmap hva 0x%lx\n", __entry->hva) -); - #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ -- cgit v1.2.3 From c4c8a7643e74ebd7f2cfa80807562f16bb58c1d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 23 Feb 2018 21:40:49 +1100 Subject: KVM: PPC: Book3S HV: Radix page fault handler optimizations This improves the handling of transparent huge pages in the radix hypervisor page fault handler. Previously, if a small page is faulted in to a 2MB region of guest physical space, that means that there is a page table pointer at the PMD level, which could never be replaced by a leaf (2MB) PMD entry. This adds the code to clear the PMD, invlidate the page walk cache and free the page table page in this situation, so that the leaf PMD entry can be created. This also adds code to check whether a PMD or PTE being inserted is the same as is already there (because of a race with another CPU that faulted on the same page) and if so, we don't replace the existing entry, meaning that we don't invalidate the PTE or PMD and do a TLB invalidation. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 42 ++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 5cb4e4687107..ed62164f8474 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -160,6 +160,17 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, asm volatile("ptesync": : :"memory"); } +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr) +{ + unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */ + + asm volatile("ptesync": : :"memory"); + /* RIC=1 PRS=0 R=1 IS=2 */ + asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1) + : : "r" (rb), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, unsigned long clr, unsigned long set, unsigned long addr, unsigned int shift) @@ -261,6 +272,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ret = -EAGAIN; goto out_unlock; } + /* Check if we raced and someone else has set the same thing */ + if (level == 1 && pmd_raw(*pmd) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } /* Valid 2MB page here already, remove it */ old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), ~0UL, 0, lgpa, PMD_SHIFT); @@ -275,12 +291,13 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, } } else if (level == 1 && !pmd_none(*pmd)) { /* - * There's a page table page here, but we wanted - * to install a large page. Tell the caller and let - * it try installing a normal page if it wants. + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_ptep will be NULL since level == 1. */ - ret = -EBUSY; - goto out_unlock; + new_ptep = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvmppc_radix_flush_pwc(kvm, gpa); } if (level == 0) { if (pmd_none(*pmd)) { @@ -291,6 +308,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, } ptep = pte_offset_kernel(pmd, gpa); if (pte_present(*ptep)) { + /* Check if someone else set the same thing */ + if (pte_raw(*ptep) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } /* PTE was previously valid, so invalidate it */ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, gpa, 0); @@ -469,16 +491,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Allocate space in the tree and write the PTE */ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); - if (ret == -EBUSY) { - /* - * There's already a PMD where wanted to install a large page; - * for now, fall back to installing a small page. - */ - level = 0; - pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); - pte = pfn_pte(pfn, __pgprot(pgflags)); - ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); - } if (page) { if (!ret && (pgflags & _PAGE_WRITE)) -- cgit v1.2.3 From f7caf712d885713986baeac86b1b64bcbd9dcd91 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 24 Feb 2018 20:08:51 +1100 Subject: KVM: PPC: Book3S HV: Streamline setting of reference and change bits When using the radix MMU, we can get hypervisor page fault interrupts with the DSISR_SET_RC bit set in DSISR/HSRR1, indicating that an attempt to set the R (reference) or C (change) bit in a PTE atomically failed. Previously we would find the corresponding Linux PTE and check the permission and dirty bits there, but this is not really necessary since we only need to do what the hardware was trying to do, namely set R or C atomically. This removes the code that reads the Linux PTE and just update the partition-scoped PTE, having first checked that it is still present, and if the access is a write, that the PTE still has write permission. Furthermore, we now check whether any other relevant bits are set in DSISR, and if there are, then we proceed with the rest of the function in order to handle whatever condition they represent, instead of returning to the guest as we did previously. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 52 +++++++++++++--------------------- 1 file changed, 19 insertions(+), 33 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index ed62164f8474..f783b067e5ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -345,7 +345,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; struct page *page = NULL, *pages[1]; - long ret, npages, ok; + long ret, npages; unsigned int writing; struct vm_area_struct *vma; unsigned long flags; @@ -397,43 +397,29 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (dsisr & DSISR_SET_RC) { /* * Need to set an R or C bit in the 2nd-level tables; - * if the relevant bits aren't already set in the linux - * page tables, fall through to do the gup_fast to - * set them in the linux page tables too. + * since we are just helping out the hardware here, + * it is sufficient to do what the hardware does. */ - ok = 0; pgflags = _PAGE_ACCESSED; if (writing) pgflags |= _PAGE_DIRTY; - local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); - if (ptep) { - pte = READ_ONCE(*ptep); - if (pte_present(pte) && - (pte_val(pte) & pgflags) == pgflags) - ok = 1; - } - local_irq_restore(flags); - if (ok) { - spin_lock(&kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { - spin_unlock(&kvm->mmu_lock); - return RESUME_GUEST; - } - /* - * We are walking the secondary page table here. We can do this - * without disabling irq. - */ - ptep = __find_linux_pte(kvm->arch.pgtable, - gpa, NULL, &shift); - if (ptep && pte_present(*ptep)) { - kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, - gpa, shift); - spin_unlock(&kvm->mmu_lock); - return RESUME_GUEST; - } - spin_unlock(&kvm->mmu_lock); + /* + * We are walking the secondary page table here. We can do this + * without disabling irq. + */ + spin_lock(&kvm->mmu_lock); + ptep = __find_linux_pte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep) && + (!writing || pte_write(*ptep))) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + dsisr &= ~DSISR_SET_RC; } + spin_unlock(&kvm->mmu_lock); + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT | DSISR_SET_RC))) + return RESUME_GUEST; } ret = -EFAULT; -- cgit v1.2.3 From 58c5c276b4c2ceb2b02ecd959ad9784b997d4332 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 24 Feb 2018 20:14:37 +1100 Subject: KVM: PPC: Book3S HV: Handle 1GB pages in radix page fault handler This adds code to the radix hypervisor page fault handler to handle the case where the guest memory is backed by 1GB hugepages, and put them into the partition-scoped radix tree at the PUD level. The code is essentially analogous to the code for 2MB pages. This also rearranges kvmppc_create_pte() to make it easier to follow. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 129 ++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 36 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index f783b067e5ac..05acc67e0eb2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -150,7 +150,9 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, { int psize = MMU_BASE_PSIZE; - if (pshift >= PMD_SHIFT) + if (pshift >= PUD_SHIFT) + psize = MMU_PAGE_1G; + else if (pshift >= PMD_SHIFT) psize = MMU_PAGE_2M; addr &= ~0xfffUL; addr |= mmu_psize_defs[psize].ap << 5; @@ -231,9 +233,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pud = pud_alloc_one(kvm->mm, gpa); pmd = NULL; - if (pud && pud_present(*pud)) + if (pud && pud_present(*pud) && !pud_huge(*pud)) pmd = pmd_offset(pud, gpa); - else + else if (level <= 1) new_pmd = pmd_alloc_one(kvm->mm, gpa); if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) @@ -254,6 +256,50 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pud = NULL; } pud = pud_offset(pgd, gpa); + if (pud_huge(*pud)) { + unsigned long hgpa = gpa & PUD_MASK; + + /* + * If we raced with another CPU which has just put + * a 1GB pte in after we saw a pmd page, try again. + */ + if (level <= 1 && !new_pmd) { + ret = -EAGAIN; + goto out_unlock; + } + /* Check if we raced and someone else has set the same thing */ + if (level == 2 && pud_raw(*pud) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } + /* Valid 1GB page here already, remove it */ + old = kvmppc_radix_update_pte(kvm, (pte_t *)pud, + ~0UL, 0, hgpa, PUD_SHIFT); + kvmppc_radix_tlbie_page(kvm, hgpa, PUD_SHIFT); + if (old & _PAGE_DIRTY) { + unsigned long gfn = hgpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, + gfn, PUD_SIZE); + } + } + if (level == 2) { + if (!pud_none(*pud)) { + /* + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_pmd will be NULL since level == 2. + */ + new_pmd = pmd_offset(pud, 0); + pud_clear(pud); + kvmppc_radix_flush_pwc(kvm, gpa); + } + kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); + ret = 0; + goto out_unlock; + } if (pud_none(*pud)) { if (!new_pmd) goto out_unlock; @@ -289,41 +335,43 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, kvmppc_update_dirty_map(memslot, gfn, PMD_SIZE); } - } else if (level == 1 && !pmd_none(*pmd)) { - /* - * There's a page table page here, but we wanted to - * install a large page, so remove and free the page - * table page. new_ptep will be NULL since level == 1. - */ - new_ptep = pte_offset_kernel(pmd, 0); - pmd_clear(pmd); - kvmppc_radix_flush_pwc(kvm, gpa); } - if (level == 0) { - if (pmd_none(*pmd)) { - if (!new_ptep) - goto out_unlock; - pmd_populate(kvm->mm, pmd, new_ptep); - new_ptep = NULL; - } - ptep = pte_offset_kernel(pmd, gpa); - if (pte_present(*ptep)) { - /* Check if someone else set the same thing */ - if (pte_raw(*ptep) == pte_raw(pte)) { - ret = 0; - goto out_unlock; - } - /* PTE was previously valid, so invalidate it */ - old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, - 0, gpa, 0); - kvmppc_radix_tlbie_page(kvm, gpa, 0); - if (old & _PAGE_DIRTY) - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + if (level == 1) { + if (!pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted to + * install a large page, so remove and free the page + * table page. new_ptep will be NULL since level == 1. + */ + new_ptep = pte_offset_kernel(pmd, 0); + pmd_clear(pmd); + kvmppc_radix_flush_pwc(kvm, gpa); } - kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); - } else { kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + ret = 0; + goto out_unlock; + } + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* Check if someone else set the same thing */ + if (pte_raw(*ptep) == pte_raw(pte)) { + ret = 0; + goto out_unlock; + } + /* PTE was previously valid, so invalidate it */ + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); ret = 0; out_unlock: @@ -446,8 +494,13 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = page_to_pfn(page); if (PageCompound(page)) { pte_size <<= compound_order(compound_head(page)); - /* See if we can insert a 2MB large-page PTE here */ - if (pte_size >= PMD_SIZE && + /* See if we can insert a 1GB or 2MB large PTE here */ + if (pte_size >= PUD_SIZE && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); + } else if (pte_size >= PMD_SIZE && (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE))) { level = 1; @@ -657,6 +710,10 @@ void kvmppc_free_radix(struct kvm *kvm) for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { if (!pud_present(*pud)) continue; + if (pud_huge(*pud)) { + pud_clear(pud); + continue; + } pmd = pmd_offset(pud, 0); for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { if (pmd_is_leaf(*pmd)) { -- cgit v1.2.3 From 31c8b0d0694a1f7e3b46df0d1341a874ecb5e0de Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 1 Mar 2018 15:14:02 +1100 Subject: KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot() in page fault handler This changes the hypervisor page fault handler for radix guests to use the generic KVM __gfn_to_pfn_memslot() function instead of using get_user_pages_fast() and then handling the case of VM_PFNMAP vmas specially. The old code missed the case of VM_IO vmas; with this change, VM_IO vmas will now be handled correctly by code within __gfn_to_pfn_memslot. Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses __get_user_pages_fast for the initial lookup in the cases where either atomic or async is set. Since we are not setting either atomic or async, we do our own __get_user_pages_fast first, for now. This also adds code to check for the KVM_MEM_READONLY flag on the memslot. If it is set and this is a write access, we synthesize a data storage interrupt for the guest. In the case where the page is not normal RAM (i.e. page == NULL in kvmppc_book3s_radix_page_fault(), we read the PTE from the Linux page tables because we need the mapping attribute bits as well as the PFN. (The mapping attribute bits indicate whether accesses have to be non-cacheable and/or guarded.) Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 148 ++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 60 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 05acc67e0eb2..0590f1667607 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long mmu_seq, pte_size; unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; - struct page *page = NULL, *pages[1]; - long ret, npages; - unsigned int writing; - struct vm_area_struct *vma; - unsigned long flags; + struct page *page = NULL; + long ret; + bool writing; + bool upgrade_write = false; + bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; unsigned long pgflags; unsigned int shift, level; @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr & DSISR_ISSTORE); } - /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - writing = (dsisr & DSISR_ISSTORE) != 0; - hva = gfn_to_hva_memslot(memslot, gfn); + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* give the guest a DSI */ + dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + upgrade_p = NULL; + } + if (dsisr & DSISR_SET_RC) { /* * Need to set an R or C bit in the 2nd-level tables; @@ -470,69 +475,92 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; } - ret = -EFAULT; - pfn = 0; - pte_size = PAGE_SIZE; - pgflags = _PAGE_READ | _PAGE_EXEC; - level = 0; - npages = get_user_pages_fast(hva, 1, writing, pages); - if (npages < 1) { - /* Check if it's an I/O mapping */ - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); - if (vma && vma->vm_start <= hva && hva < vma->vm_end && - (vma->vm_flags & VM_PFNMAP)) { - pfn = vma->vm_pgoff + - ((hva - vma->vm_start) >> PAGE_SHIFT); - pgflags = pgprot_val(vma->vm_page_prot); - } - up_read(¤t->mm->mmap_sem); - if (!pfn) - return -EFAULT; - } else { - page = pages[0]; + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Do a fast check first, since __gfn_to_pfn_memslot doesn't + * do it with !atomic && !async, which is how we call it. + * We always ask for write permission since the common case + * is that the page is writable. + */ + hva = gfn_to_hva_memslot(memslot, gfn); + if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { pfn = page_to_pfn(page); - if (PageCompound(page)) { - pte_size <<= compound_order(compound_head(page)); - /* See if we can insert a 1GB or 2MB large PTE here */ - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } + upgrade_write = true; + } else { + /* Call KVM generic code to do the slow-path check */ + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + writing, upgrade_p); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + page = NULL; + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; } - /* See if we can provide write access */ - if (writing) { - pgflags |= _PAGE_WRITE; - } else { - local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, - hva, NULL, NULL); - if (ptep && pte_write(*ptep)) - pgflags |= _PAGE_WRITE; - local_irq_restore(flags); + } + + /* See if we can insert a 1GB or 2MB large PTE here */ + level = 0; + if (page && PageCompound(page)) { + pte_size = PAGE_SIZE << compound_order(compound_head(page)); + if (pte_size >= PUD_SIZE && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); + } else if (pte_size >= PMD_SIZE && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); } } /* * Compute the PTE value that we need to insert. */ - pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; - if (pgflags & _PAGE_WRITE) - pgflags |= _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); + if (page) { + pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | + _PAGE_ACCESSED; + if (writing || upgrade_write) + pgflags |= _PAGE_WRITE | _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + } else { + /* + * Read the PTE from the process' radix tree and use that + * so we get the attribute bits. + */ + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + pte = *ptep; + local_irq_enable(); + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + } else if (shift && shift != PAGE_SHIFT) { + /* Adjust PFN */ + unsigned long mask = (1ul << shift) - PAGE_SIZE; + pte = __pte(pte_val(pte) | (hva & mask)); + } + if (!(writing || upgrade_write)) + pte = __pte(pte_val(pte) & ~ _PAGE_WRITE); + pte = __pte(pte_val(pte) | _PAGE_EXEC); + } /* Allocate space in the tree and write the PTE */ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); if (page) { - if (!ret && (pgflags & _PAGE_WRITE)) + if (!ret && (pte_val(pte) & _PAGE_WRITE)) set_page_dirty_lock(page); put_page(page); } -- cgit v1.2.3