diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 14:35:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 14:35:31 -0700 |
commit | 2e7580b0e75d771d93e24e681031a165b1d31071 (patch) | |
tree | d9449702609eeaab28913a43b5a4434667e09d43 /arch/powerpc/kvm | |
parent | d25413efa9536e2f425ea45c7720598035c597bc (diff) | |
parent | cf9eeac46350b8b43730b7dc5e999757bed089a4 (diff) | |
download | linux-2e7580b0e75d771d93e24e681031a165b1d31071.tar.gz linux-2e7580b0e75d771d93e24e681031a165b1d31071.tar.bz2 linux-2e7580b0e75d771d93e24e681031a165b1d31071.zip |
Merge branch 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Avi Kivity:
"Changes include timekeeping improvements, support for assigning host
PCI devices that share interrupt lines, s390 user-controlled guests, a
large ppc update, and random fixes."
This is with the sign-off's fixed, hopefully next merge window we won't
have rebased commits.
* 'kvm-updates/3.4' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (130 commits)
KVM: Convert intx_mask_lock to spin lock
KVM: x86: fix kvm_write_tsc() TSC matching thinko
x86: kvmclock: abstract save/restore sched_clock_state
KVM: nVMX: Fix erroneous exception bitmap check
KVM: Ignore the writes to MSR_K7_HWCR(3)
KVM: MMU: make use of ->root_level in reset_rsvds_bits_mask
KVM: PMU: add proper support for fixed counter 2
KVM: PMU: Fix raw event check
KVM: PMU: warn when pin control is set in eventsel msr
KVM: VMX: Fix delayed load of shared MSRs
KVM: use correct tlbs dirty type in cmpxchg
KVM: Allow host IRQ sharing for assigned PCI 2.3 devices
KVM: Ensure all vcpus are consistent with in-kernel irqchip settings
KVM: x86 emulator: Allow PM/VM86 switch during task switch
KVM: SVM: Fix CPL updates
KVM: x86 emulator: VM86 segments must have DPL 3
KVM: x86 emulator: Fix task switch privilege checks
arch/powerpc/kvm/book3s_hv.c: included linux/sched.h twice
KVM: x86 emulator: correctly mask pmc index bits in RDPMC instruction emulation
KVM: mmu_notifier: Flush TLBs before releasing mmu_lock
...
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r-- | arch/powerpc/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s.c | 57 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_32_mmu_host.c | 21 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_host.c | 66 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_64_mmu_hv.c | 919 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_emulate.c | 8 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 465 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_builtin.c | 209 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rm_mmu.c | 835 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 176 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_paired_singles.c | 9 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_pr.c | 178 | ||||
-rw-r--r-- | arch/powerpc/kvm/booke.c | 150 | ||||
-rw-r--r-- | arch/powerpc/kvm/booke.h | 4 | ||||
-rw-r--r-- | arch/powerpc/kvm/booke_emulate.c | 23 | ||||
-rw-r--r-- | arch/powerpc/kvm/booke_interrupts.S | 18 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500.c | 32 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500_emulate.c | 38 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500_tlb.c | 775 | ||||
-rw-r--r-- | arch/powerpc/kvm/e500_tlb.h | 80 | ||||
-rw-r--r-- | arch/powerpc/kvm/emulate.c | 61 | ||||
-rw-r--r-- | arch/powerpc/kvm/powerpc.c | 148 | ||||
-rw-r--r-- | arch/powerpc/kvm/trace.h | 62 |
23 files changed, 3315 insertions, 1020 deletions
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 78133deb4b64..8f64709ae331 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -69,6 +69,7 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select MMU_NOTIFIER ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e41ac6f7dcf1..7d54f4ed6d96 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) return true; } -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -423,10 +423,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -450,10 +450,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -477,41 +477,10 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +void kvmppc_decrementer_func(unsigned long data) { - struct kvm_memory_slot *memslot; - struct kvm_vcpu *vcpu; - ulong ga, ga_end; - int is_dirty = 0; - int r; - unsigned long n; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; - - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_for_each_vcpu(n, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); - } - - r = 0; -out: - mutex_unlock(&kvm->slots_lock); - return r; + kvmppc_core_queue_dec(vcpu); + kvm_vcpu_kick(vcpu); } diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 9fecbfbce773..f922c29bb234 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -151,13 +151,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool primary = false; bool evict = false; struct hpte_cache *pte; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; @@ -249,7 +251,8 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -297,12 +300,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) u64 gvsid; u32 sr; struct kvmppc_sid_map *map; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int r = 0; if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ svcpu->sr[esid] = SR_INVALID; - return -ENOENT; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -315,17 +320,21 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { int i; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) svcpu->sr[i] = SR_INVALID; + + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index fa2f08434ba5..6f87f39a1ac2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -88,12 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int vflags = 0; int attempt = 0; struct kvmppc_sid_map *map; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); @@ -110,7 +112,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", vsid, orig_pte->eaddr); WARN_ON(true); - return -EINVAL; + r = -EINVAL; + goto out; } vsid = map->host_vsid; @@ -131,8 +134,10 @@ map_again: /* In case we tried normal mapping already, let's nuke old entries */ if (attempt > 1) - if (ppc_md.hpte_remove(hpteg) < 0) - return -1; + if (ppc_md.hpte_remove(hpteg) < 0) { + r = -1; + goto out; + } ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); @@ -162,7 +167,8 @@ map_again: kvmppc_mmu_hpte_cache_map(vcpu, pte); } - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -207,25 +213,30 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); int i; int max_slb_size = 64; int found_inval = -1; int r; - if (!to_svcpu(vcpu)->slb_max) - to_svcpu(vcpu)->slb_max = 1; + if (!svcpu->slb_max) + svcpu->slb_max = 1; /* Are we overwriting? */ - for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { - if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) + for (i = 1; i < svcpu->slb_max; i++) { + if (!(svcpu->slb[i].esid & SLB_ESID_V)) found_inval = i; - else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) - return i; + else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { + r = i; + goto out; + } } /* Found a spare entry that was invalidated before */ - if (found_inval > 0) - return found_inval; + if (found_inval > 0) { + r = found_inval; + goto out; + } /* No spare invalid entry, so create one */ @@ -233,30 +244,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) max_slb_size = mmu_slb_size; /* Overflowing -> purge */ - if ((to_svcpu(vcpu)->slb_max) == max_slb_size) + if ((svcpu->slb_max) == max_slb_size) kvmppc_mmu_flush_segments(vcpu); - r = to_svcpu(vcpu)->slb_max; - to_svcpu(vcpu)->slb_max++; + r = svcpu->slb_max; + svcpu->slb_max++; +out: + svcpu_put(svcpu); return r; } int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); u64 esid = eaddr >> SID_SHIFT; u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; u64 slb_vsid = SLB_VSID_USER; u64 gvsid; int slb_index; struct kvmppc_sid_map *map; + int r = 0; slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ - to_svcpu(vcpu)->slb[slb_index].esid = 0; - return -ENOENT; + svcpu->slb[slb_index].esid = 0; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -269,18 +285,22 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; - to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; - to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; + svcpu->slb[slb_index].esid = slb_esid; + svcpu->slb[slb_index].vsid = slb_vsid; trace_kvm_book3s_slbmte(slb_vsid, slb_esid); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { - to_svcpu(vcpu)->slb_max = 1; - to_svcpu(vcpu)->slb[0].esid = 0; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->slb_max = 1; + svcpu->slb[0].esid = 0; + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bc3a2ea94217..ddc485a529f2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -23,6 +23,7 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -33,15 +34,6 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) - -/* Pages in the VRMA are 16MB pages */ -#define VRMA_PAGE_ORDER 24 -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 #define NR_LPIDS (LPID_RSVD + 1) @@ -51,21 +43,41 @@ long kvmppc_alloc_hpt(struct kvm *kvm) { unsigned long hpt; unsigned long lpid; + struct revmap_entry *rev; + struct kvmppc_linear_info *li; + + /* Allocate guest's hashed page table */ + li = kvm_alloc_hpt(); + if (li) { + /* using preallocated memory */ + hpt = (ulong)li->base_virt; + kvm->arch.hpt_li = li; + } else { + /* using dynamic memory */ + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); + } - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, - HPT_ORDER - PAGE_SHIFT); if (!hpt) { pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); return -ENOMEM; } kvm->arch.hpt_virt = hpt; + /* Allocate reverse map array */ + rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); + if (!rev) { + pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); + goto out_freehpt; + } + kvm->arch.revmap = rev; + + /* Allocate the guest's logical partition ID */ do { lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); if (lpid >= NR_LPIDS) { pr_err("kvm_alloc_hpt: No LPIDs free\n"); - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); - return -ENOMEM; + goto out_freeboth; } } while (test_and_set_bit(lpid, lpid_inuse)); @@ -74,37 +86,64 @@ long kvmppc_alloc_hpt(struct kvm *kvm) pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); return 0; + + out_freeboth: + vfree(rev); + out_freehpt: + free_pages(hpt, HPT_ORDER - PAGE_SHIFT); + return -ENOMEM; } void kvmppc_free_hpt(struct kvm *kvm) { clear_bit(kvm->arch.lpid, lpid_inuse); - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); + vfree(kvm->arch.revmap); + if (kvm->arch.hpt_li) + kvm_release_hpt(kvm->arch.hpt_li); + else + free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); +} + +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize == 0x10000) ? 0x1000 : 0; } -void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + unsigned long porder) { unsigned long i; - unsigned long npages = kvm->arch.ram_npages; - unsigned long pfn; - unsigned long *hpte; - unsigned long hash; - struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; + unsigned long npages; + unsigned long hp_v, hp_r; + unsigned long addr, hash; + unsigned long psize; + unsigned long hp0, hp1; + long ret; - if (!pginfo) - return; + psize = 1ul << porder; + npages = memslot->npages >> (porder - PAGE_SHIFT); /* VRMA can't be > 1TB */ - if (npages > 1ul << (40 - kvm->arch.ram_porder)) - npages = 1ul << (40 - kvm->arch.ram_porder); + if (npages > 1ul << (40 - porder)) + npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ if (npages > HPT_NPTEG) npages = HPT_NPTEG; + hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); + hp1 = hpte1_pgsize_encoding(psize) | + HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + for (i = 0; i < npages; ++i) { - pfn = pginfo[i].pfn; - if (!pfn) - break; + addr = i << porder; /* can't use hpt_hash since va > 64 bits */ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; /* @@ -113,15 +152,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) * at most one HPTE per HPTEG, we just assume entry 7 * is available and use it. */ - hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); - hpte += 7 * 2; - /* HPTE low word - RPN, protection, etc. */ - hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | - HPTE_R_M | PP_RWXX; - wmb(); - hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | - (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | - HPTE_V_LARGE | HPTE_V_VALID; + hash = (hash << 3) + 7; + hp_v = hp0 | ((addr >> 16) & ~0x7fUL); + hp_r = hp1 | addr; + ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); + if (ret != H_SUCCESS) { + pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", + addr, ret); + break; + } } } @@ -158,10 +197,814 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); } +/* + * This is called to get a reference to a guest page if there isn't + * one already in the kvm->arch.slot_phys[][] arrays. + */ +static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, + struct kvm_memory_slot *memslot, + unsigned long psize) +{ + unsigned long start; + long np, err; + struct page *page, *hpage, *pages[1]; + unsigned long s, pgsize; + unsigned long *physp; + unsigned int is_io, got, pgorder; + struct vm_area_struct *vma; + unsigned long pfn, i, npages; + + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return -EINVAL; + if (physp[gfn - memslot->base_gfn]) + return 0; + + is_io = 0; + got = 0; + page = NULL; + pgsize = psize; + err = -EINVAL; + start = gfn_to_hva_memslot(memslot, gfn); + + /* Instantiate and get the page we want access to */ + np = get_user_pages_fast(start, 1, 1, pages); + if (np != 1) { + /* Look up the vma for the page */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start || + start + psize > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + goto up_err; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + /* check alignment of pfn vs. requested page size */ + if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1))) + goto up_err; + up_read(¤t->mm->mmap_sem); + + } else { + page = pages[0]; + got = KVMPPC_GOT_PAGE; + + /* See if this is a large page */ + s = PAGE_SIZE; + if (PageHuge(page)) { + hpage = compound_head(page); + s <<= compound_order(hpage); + /* Get the whole large page if slot alignment is ok */ + if (s > psize && slot_is_aligned(memslot, s) && + !(memslot->userspace_addr & (s - 1))) { + start &= ~(s - 1); + pgsize = s; + page = hpage; + } + } + if (s < psize) + goto out; + pfn = page_to_pfn(page); + } + + npages = pgsize >> PAGE_SHIFT; + pgorder = __ilog2(npages); + physp += (gfn - memslot->base_gfn) & ~(npages - 1); + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) { + if (!physp[i]) { + physp[i] = ((pfn + i) << PAGE_SHIFT) + + got + is_io + pgorder; + got = 0; + } + } + spin_unlock(&kvm->arch.slot_phys_lock); + err = 0; + + out: + if (got) { + if (PageHuge(page)) + page = compound_head(page); + put_page(page); + } + return err; + + up_err: + up_read(¤t->mm->mmap_sem); + return err; +} + +/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long psize, gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + + if (kvm->arch.using_mmu_notifiers) + goto do_insert; + + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) { + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) + return H_PARAMETER; + } + + do_insert: + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + vcpu->arch.pgdir = current->mm->pgd; + ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); + rcu_read_unlock_sched(); + if (ret == H_TOO_HARD) { + /* this can't happen */ + pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); + ret = H_RESOURCE; /* or something */ + } + return ret; + +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + u64 mask; + int i; + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) + continue; + + if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) + mask = ESID_MASK_1T; + else + mask = ESID_MASK; + + if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) + return &vcpu->arch.slb[i]; + } + return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, + unsigned long ea) +{ + unsigned long ra_mask; + + ra_mask = hpte_page_size(v, r) - 1; + return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); +} + static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_slb *slbe; + unsigned long slb_v; + unsigned long pp, key; + unsigned long v, gr; + unsigned long *hptep; + int index; + int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + + /* Get SLB entry */ + if (virtmode) { + slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); + if (!slbe) + return -EINVAL; + slb_v = slbe->origv; + } else { + /* real mode access */ + slb_v = vcpu->kvm->arch.vrma_slb_v; + } + + /* Find the HPTE in the hash table */ + index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, + HPTE_V_VALID | HPTE_V_ABSENT); + if (index < 0) + return -ENOENT; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hptep[0] & ~HPTE_V_HVLOCK; + gr = kvm->arch.revmap[index].guest_rpte; + + /* Unlock the HPTE */ + asm volatile("lwsync" : : : "memory"); + hptep[0] = v; + + gpte->eaddr = eaddr; + gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + + /* Get PP bits and key for permission check */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + key &= slb_v; + + /* Calculate permissions */ + gpte->may_read = hpte_read_permission(pp, key); + gpte->may_write = hpte_write_permission(pp, key); + gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + + /* Storage key permission check for POWER7 */ + if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) { + int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (amrfield & 1) + gpte->may_read = 0; + if (amrfield & 2) + gpte->may_write = 0; + } + + /* Get the guest physical address */ + gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); + return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors. (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.) If the instruction isn't a load or store, then + * this doesn't return anything useful. + */ +static int instruction_is_store(unsigned int instr) +{ + unsigned int mask; + + mask = 0x10000000; + if ((instr & 0xfc000000) == 0x7c000000) + mask = 0x100; /* major opcode 31 */ + return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, int is_store) +{ + int ret; + u32 last_inst; + unsigned long srr0 = kvmppc_get_pc(vcpu); + + /* We try to load the last instruction. We don't let + * emulate_instruction do it as it doesn't check what + * kvmppc_ld returns. + * If we fail, we just return to the guest and try executing it again. + */ + if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { + ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); + if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) + return RESUME_GUEST; + vcpu->arch.last_inst = last_inst; + } + + /* + * WARNING: We do not know for sure whether the instruction we just + * read from memory is the same that caused the fault in the first + * place. If the instruction we read is neither an load or a store, + * then it can't access memory, so we don't need to worry about + * enforcing access permissions. So, assuming it is a load or + * store, we just check that its direction (load or store) is + * consistent with the original fault, since that's what we + * checked the access permissions against. If there is a mismatch + * we just return and retry the instruction. + */ + + if (instruction_is_store(vcpu->arch.last_inst) != !!is_store) + return RESUME_GUEST; + + /* + * Emulated accesses are emulated by looking at the hash for + * translation once, then performing the access later. The + * translation could be invalidated in the meantime in which + * point performing the subsequent memory access on the old + * physical address could possibly be a security hole for the + * guest (but not the host). + * + * This is less of an issue for MMIO stores since they aren't + * globally visible. It could be an issue for MMIO loads to + * a certain extent but we'll ignore it for now. + */ + + vcpu->arch.paddr_accessed = gpa; + return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hptep, hpte[3], r; + unsigned long mmu_seq, psize, pte_size; + unsigned long gfn, hva, pfn; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + struct revmap_entry *rev; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + unsigned int writing, write_ok; + struct vm_area_struct *vma; + unsigned long rcbits; + + /* + * Real-mode code has already searched the HPT and found the + * entry we're interested in. Lock the entry and check that + * it hasn't changed. If it has, just return and re-execute the + * instruction. + */ + if (ea != vcpu->arch.pgfault_addr) + return RESUME_GUEST; + index = vcpu->arch.pgfault_index; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + rev = &kvm->arch.revmap[index]; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; + hpte[1] = hptep[1]; + hpte[2] = r = rev->guest_rpte; + asm volatile("lwsync" : : : "memory"); + hptep[0] = hpte[0]; + preempt_enable(); + + if (hpte[0] != vcpu->arch.pgfault_hpte[0] || + hpte[1] != vcpu->arch.pgfault_hpte[1]) + return RESUME_GUEST; + + /* Translate the logical address and get the page */ + psize = hpte_page_size(hpte[0], r); + gfn = hpte_rpn(r, psize); + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, + dsisr & DSISR_ISSTORE); + } + + if (!kvm->arch.using_mmu_notifiers) + return -EFAULT; /* should never get here */ + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + pte_t *ptep, pte; + + /* + * We need to protect against page table destruction + * while looking up and updating the pte. + */ + rcu_read_lock_sched(); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL); + if (ptep && pte_present(*ptep)) { + pte = kvmppc_read_update_linux_pte(ptep, 1); + if (pte_write(pte)) + write_ok = 1; + } + rcu_read_unlock_sched(); + } + pfn = page_to_pfn(page); + } + + ret = -EFAULT; + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + return -EFAULT; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* Set the HPTE to point to pfn */ + r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + rmap = &memslot->rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + + /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + + if (hptep[0] & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, index); + /* don't lose previous R and C bits */ + r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } + + hptep[1] = r; + eieio(); + hptep[0] = hpte[0]; + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page && hpte_is_writable(r)) + SetPageDirty(page); + + out_put: + if (page) + put_page(page); + return ret; + + out_unlock: + hptep[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + goto out_put; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + + ret = handler(kvm, &memslot->rmap[gfn_offset], + memslot->base_gfn + gfn_offset); + retval |= ret; + } + } + + return retval; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + unsigned long *hptep; + unsigned long ptel, psize, rcbits; + + for (;;) { + lock_rmap(rmapp); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we can't spin on the HPTE lock while holding the + * rmap chain lock. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + continue; + } + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(hptep[0], ptel); + if ((hptep[0] & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + /* Harvest R and C */ + rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + rev[i].guest_rpte = ptel | rcbits; + } + unlock_rmap(rmapp); + hptep[0] &= ~HPTE_V_HVLOCK; + } + return 0; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) { + *rmapp &= ~KVMPPC_RMAP_REFERENCED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* If this HPTE isn't referenced, ignore it */ + if (!(hptep[1] & HPTE_R_R)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + kvmppc_clear_ref_hpte(kvm, hptep, i); + rev[i].guest_rpte |= HPTE_R_R; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hp; + int ret = 1; + + if (*rmapp & KVMPPC_RMAP_REFERENCED) + return 1; + + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) + goto out; + + if (*rmapp & KVMPPC_RMAP_PRESENT) { + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + if (hp[1] & HPTE_R_R) + goto out; + } while ((i = j) != head); + } + ret = 0; + + out: + unlock_rmap(rmapp); + return ret; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return -ENOENT; + if (!kvm->arch.using_mmu_notifiers) + return; + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_CHANGED) { + *rmapp &= ~KVMPPC_RMAP_CHANGED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + if (!(hptep[1] & HPTE_R_C)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { + /* need to make it temporarily absent to clear C */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~HPTE_R_C; + eieio(); + hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + rev[i].guest_rpte |= HPTE_R_C; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + unsigned long i; + unsigned long *rmapp, *map; + + preempt_disable(); + rmapp = memslot->rmap; + map = memslot->dirty_bitmap; + for (i = 0; i < memslot->npages; ++i) { + if (kvm_test_clear_dirty(kvm, rmapp)) + __set_bit_le(i, map); + ++rmapp; + } + preempt_enable(); + return 0; +} + +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, + unsigned long *nb_ret) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn = gpa >> PAGE_SHIFT; + struct page *page, *pages[1]; + int npages; + unsigned long hva, psize, offset; + unsigned long pa; + unsigned long *physp; + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return NULL; + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return NULL; + physp += gfn - memslot->base_gfn; + pa = *physp; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot, + PAGE_SIZE) < 0) + return NULL; + pa = *physp; + } + page = pfn_to_page(pa >> PAGE_SHIFT); + } else { + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + return NULL; + page = pages[0]; + } + psize = PAGE_SIZE; + if (PageHuge(page)) { + page = compound_head(page); + psize <<= compound_order(page); + } + if (!kvm->arch.using_mmu_notifiers) + get_page(page); + offset = gpa & (psize - 1); + if (nb_ret) + *nb_ret = psize - offset; + return page_address(page) + offset; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) +{ + struct page *page = virt_to_page(va); + + page = compound_head(page); + put_page(page); } void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 0c9dc62532d0..f1950d131827 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -230,9 +230,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { + struct kvmppc_book3s_shadow_vcpu *svcpu; + + svcpu = svcpu_get(vcpu); *advance = 0; vcpu->arch.shared->dar = vaddr; - to_svcpu(vcpu)->fault_dar = vaddr; + svcpu->fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -241,7 +244,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->dsisr = dsisr; - to_svcpu(vcpu)->fault_dsisr = dsisr; + svcpu->fault_dsisr = dsisr; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a7267167a550..d386b6198bc7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -48,22 +48,14 @@ #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> - -/* - * For now, limit memory to 64GB and require it to be large pages. - * This value is chosen because it makes the ram_pginfo array be - * 64kB in size, which is about as large as we want to be trying - * to allocate with kmalloc. - */ -#define MAX_MEM_ORDER 36 - -#define LARGE_PAGE_ORDER 24 /* 16MB pages */ +#include <linux/hugetlb.h> /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { @@ -146,10 +138,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long vcpuid, unsigned long vpa) { struct kvm *kvm = vcpu->kvm; - unsigned long pg_index, ra, len; - unsigned long pg_offset; + unsigned long len, nb; void *va; struct kvm_vcpu *tvcpu; + int err = H_PARAMETER; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) @@ -162,45 +154,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (flags < 4) { if (vpa & 0x7f) return H_PARAMETER; + if (flags >= 2 && !tvcpu->arch.vpa) + return H_RESOURCE; /* registering new area; convert logical addr to real */ - pg_index = vpa >> kvm->arch.ram_porder; - pg_offset = vpa & (kvm->arch.ram_psize - 1); - if (pg_index >= kvm->arch.ram_npages) + va = kvmppc_pin_guest_page(kvm, vpa, &nb); + if (va == NULL) return H_PARAMETER; - if (kvm->arch.ram_pginfo[pg_index].pfn == 0) - return H_PARAMETER; - ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; - ra |= pg_offset; - va = __va(ra); if (flags <= 1) len = *(unsigned short *)(va + 4); else len = *(unsigned int *)(va + 4); - if (pg_offset + len > kvm->arch.ram_psize) - return H_PARAMETER; + if (len > nb) + goto out_unpin; switch (flags) { case 1: /* register VPA */ if (len < 640) - return H_PARAMETER; + goto out_unpin; + if (tvcpu->arch.vpa) + kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa); tvcpu->arch.vpa = va; init_vpa(vcpu, va); break; case 2: /* register DTL */ if (len < 48) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; + goto out_unpin; len -= len % 48; + if (tvcpu->arch.dtl) + kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl); tvcpu->arch.dtl = va; tvcpu->arch.dtl_end = va + len; break; case 3: /* register SLB shadow buffer */ - if (len < 8) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; - tvcpu->arch.slb_shadow = va; - len = (len - 16) / 16; + if (len < 16) + goto out_unpin; + if (tvcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = va; break; } @@ -209,17 +197,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, case 5: /* unregister VPA */ if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) return H_RESOURCE; + if (!tvcpu->arch.vpa) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa); tvcpu->arch.vpa = NULL; break; case 6: /* unregister DTL */ + if (!tvcpu->arch.dtl) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl); tvcpu->arch.dtl = NULL; break; case 7: /* unregister SLB shadow buffer */ + if (!tvcpu->arch.slb_shadow) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = NULL; break; } } return H_SUCCESS; + + out_unpin: + kvmppc_unpin_guest_page(kvm, va); + return err; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -229,6 +230,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) struct kvm_vcpu *tvcpu; switch (req) { + case H_ENTER: + ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + break; case H_CEDE: break; case H_PROD: @@ -318,20 +325,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get these next two if the guest does a bad real-mode access, - * as we have enabled VRMA (virtualized real mode area) mode in the - * LPCR. We just generate an appropriate DSI/ISI to the guest. + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.shregs.dar = vcpu->arch.fault_dar; - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - 0x08000000); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + kvmppc_get_pc(vcpu), 0); break; /* * This occurs if the guest executes an illegal instruction. @@ -391,6 +397,42 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(0, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + { + u64 hior; + /* Only allow this to be set to zero */ + r = get_user(hior, (u64 __user *)reg->addr); + if (!r && (hior != 0)) + r = -EINVAL; + break; + } + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { if (cpu_has_feature(CPU_FTR_HVMODE)) @@ -410,7 +452,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto out; err = -ENOMEM; - vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) goto out; @@ -462,15 +504,21 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; free_vcpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.dtl) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl); + if (vcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow); + if (vcpu->arch.vpa) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa); kvm_vcpu_uninit(vcpu); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvmppc_set_timer(struct kvm_vcpu *vcpu) @@ -481,7 +529,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) if (now > vcpu->arch.dec_expires) { /* decrementer has already gone negative */ kvmppc_core_queue_dec(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); return; } dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC @@ -796,7 +844,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { - kvmppc_core_deliver_interrupts(v); + kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); v->stat.signal_exits++; @@ -835,20 +883,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + kvmppc_core_prepare_to_enter(vcpu); + /* No need to go into the guest when all we'll do is come back out */ if (signal_pending(current)) { run->exit_reason = KVM_EXIT_INTR; return -EINTR; } - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; + /* On the first time here, set up VRMA or RMA */ + if (!vcpu->kvm->arch.rma_setup_done) { + r = kvmppc_hv_setup_rma(vcpu); + if (r) + return r; + } flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; do { r = kvmppc_run_vcpu(run, vcpu); @@ -856,7 +910,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } while (r == RESUME_GUEST); return r; @@ -1000,7 +1054,7 @@ static inline int lpcr_rmls(unsigned long rma_size) static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct kvmppc_rma_info *ri = vma->vm_file->private_data; + struct kvmppc_linear_info *ri = vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= ri->npages) @@ -1025,7 +1079,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_rma_release(struct inode *inode, struct file *filp) { - struct kvmppc_rma_info *ri = filp->private_data; + struct kvmppc_linear_info *ri = filp->private_data; kvm_release_rma(ri); return 0; @@ -1038,7 +1092,7 @@ static struct file_operations kvm_rma_fops = { long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; long fd; ri = kvm_alloc_rma(); @@ -1053,89 +1107,189 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } -static struct page *hva_to_page(unsigned long addr) +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - struct page *page[1]; - int npages; + struct kvm_memory_slot *memslot; + int r; + unsigned long n; - might_sleep(); + mutex_lock(&kvm->slots_lock); - npages = get_user_pages_fast(addr, 1, 1, page); + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; - if (unlikely(npages != 1)) - return 0; + memslot = id_to_memslot(kvm->memslots, log->slot); + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + + r = kvmppc_hv_get_dirty_log(kvm, memslot); + if (r) + goto out; - return page[0]; + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static unsigned long slb_pgsize_encoding(unsigned long psize) +{ + unsigned long senc = 0; + + if (psize > 0x1000) { + senc = SLB_VSID_L; + if (psize == 0x10000) + senc |= SLB_VSID_LP_01; + } + return senc; } int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - unsigned long psize, porder; - unsigned long i, npages, totalpages; - unsigned long pg_ix; - struct kvmppc_pginfo *pginfo; - unsigned long hva; - struct kvmppc_rma_info *ri = NULL; + unsigned long npages; + unsigned long *phys; + + /* Allocate a slot_phys array */ + phys = kvm->arch.slot_phys[mem->slot]; + if (!kvm->arch.using_mmu_notifiers && !phys) { + npages = mem->memory_size >> PAGE_SHIFT; + phys = vzalloc(npages * sizeof(unsigned long)); + if (!phys) + return -ENOMEM; + kvm->arch.slot_phys[mem->slot] = phys; + kvm->arch.slot_npages[mem->slot] = npages; + } + + return 0; +} + +static void unpin_slot(struct kvm *kvm, int slot_id) +{ + unsigned long *physp; + unsigned long j, npages, pfn; struct page *page; - /* For now, only allow 16MB pages */ - porder = LARGE_PAGE_ORDER; - psize = 1ul << porder; - if ((mem->memory_size & (psize - 1)) || - (mem->guest_phys_addr & (psize - 1))) { - pr_err("bad memory_size=%llx @ %llx\n", - mem->memory_size, mem->guest_phys_addr); - return -EINVAL; + physp = kvm->arch.slot_phys[slot_id]; + npages = kvm->arch.slot_npages[slot_id]; + if (physp) { + spin_lock(&kvm->arch.slot_phys_lock); + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + if (PageHuge(page)) + page = compound_head(page); + SetPageDirty(page); + put_page(page); + } + kvm->arch.slot_phys[slot_id] = NULL; + spin_unlock(&kvm->arch.slot_phys_lock); + vfree(physp); } +} - npages = mem->memory_size >> porder; - totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ +} - /* More memory than we have space to track? */ - if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) - return -EINVAL; +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) +{ + int err = 0; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_linear_info *ri = NULL; + unsigned long hva; + struct kvm_memory_slot *memslot; + struct vm_area_struct *vma; + unsigned long lpcr, senc; + unsigned long psize, porder; + unsigned long rma_size; + unsigned long rmls; + unsigned long *physp; + unsigned long i, npages; - /* Do we already have an RMA registered? */ - if (mem->guest_phys_addr == 0 && kvm->arch.rma) - return -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) + goto out; /* another vcpu beat us to it */ - if (totalpages > kvm->arch.ram_npages) - kvm->arch.ram_npages = totalpages; + /* Look up the memslot for guest physical address 0 */ + memslot = gfn_to_memslot(kvm, 0); + + /* We must have some memory at 0 by now */ + err = -EINVAL; + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto out; + + /* Look up the VMA for the start of this memory slot */ + hva = memslot->userspace_addr; + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) + goto up_out; + + psize = vma_kernel_pagesize(vma); + porder = __ilog2(psize); /* Is this one of our preallocated RMAs? */ - if (mem->guest_phys_addr == 0) { - struct vm_area_struct *vma; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, mem->userspace_addr); - if (vma && vma->vm_file && - vma->vm_file->f_op == &kvm_rma_fops && - mem->userspace_addr == vma->vm_start) - ri = vma->vm_file->private_data; - up_read(¤t->mm->mmap_sem); - if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { - pr_err("CPU requires an RMO\n"); - return -EINVAL; + if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops && + hva == vma->vm_start) + ri = vma->vm_file->private_data; + + up_read(¤t->mm->mmap_sem); + + if (!ri) { + /* On POWER7, use VRMA; on PPC970, give up */ + err = -EPERM; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("KVM: CPU requires an RMO\n"); + goto out; } - } - if (ri) { - unsigned long rma_size; - unsigned long lpcr; - long rmls; + /* We can handle 4k, 64k or 16M pages in the VRMA */ + err = -EINVAL; + if (!(psize == 0x1000 || psize == 0x10000 || + psize == 0x1000000)) + goto out; + + /* Update VRMASD field in the LPCR */ + senc = slb_pgsize_encoding(psize); + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); + kvm->arch.lpcr = lpcr; - rma_size = ri->npages << PAGE_SHIFT; - if (rma_size > mem->memory_size) - rma_size = mem->memory_size; + /* Create HPTEs in the hash page table for the VRMA */ + kvmppc_map_vrma(vcpu, memslot, porder); + + } else { + /* Set up to use an RMO region */ + rma_size = ri->npages; + if (rma_size > memslot->npages) + rma_size = memslot->npages; + rma_size <<= PAGE_SHIFT; rmls = lpcr_rmls(rma_size); + err = -EINVAL; if (rmls < 0) { - pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); - return -EINVAL; + pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); + goto out; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; - kvm->arch.n_rma_pages = rma_size >> porder; /* Update LPCR and RMOR */ lpcr = kvm->arch.lpcr; @@ -1155,53 +1309,35 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; } kvm->arch.lpcr = lpcr; - pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); - } - pg_ix = mem->guest_phys_addr >> porder; - pginfo = kvm->arch.ram_pginfo + pg_ix; - for (i = 0; i < npages; ++i, ++pg_ix) { - if (ri && pg_ix < kvm->arch.n_rma_pages) { - pginfo[i].pfn = ri->base_pfn + - (pg_ix << (porder - PAGE_SHIFT)); - continue; - } - hva = mem->userspace_addr + (i << porder); - page = hva_to_page(hva); - if (!page) { - pr_err("oops, no pfn for hva %lx\n", hva); - goto err; - } - /* Check it's a 16MB page */ - if (!PageHead(page) || - compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { - pr_err("page at %lx isn't 16MB (o=%d)\n", - hva, compound_order(page)); - goto err; - } - pginfo[i].pfn = page_to_pfn(page); + /* Initialize phys addrs of pages in RMO */ + npages = ri->npages; + porder = __ilog2(npages); + physp = kvm->arch.slot_phys[memslot->id]; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; + spin_unlock(&kvm->arch.slot_phys_lock); } - return 0; - - err: - return -EINVAL; -} + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = 1; + err = 0; + out: + mutex_unlock(&kvm->lock); + return err; -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ - if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && - !kvm->arch.rma) - kvmppc_map_vrma(kvm, mem); + up_out: + up_read(¤t->mm->mmap_sem); + goto out; } int kvmppc_core_init_vm(struct kvm *kvm) { long r; - unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); - long err = -ENOMEM; unsigned long lpcr; /* Allocate hashed page table */ @@ -1211,19 +1347,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), - GFP_KERNEL); - if (!kvm->arch.ram_pginfo) { - pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", - npages * sizeof(struct kvmppc_pginfo)); - goto out_free; - } - - kvm->arch.ram_npages = 0; - kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; - kvm->arch.ram_porder = LARGE_PAGE_ORDER; kvm->arch.rma = NULL; - kvm->arch.n_rma_pages = 0; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1241,30 +1365,25 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); lpcr &= LPCR_PECE | LPCR_LPES; lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | - LPCR_VPM0 | LPCR_VRMA_L; + LPCR_VPM0 | LPCR_VPM1; + kvm->arch.vrma_slb_v = SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); } kvm->arch.lpcr = lpcr; + kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); + spin_lock_init(&kvm->arch.slot_phys_lock); return 0; - - out_free: - kvmppc_free_hpt(kvm); - return err; } void kvmppc_core_destroy_vm(struct kvm *kvm) { - struct kvmppc_pginfo *pginfo; unsigned long i; - if (kvm->arch.ram_pginfo) { - pginfo = kvm->arch.ram_pginfo; - kvm->arch.ram_pginfo = NULL; - for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) - if (pginfo[i].pfn) - put_page(pfn_to_page(pginfo[i].pfn)); - kfree(pginfo); - } + if (!kvm->arch.using_mmu_notifiers) + for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + unpin_slot(kvm, i); + if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a795a13f4a70..bed1279aa6a8 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -18,6 +18,15 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define KVM_LINEAR_RMA 0 +#define KVM_LINEAR_HPT 1 + +static void __init kvm_linear_init_one(ulong size, int count, int type); +static struct kvmppc_linear_info *kvm_alloc_linear(int type); +static void kvm_release_linear(struct kvmppc_linear_info *ri); + +/*************** RMA *************/ + /* * This maintains a list of RMAs (real mode areas) for KVM guests to use. * Each RMA has to be physically contiguous and of a size that the @@ -29,32 +38,6 @@ static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ static unsigned long kvm_rma_count; -static int __init early_parse_rma_size(char *p) -{ - if (!p) - return 1; - - kvm_rma_size = memparse(p, &p); - - return 0; -} -early_param("kvm_rma_size", early_parse_rma_size); - -static int __init early_parse_rma_count(char *p) -{ - if (!p) - return 1; - - kvm_rma_count = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("kvm_rma_count", early_parse_rma_count); - -static struct kvmppc_rma_info *rma_info; -static LIST_HEAD(free_rmas); -static DEFINE_SPINLOCK(rma_lock); - /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@ -81,45 +64,106 @@ static inline int lpcr_rmls(unsigned long rma_size) } } +static int __init early_parse_rma_size(char *p) +{ + if (!p) + return 1; + + kvm_rma_size = memparse(p, &p); + + return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +static int __init early_parse_rma_count(char *p) +{ + if (!p) + return 1; + + kvm_rma_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_rma_count", early_parse_rma_count); + +struct kvmppc_linear_info *kvm_alloc_rma(void) +{ + return kvm_alloc_linear(KVM_LINEAR_RMA); +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvmppc_linear_info *ri) +{ + kvm_release_linear(ri); +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + +/*************** HPT *************/ + /* - * Called at boot time while the bootmem allocator is active, - * to allocate contiguous physical memory for the real memory - * areas for guests. + * This maintains a list of big linear HPT tables that contain the GVA->HPA + * memory mappings. If we don't reserve those early on, we might not be able + * to get a big (usually 16MB) linear memory region from the kernel anymore. */ -void __init kvm_rma_init(void) + +static unsigned long kvm_hpt_count; + +static int __init early_parse_hpt_count(char *p) +{ + if (!p) + return 1; + + kvm_hpt_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_hpt_count", early_parse_hpt_count); + +struct kvmppc_linear_info *kvm_alloc_hpt(void) +{ + return kvm_alloc_linear(KVM_LINEAR_HPT); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct kvmppc_linear_info *li) +{ + kvm_release_linear(li); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + +/*************** generic *************/ + +static LIST_HEAD(free_linears); +static DEFINE_SPINLOCK(linear_lock); + +static void __init kvm_linear_init_one(ulong size, int count, int type) { unsigned long i; unsigned long j, npages; - void *rma; + void *linear; struct page *pg; + const char *typestr; + struct kvmppc_linear_info *linear_info; - /* Only do this on PPC970 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_201)) - return; - - if (!kvm_rma_size || !kvm_rma_count) + if (!count) return; - /* Check that the requested size is one supported in hardware */ - if (lpcr_rmls(kvm_rma_size) < 0) { - pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); - return; - } - - npages = kvm_rma_size >> PAGE_SHIFT; - rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); - for (i = 0; i < kvm_rma_count; ++i) { - rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); - pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, - kvm_rma_size >> 20); - rma_info[i].base_virt = rma; - rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; - rma_info[i].npages = npages; - list_add_tail(&rma_info[i].list, &free_rmas); - atomic_set(&rma_info[i].use_count, 0); - - pg = pfn_to_page(rma_info[i].base_pfn); + typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT"; + + npages = size >> PAGE_SHIFT; + linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); + for (i = 0; i < count; ++i) { + linear = alloc_bootmem_align(size, size); + pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, + size >> 20); + linear_info[i].base_virt = linear; + linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; + linear_info[i].npages = npages; + linear_info[i].type = type; + list_add_tail(&linear_info[i].list, &free_linears); + atomic_set(&linear_info[i].use_count, 0); + + pg = pfn_to_page(linear_info[i].base_pfn); for (j = 0; j < npages; ++j) { atomic_inc(&pg->_count); ++pg; @@ -127,30 +171,59 @@ void __init kvm_rma_init(void) } } -struct kvmppc_rma_info *kvm_alloc_rma(void) +static struct kvmppc_linear_info *kvm_alloc_linear(int type) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; ri = NULL; - spin_lock(&rma_lock); - if (!list_empty(&free_rmas)) { - ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); + spin_lock(&linear_lock); + list_for_each_entry(ri, &free_linears, list) { + if (ri->type != type) + continue; + list_del(&ri->list); atomic_inc(&ri->use_count); + break; } - spin_unlock(&rma_lock); + spin_unlock(&linear_lock); + memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT); return ri; } -EXPORT_SYMBOL_GPL(kvm_alloc_rma); -void kvm_release_rma(struct kvmppc_rma_info *ri) +static void kvm_release_linear(struct kvmppc_linear_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - spin_lock(&rma_lock); - list_add_tail(&ri->list, &free_rmas); - spin_unlock(&rma_lock); + spin_lock(&linear_lock); + list_add_tail(&ri->list, &free_linears); + spin_unlock(&linear_lock); } } -EXPORT_SYMBOL_GPL(kvm_release_rma); +/* + * Called at boot time while the bootmem allocator is active, + * to allocate contiguous physical memory for the hash page + * tables for guests. + */ +void __init kvm_linear_init(void) +{ + /* HPT */ + kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); + + /* RMA */ + /* Only do this on PPC970 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return; + + if (!kvm_rma_size || !kvm_rma_count) + return; + + /* Check that the requested size is one supported in hardware */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return; + } + + kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA); +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index bacb0cfa3602..def880aea63a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -11,6 +11,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/hugetlb.h> +#include <linux/module.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -20,95 +21,307 @@ #include <asm/synch.h> #include <asm/ppc-opcode.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ + unsigned long addr = (unsigned long) x; + pte_t *p; -#define HPTE_V_HVLOCK 0x40UL + p = find_linux_pte(swapper_pg_dir, addr); + if (!p || !pte_present(*p)) + return NULL; + /* assume we don't have huge pages in vmalloc space... */ + addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); + return __va(addr); +} -static inline long lock_hpte(unsigned long *hpte, unsigned long bits) +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. + */ +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode) { - unsigned long tmp, old; + struct revmap_entry *head, *tail; + unsigned long i; - asm volatile(" ldarx %0,0,%2\n" - " and. %1,%0,%3\n" - " bne 2f\n" - " ori %0,%0,%4\n" - " stdcx. %0,0,%2\n" - " beq+ 2f\n" - " li %1,%3\n" - "2: isync" - : "=&r" (tmp), "=&r" (old) - : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) - : "cc", "memory"); - return old == 0; + if (*rmap & KVMPPC_RMAP_PRESENT) { + i = *rmap & KVMPPC_RMAP_INDEX; + head = &kvm->arch.revmap[i]; + if (realmode) + head = real_vmalloc_addr(head); + tail = &kvm->arch.revmap[head->back]; + if (realmode) + tail = real_vmalloc_addr(tail); + rev->forw = i; + rev->back = head->back; + tail->forw = pte_index; + head->back = pte_index; + } else { + rev->forw = rev->back = pte_index; + i = pte_index; + } + smp_wmb(); + *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ +} +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, + struct revmap_entry *rev, + unsigned long hpte_v, unsigned long hpte_r) +{ + struct revmap_entry *next, *prev; + unsigned long gfn, ptel, head; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + unsigned long rcbits; + + rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); + ptel = rev->guest_rpte |= rcbits; + gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return; + + rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + lock_rmap(rmap); + + head = *rmap & KVMPPC_RMAP_INDEX; + next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next->back = rev->back; + prev->forw = rev->forw; + if (head == pte_index) { + head = rev->forw; + if (head == pte_index) + *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + else + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; + } + *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; + unlock_rmap(rmap); +} + +static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, + int writing, unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + return kvmppc_read_update_linux_pte(ptep, writing); +} + +static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) +{ + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = hpte_v; } long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { - unsigned long porder; struct kvm *kvm = vcpu->kvm; - unsigned long i, lpn, pa; + unsigned long i, pa, gpa, gfn, psize; + unsigned long slot_fn, hva; unsigned long *hpte; + struct revmap_entry *rev; + unsigned long g_ptel = ptel; + struct kvm_memory_slot *memslot; + unsigned long *physp, pte_size; + unsigned long is_io; + unsigned long *rmap; + pte_t pte; + unsigned int writing; + unsigned long mmu_seq; + unsigned long rcbits; + bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; - /* only handle 4k, 64k and 16M pages for now */ - porder = 12; - if (pteh & HPTE_V_LARGE) { - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (ptel & 0xf000) == 0x1000) { - /* 64k page */ - porder = 16; - } else if ((ptel & 0xff000) == 0) { - /* 16M page */ - porder = 24; - /* lowest AVA bit must be 0 for 16M pages */ - if (pteh & 0x80) - return H_PARAMETER; - } else + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + writing = hpte_is_writable(ptel); + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + pa = 0; + is_io = ~0ul; + rmap = NULL; + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { + /* PPC970 can't do emulated MMIO */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) return H_PARAMETER; + /* Emulated MMIO - mark this with key=31 */ + pteh |= HPTE_V_ABSENT; + ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; + goto do_insert; } - lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; - if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) - return H_PARAMETER; - pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; - if (!pa) + + /* Check if the requested page fits entirely in the memslot. */ + if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; - /* Check WIMG */ - if ((ptel & HPTE_R_WIMG) != HPTE_R_M && - (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) + slot_fn = gfn - memslot->base_gfn; + rmap = &memslot->rmap[slot_fn]; + + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return H_PARAMETER; + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); + pa = *physp; + if (!pa) + return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); + pa &= PAGE_MASK; + } else { + /* Translate to host virtual address */ + hva = gfn_to_hva_memslot(memslot, gfn); + + /* Look up the Linux PTE for the backing page */ + pte_size = psize; + pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); + if (pte_present(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + } + } + if (pte_size < psize) return H_PARAMETER; - pteh &= ~0x60UL; - ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); + if (pa && pte_size > psize) + pa |= gpa & (pte_size - 1); + + ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; - if (pte_index >= (HPT_NPTEG << 3)) + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; + + /* Check WIMG */ + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { + if (is_io) + return H_PARAMETER; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); + ptel |= HPTE_R_M; + } + + /* Find and lock the HPTEG slot to use */ + do_insert: + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - for (i = 0; ; ++i) { - if (i == 8) - return H_PTEG_FULL; + for (i = 0; i < 8; ++i) { if ((*hpte & HPTE_V_VALID) == 0 && - lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) break; hpte += 2; } + if (i == 8) { + /* + * Since try_lock_hpte doesn't retry (not even stdcx. + * failures), it could be that there is a free slot + * but we transiently failed to lock it. Try again, + * actually locking each slot and checking it. + */ + hpte -= 16; + for (i = 0; i < 8; ++i) { + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT))) + break; + *hpte &= ~HPTE_V_HVLOCK; + hpte += 2; + } + if (i == 8) + return H_PTEG_FULL; + } + pte_index += i; } else { - i = 0; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) - return H_PTEG_FULL; + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) { + /* Lock the slot and check again */ + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + *hpte &= ~HPTE_V_HVLOCK; + return H_PTEG_FULL; + } + } } + + /* Save away the guest's idea of the second HPTE dword */ + rev = &kvm->arch.revmap[pte_index]; + if (realmode) + rev = real_vmalloc_addr(rev); + if (rev) + rev->guest_rpte = g_ptel; + + /* Link HPTE into reverse-map chain */ + if (pteh & HPTE_V_VALID) { + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + /* Check for pending invalidations under the rmap chain lock */ + if (kvm->arch.using_mmu_notifiers && + mmu_notifier_retry(vcpu, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + /* Only set R/C in real HPTE if already set in *rmap */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); + } + } + hpte[1] = ptel; + + /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); - vcpu->arch.gpr[4] = pte_index + i; + + vcpu->arch.gpr[4] = pte_index; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_h_enter); #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) @@ -137,37 +350,46 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; + struct revmap_entry *rev; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; - vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; - vcpu->arch.gpr[5] = r = hpte[1]; - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = 0; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + v = hpte[0] & ~HPTE_V_HVLOCK; + if (v & HPTE_V_VALID) { + hpte[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(v, hpte[1], pte_index); + if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } + /* Read PTE low word after tlbie to get final R/C values */ + remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } + r = rev->guest_rpte; + unlock_hpte(hpte, 0); + + vcpu->arch.gpr[4] = v; + vcpu->arch.gpr[5] = r; return H_SUCCESS; } @@ -175,78 +397,117 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long *args = &vcpu->arch.gpr[4]; - unsigned long *hp, tlbrb[4]; - long int i, found; - long int n_inval = 0; - unsigned long flags, req, pte_index; + unsigned long *hp, *hptes[4], tlbrb[4]; + long int i, j, k, n, found, indexes[4]; + unsigned long flags, req, pte_index, rcbits; long int local = 0; long int ret = H_SUCCESS; + struct revmap_entry *rev, *revs[4]; if (atomic_read(&kvm->online_vcpus) == 1) local = 1; - for (i = 0; i < 4; ++i) { - pte_index = args[i * 2]; - flags = pte_index >> 56; - pte_index &= ((1ul << 56) - 1); - req = flags >> 6; - flags &= 3; - if (req == 3) - break; - if (req != 1 || flags == 3 || - pte_index >= (HPT_NPTEG << 3)) { - /* parameter error */ - args[i * 2] = ((0xa0 | flags) << 56) + pte_index; - ret = H_PARAMETER; - break; - } - hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hp, HPTE_V_HVLOCK)) - cpu_relax(); - found = 0; - if (hp[0] & HPTE_V_VALID) { - switch (flags & 3) { - case 0: /* absolute */ - found = 1; + for (i = 0; i < 4 && ret == H_SUCCESS; ) { + n = 0; + for (; i < 4; ++i) { + j = i * 2; + pte_index = args[j]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) { /* no more requests */ + i = 4; break; - case 1: /* andcond */ - if (!(hp[0] & args[i * 2 + 1])) - found = 1; + } + if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { + /* parameter error */ + args[j] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; break; - case 2: /* AVPN */ - if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) + } + hp = (unsigned long *) + (kvm->arch.hpt_virt + (pte_index << 4)); + /* to avoid deadlock, don't spin except for first */ + if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { + if (n) + break; + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + } + found = 0; + if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { + switch (flags & 3) { + case 0: /* absolute */ found = 1; - break; + break; + case 1: /* andcond */ + if (!(hp[0] & args[j + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp[0] & ~0x7fUL) == args[j + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~HPTE_V_HVLOCK; + args[j] = ((0x90 | flags) << 56) + pte_index; + continue; } + + args[j] = ((0x80 | flags) << 56) + pte_index; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + + if (!(hp[0] & HPTE_V_VALID)) { + /* insert R and C bits from PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + continue; + } + + hp[0] &= ~HPTE_V_VALID; /* leave it locked */ + tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); + indexes[n] = j; + hptes[n] = hp; + revs[n] = rev; + ++n; + } + + if (!n) + break; + + /* Now that we've collected a batch, do the tlbies */ + if (!local) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (tlbrb[k]), + "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile("tlbiel %0" : : "r" (tlbrb[k])); + asm volatile("ptesync" : : : "memory"); } - if (!found) { - hp[0] &= ~HPTE_V_HVLOCK; - args[i * 2] = ((0x90 | flags) << 56) + pte_index; - continue; + + /* Read PTE low words after tlbie to get final R/C values */ + for (k = 0; k < n; ++k) { + j = indexes[k]; + pte_index = args[j] & ((1ul << 56) - 1); + hp = hptes[k]; + rev = revs[k]; + remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]); + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; } - /* insert R and C bits from PTE */ - flags |= (hp[1] >> 5) & 0x0c; - args[i * 2] = ((0x80 | flags) << 56) + pte_index; - tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); - hp[0] = 0; - } - if (n_inval == 0) - return ret; - - if (!local) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile(PPC_TLBIE(%1,%0) - : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile("tlbiel %0" : : "r" (tlbrb[i])); - asm volatile("ptesync" : : : "memory"); } + return ret; } @@ -256,40 +517,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, { struct kvm *kvm = vcpu->kvm; unsigned long *hpte; - unsigned long v, r, rb; + struct revmap_entry *rev; + unsigned long v, r, rb, mask, bits; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } + if (atomic_read(&kvm->online_vcpus) == 1) flags |= H_LOCAL; v = hpte[0]; - r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | - HPTE_R_KEY_HI | HPTE_R_KEY_LO); - r |= (flags << 55) & HPTE_R_PP0; - r |= (flags << 48) & HPTE_R_KEY_HI; - r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + bits = (flags << 55) & HPTE_R_PP0; + bits |= (flags << 48) & HPTE_R_KEY_HI; + bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + /* Update guest view of 2nd HPTE dword */ + mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) { + r = (rev->guest_rpte & ~mask) | bits; + rev->guest_rpte = r; + } + r = (hpte[1] & ~mask) | bits; + + /* Update HPTE */ + if (v & HPTE_V_VALID) { + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = v & ~HPTE_V_VALID; + if (!(flags & H_LOCAL)) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } } hpte[1] = r; eieio(); @@ -298,40 +574,243 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } -static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) -{ - long int i; - unsigned long offset, rpn; - - offset = realaddr & (kvm->arch.ram_psize - 1); - rpn = (realaddr - offset) >> PAGE_SHIFT; - for (i = 0; i < kvm->arch.ram_npages; ++i) - if (rpn == kvm->arch.ram_pginfo[i].pfn) - return (i << PAGE_SHIFT) + offset; - return HPTE_R_RPN; /* all 1s in the RPN field */ -} - long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index) { struct kvm *kvm = vcpu->kvm; - unsigned long *hpte, r; + unsigned long *hpte, v, r; int i, n = 1; + struct revmap_entry *rev = NULL; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; - if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) - r = reverse_xlate(kvm, r & HPTE_R_RPN) | - (r & ~HPTE_R_RPN); - vcpu->arch.gpr[4 + i * 2] = hpte[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + if (v & HPTE_V_VALID) + r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } return H_SUCCESS; } + +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + unsigned char rbyte; + + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + rbyte = (hptep[1] & ~HPTE_R_R) >> 8; + /* modify only the second-last byte, which contains the ref bit */ + *((char *)hptep + 14) = rbyte; + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + +static int slb_base_page_shift[4] = { + 24, /* 16M */ + 16, /* 64k */ + 34, /* 16G */ + 20, /* 1M, unsupported */ +}; + +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, + unsigned long valid) +{ + unsigned int i; + unsigned int pshift; + unsigned long somask; + unsigned long vsid, hash; + unsigned long avpn; + unsigned long *hpte; + unsigned long mask, val; + unsigned long v, r; + + /* Get page shift, work out hash and AVPN etc. */ + mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; + val = 0; + pshift = 12; + if (slb_v & SLB_VSID_L) { + mask |= HPTE_V_LARGE; + val |= HPTE_V_LARGE; + pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; + } + if (slb_v & SLB_VSID_B_1T) { + somask = (1UL << 40) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; + vsid ^= vsid << 25; + } else { + somask = (1UL << 28) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; + } + hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; + avpn = slb_v & ~(somask >> 16); /* also includes B */ + avpn |= (eaddr & somask) >> 16; + + if (pshift >= 24) + avpn &= ~((1UL << (pshift - 16)) - 1); + else + avpn &= ~0x7fUL; + val |= avpn; + + for (;;) { + hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)); + + for (i = 0; i < 16; i += 2) { + /* Read the PTE racily */ + v = hpte[i] & ~HPTE_V_HVLOCK; + + /* Check valid/absent, hash, segment size and AVPN */ + if (!(v & valid) || (v & mask) != val) + continue; + + /* Lock the PTE and read it under the lock */ + while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) + cpu_relax(); + v = hpte[i] & ~HPTE_V_HVLOCK; + r = hpte[i+1]; + + /* + * Check the HPTE again, including large page size + * Since we don't currently allow any MPSS (mixed + * page-size segment) page sizes, it is sufficient + * to check against the actual page size. + */ + if ((v & valid) && (v & mask) == val && + hpte_page_size(v, r) == (1ul << pshift)) + /* Return with the HPTE still locked */ + return (hash << 3) + (i >> 1); + + /* Unlock and move on */ + hpte[i] = v; + } + + if (val & HPTE_V_SECONDARY) + break; + val |= HPTE_V_SECONDARY; + hash = hash ^ HPT_HASH_MASK; + } + return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. + * Returns a possibly modified status (DSISR) value if not + * (i.e. pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word (for MMIO emulation), + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data) +{ + struct kvm *kvm = vcpu->kvm; + long int index; + unsigned long v, r, gr; + unsigned long *hpte; + unsigned long valid; + struct revmap_entry *rev; + unsigned long pp, key; + + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; + + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; + r = hpte[1]; + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; + + unlock_hpte(hpte, v); + + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) + return 0; + + /* Check access permissions to the page */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { + /* check write permission */ + if (!hpte_write_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } else { + if (!hpte_read_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } + + /* Check storage key, if applicable */ + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { + unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (status & DSISR_ISSTORE) + perm >>= 1; + if (perm & 1) + return status | DSISR_KEYFAULT; + } + + /* Save HPTE info for virtual-mode handler */ + vcpu->arch.pgfault_addr = addr; + vcpu->arch.pgfault_index = index; + vcpu->arch.pgfault_hpte[0] = v; + vcpu->arch.pgfault_hpte[1] = r; + + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) + return -2; /* MMIO emulation - load instr word */ + + return -1; /* send fault up to host kernel mode */ +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5c8b26183f50..b70bf22a3ff3 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -601,6 +601,30 @@ kvmppc_interrupt: stw r12,VCPU_TRAP(r9) + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -608,7 +632,7 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: - /* See if this is something we can handle in real mode */ + /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode @@ -624,6 +648,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +nohpte_cont: hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC @@ -632,36 +657,21 @@ hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ add r5,r5,r6 std r5,VCPU_DEC_EXPIRES(r9) - /* Save HEIR (HV emulation assist reg) in last_inst - if this is an HEI (HV emulation interrupt, e40) */ - li r3,-1 -BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST - bne 11f - mfspr r3,SPRN_HEIR -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -11: stw r3,VCPU_LAST_INST(r9) - /* Save more register state */ - mfxer r5 mfdar r6 mfdsisr r7 - mfctr r8 - - stw r5, VCPU_XER(r9) std r6, VCPU_DAR(r9) stw r7, VCPU_DSISR(r9) - std r8, VCPU_CTR(r9) - /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ BEGIN_FTR_SECTION + /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -7: std r6, VCPU_FAULT_DAR(r9) + std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF +6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1094,9 +1104,131 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_HSRR1, r7 ba 0x500 -6: mfspr r6,SPRN_HDAR - mfspr r7,SPRN_HDSISR - b 7b +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f + + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b + + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) + + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 + + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) + + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + b nohpte_cont + +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b /* * Try to handle an hcall in real mode. diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 7b0ee96c1bed..e70ef2d86431 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -196,7 +196,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + len, 1); goto done_load; } @@ -286,11 +287,13 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if ((r == EMULATE_DO_MMIO) && w) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + 4, 1); vcpu->arch.qpr[rs] = tmp[1]; goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, + 8, 1); goto done_load; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 220fcdf26978..7340e1090b77 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -51,15 +51,19 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define __hard_irq_disable local_irq_disable +#define __hard_irq_enable local_irq_enable #endif void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu_put(svcpu); #endif #ifdef CONFIG_PPC_BOOK3S_32 @@ -70,10 +74,12 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; + to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; + svcpu_put(svcpu); #endif kvmppc_giveup_ext(vcpu, MSR_FP); @@ -151,14 +157,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_32; } @@ -308,19 +316,22 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + (svcpu->shadow_srr1 & 0x00000000f8000000ULL); + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = - to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + svcpu->shadow_srr1 & 0x00000000f8000000ULL; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -517,24 +528,29 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->ready_for_interrupt_injection = 1; trace_kvm_book3s_exit(exit_nr, vcpu); + preempt_enable(); kvm_resched(vcpu); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] - == SR_INVALID) { + if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ - if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { + if (shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -547,33 +563,37 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.shared->msr |= - to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } break; + } case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 fault_dsisr = svcpu->fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { + if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, dar); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* The only case we need to handle is missing shadow PTEs */ - if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { + if (fault_dsisr & DSISR_NOHPTE) { r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); } else { vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = fault_dsisr; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -609,10 +629,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; + svcpu = svcpu_get(vcpu); + flags = svcpu->shadow_srr1 & 0x1f0000ull; + svcpu_put(svcpu); if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -740,20 +763,33 @@ program_interrupt: r = RESUME_GUEST; break; default: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); + exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); r = RESUME_HOST; BUG(); break; } - + } if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other * reason. */ + + /* + * Interrupts could be timers for the guest which we have to + * inject again, so let's postpone them until we're in the guest + * and if we really did time things so badly, then we just exit + * again due to a host external interrupt. + */ + __hard_irq_disable(); if (signal_pending(current)) { + __hard_irq_enable(); #ifdef EXIT_DEBUG printk(KERN_EMERG "KVM: Going back to host\n"); #endif @@ -761,10 +797,12 @@ program_interrupt: run->exit_reason = KVM_EXIT_INTR; r = -EINTR; } else { + preempt_disable(); + /* In case an interrupt came in that was triggered * from userspace (like DEC), we need to check what * to inject now! */ - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } @@ -836,6 +874,38 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + if (!r) + to_book3s(vcpu)->hior_explicit = true; + break; + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { return 0; @@ -923,16 +993,31 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; + preempt_disable(); + /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return -EINVAL; + ret = -EINVAL; + goto out; } + kvmppc_core_prepare_to_enter(vcpu); + + /* + * Interrupts could be timers for the guest which we have to inject + * again, so let's postpone them until we're in the guest and if we + * really did time things so badly, then we just exit again due to + * a host external interrupt. + */ + __hard_irq_disable(); + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { + __hard_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ret = -EINTR; + goto out; } /* Save FPU state in stack */ @@ -974,8 +1059,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_guest_exit(); - local_irq_disable(); - current->thread.regs->msr = ext_msr; /* Make sure we save the guest FPU/Altivec/VSX state */ @@ -1002,9 +1085,50 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) current->thread.used_vsr = used_vsr; #endif +out: + preempt_enable(); return ret; } +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + memslot = id_to_memslot(kvm->memslots, log->slot); + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bb6c988f010a..ee9e1ee9c858 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -124,12 +124,6 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); - - if (vcpu->arch.shared->msr & MSR_WE) { - kvm_vcpu_block(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - }; - kvmppc_vcpu_sync_spe(vcpu); } @@ -258,9 +252,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, allowed = vcpu->arch.shared->msr & MSR_ME; msr_mask = 0; break; - case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: + keep_irq = true; + /* fall through */ + case BOOKE_IRQPRIO_EXTERNAL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; msr_mask = MSR_CE|MSR_ME|MSR_DE; @@ -276,7 +272,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.esr = vcpu->arch.queued_esr; + vcpu->arch.shared->esr = vcpu->arch.queued_esr; if (update_dear == true) vcpu->arch.shared->dar = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); @@ -288,13 +284,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } -/* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) + kvmppc_core_queue_dec(vcpu); + else + kvmppc_core_dequeue_dec(vcpu); +} + +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; - unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; + if (vcpu->requests) { + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { + smp_mb(); + update_timer_ints(vcpu); + } + } + priority = __ffs(*pending); while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -306,10 +315,24 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) } /* Tell the guest about our interrupt status */ - if (*pending) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; + vcpu->arch.shared->int_pending = !!*pending; +} + +/* Check pending exceptions and deliver one, if possible. */ +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(!irqs_disabled()); + + kvmppc_core_check_exceptions(vcpu); + + if (vcpu->arch.shared->msr & MSR_WE) { + local_irq_enable(); + kvm_vcpu_block(vcpu); + local_irq_disable(); + + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + kvmppc_core_check_exceptions(vcpu); + }; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) @@ -322,11 +345,21 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); + + kvmppc_core_prepare_to_enter(vcpu); + + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + ret = -EINTR; + goto out; + } + kvm_guest_enter(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); kvm_guest_exit(); - local_irq_enable(); +out: + local_irq_enable(); return ret; } @@ -603,7 +636,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_disable(); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if @@ -628,6 +661,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -662,10 +696,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -690,10 +724,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -711,7 +745,7 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.esr; + sregs->u.e.esr = vcpu->arch.shared->esr; sregs->u.e.dear = vcpu->arch.shared->dar; sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; @@ -729,28 +763,19 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.esr = sregs->u.e.esr; + vcpu->arch.shared->esr = sregs->u.e.esr; vcpu->arch.shared->dar = sregs->u.e.dear; vcpu->arch.vrsave = sregs->u.e.vrsave; - vcpu->arch.tcr = sregs->u.e.tcr; + kvmppc_set_tcr(vcpu, sregs->u.e.tcr); - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { vcpu->arch.dec = sregs->u.e.dec; - - kvmppc_emulate_dec(vcpu); + kvmppc_emulate_dec(vcpu); + } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - /* - * FIXME: existing KVM timer handling is incomplete. - * TSR cannot be read by the guest, and its value in - * vcpu->arch is always zero. For now, just handle - * the case where the caller is trying to inject a - * decrementer interrupt. - */ - - if ((sregs->u.e.tsr & TSR_DIS) && - (vcpu->arch.tcr & TCR_DIE)) - kvmppc_core_queue_dec(vcpu); + vcpu->arch.tsr = sregs->u.e.tsr; + update_timer_ints(vcpu); } return 0; @@ -761,7 +786,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu, { sregs->u.e.features |= KVM_SREGS_E_ARCH206; - sregs->u.e.pir = 0; + sregs->u.e.pir = vcpu->vcpu_id; sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; sregs->u.e.decar = vcpu->arch.decar; @@ -774,7 +799,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) return 0; - if (sregs->u.e.pir != 0) + if (sregs->u.e.pir != vcpu->vcpu_id) return -EINVAL; vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; @@ -862,6 +887,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return kvmppc_core_set_sregs(vcpu, sregs); } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; @@ -906,6 +941,33 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) { } +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ + vcpu->arch.tcr = new_tcr; + update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + set_bits(tsr_bits, &vcpu->arch.tsr); + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + clear_bits(tsr_bits, &vcpu->arch.tsr); + update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + int __init kvmppc_booke_init(void) { unsigned long ivor[16]; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 8e1fe33d64e5..2fe202705a3f 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -55,6 +55,10 @@ extern unsigned long kvmppc_booke_handlers; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); + int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 1260f5f24c0c..3e652da36534 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -107,7 +108,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DEAR: vcpu->arch.shared->dar = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = spr_val; break; + vcpu->arch.shared->esr = spr_val; break; case SPRN_DBCR0: vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: @@ -115,23 +116,23 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DBSR: vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~spr_val; break; + kvmppc_clr_tsr_bits(vcpu, spr_val); + break; case SPRN_TCR: - vcpu->arch.tcr = spr_val; - kvmppc_emulate_dec(vcpu); + kvmppc_set_tcr(vcpu, spr_val); break; /* Note: SPRG4-7 are user-readable. These values are * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = spr_val; break; + vcpu->arch.shared->sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = spr_val; break; + vcpu->arch.shared->sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = spr_val; break; + vcpu->arch.shared->sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = spr_val; break; + vcpu->arch.shared->sprg7 = spr_val; break; case SPRN_IVPR: vcpu->arch.ivpr = spr_val; @@ -202,13 +203,17 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_DEAR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; case SPRN_ESR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break; case SPRN_DBCR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; + case SPRN_TSR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break; + case SPRN_TCR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break; case SPRN_IVOR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 42f2fb1f66e9..10d8ef602e5c 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -402,19 +402,25 @@ lightweight_exit: /* Save vcpu pointer for the exception handlers. */ mtspr SPRN_SPRG_WVCPU, r4 + lwz r5, VCPU_SHARED(r4) + /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ lwz r1, VCPU_GPR(r1)(r4) - /* Host interrupt handlers may have clobbered these guest-readable - * SPRGs, so we need to reload them here with the guest's values. */ - lwz r3, VCPU_SPRG4(r4) + /* + * Host interrupt handlers may have clobbered these + * guest-readable SPRGs, or the guest kernel may have + * written directly to the shared area, so we + * need to reload them here with the guest's values. + */ + lwz r3, VCPU_SHARED_SPRG4(r5) mtspr SPRN_SPRG4W, r3 - lwz r3, VCPU_SPRG5(r4) + lwz r3, VCPU_SHARED_SPRG5(r5) mtspr SPRN_SPRG5W, r3 - lwz r3, VCPU_SPRG6(r4) + lwz r3, VCPU_SHARED_SPRG6(r5) mtspr SPRN_SPRG6W, r3 - lwz r3, VCPU_SPRG7(r4) + lwz r3, VCPU_SHARED_SPRG7(r5) mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 8c0d45a6faf7..ddcd896fa2ff 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -71,9 +71,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pvr = mfspr(SPRN_PVR); vcpu_e500->svr = mfspr(SPRN_SVR); - /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ - vcpu->vcpu_id = 0; - vcpu->arch.cpu_type = KVM_CPU_E500V2; return 0; @@ -118,12 +115,12 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; - sregs->u.e.mas0 = vcpu_e500->mas0; - sregs->u.e.mas1 = vcpu_e500->mas1; - sregs->u.e.mas2 = vcpu_e500->mas2; - sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; - sregs->u.e.mas4 = vcpu_e500->mas4; - sregs->u.e.mas6 = vcpu_e500->mas6; + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; @@ -151,13 +148,12 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) } if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { - vcpu_e500->mas0 = sregs->u.e.mas0; - vcpu_e500->mas1 = sregs->u.e.mas1; - vcpu_e500->mas2 = sregs->u.e.mas2; - vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; - vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; - vcpu_e500->mas4 = sregs->u.e.mas4; - vcpu_e500->mas6 = sregs->u.e.mas6; + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; } if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) @@ -233,6 +229,10 @@ static int __init kvmppc_e500_init(void) unsigned long ivor[3]; unsigned long max_ivor = 0; + r = kvmppc_core_check_processor_compat(); + if (r) + return r; + r = kvmppc_booke_init(); if (r) return r; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index d48ae396f41e..6d0b2bd54fb0 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -89,19 +89,23 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) return EMULATE_FAIL; vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = spr_val; break; + vcpu->arch.shared->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = spr_val; break; + vcpu->arch.shared->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = spr_val; break; + vcpu->arch.shared->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = spr_val; break; + vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= spr_val; + break; case SPRN_MAS4: - vcpu_e500->mas4 = spr_val; break; + vcpu->arch.shared->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = spr_val; break; + vcpu->arch.shared->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = spr_val; break; + vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; + break; case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); @@ -143,6 +147,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + unsigned long val; switch (sprn) { case SPRN_PID: @@ -152,20 +157,23 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PID2: kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break; case SPRN_MAS1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break; case SPRN_MAS2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break; case SPRN_MAS3: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; + val = (u32)vcpu->arch.shared->mas7_3; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_MAS4: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break; case SPRN_MAS6: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break; case SPRN_MAS7: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; - + val = vcpu->arch.shared->mas7_3 >> 32; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_TLB0CFG: kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 13c432ea2fa8..6e53e4164de1 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -12,12 +12,19 @@ * published by the Free Software Foundation. */ +#include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> #include <asm/kvm_ppc.h> #include <asm/kvm_e500.h> @@ -26,7 +33,7 @@ #include "trace.h" #include "timing.h" -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) struct id { unsigned long val; @@ -63,7 +70,14 @@ static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); * The valid range of shadow ID is [1..255] */ static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); -static unsigned int tlb1_entry_num; +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} /* * Allocate a free shadow id and setup a valid sid mapping in given entry. @@ -116,13 +130,11 @@ static inline int local_sid_lookup(struct id *entry) return -1; } -/* Invalidate all id mappings on local core */ +/* Invalidate all id mappings on local core -- call with preempt disabled */ static inline void local_sid_destroy_all(void) { - preempt_disable(); __get_cpu_var(pcpu_last_used_sid) = 0; memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); - preempt_enable(); } static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) @@ -218,34 +230,13 @@ void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) preempt_enable(); } -void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *tlbe; - int i, tlbsel; - - printk("| %8s | %8s | %8s | %8s | %8s |\n", - "nr", "mas1", "mas2", "mas3", "mas7"); - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - printk("Guest TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; - if (tlbe->mas1 & MAS1_VALID) - printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", - tlbsel, i, tlbe->mas1, tlbe->mas2, - tlbe->mas3, tlbe->mas7); - } - } -} - -static inline unsigned int tlb0_get_next_victim( +static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) { unsigned int victim; victim = vcpu_e500->gtlb_nv[0]++; - if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) + if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) vcpu_e500->gtlb_nv[0] = 0; return victim; @@ -254,12 +245,12 @@ static inline unsigned int tlb0_get_next_victim( static inline unsigned int tlb1_max_shadow_size(void) { /* reserve one entry for magic page */ - return tlb1_entry_num - tlbcam_index - 1; + return host_tlb_params[1].entries - tlbcam_index - 1; } -static inline int tlbe_is_writable(struct tlbe *tlbe) +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) { - return tlbe->mas3 & (MAS3_SW|MAS3_UW); + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) @@ -290,40 +281,66 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0) { unsigned long flags; local_irq_save(flags); mtspr(SPRN_MAS0, mas0); mtspr(SPRN_MAS1, stlbe->mas1); - mtspr(SPRN_MAS2, stlbe->mas2); - mtspr(SPRN_MAS3, stlbe->mas3); - mtspr(SPRN_MAS7, stlbe->mas7); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); asm volatile("isync; tlbwe" : : : "memory"); local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. + */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + local_irq_restore(flags); + + return mas0; } +/* sesel is for tlb1 only */ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) { + u32 mas0; + if (tlbsel == 0) { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(0) | - MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(esel))); + MAS0_ESEL(to_htlb1_esel(sesel))); } - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); } void kvmppc_map_magic(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe magic; + struct kvm_book3e_206_tlb_entry magic; ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; unsigned int stid; pfn_t pfn; @@ -337,9 +354,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | MAS1_TSIZE(BOOK3E_PAGESZ_4K); magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; - magic.mas3 = (pfn << PAGE_SHIFT) | - MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; - magic.mas7 = pfn >> (32 - PAGE_SHIFT); + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); preempt_enable(); @@ -357,10 +374,11 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) { } -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); struct vcpu_id_table *idt = vcpu_e500->idt; unsigned int pr, tid, ts, pid; u32 val, eaddr; @@ -414,25 +432,57 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, preempt_enable(); } +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ + int set_base; + + set_base = (addr >> PAGE_SHIFT) & (sets - 1); + set_base *= ways; + + return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ + return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, + vcpu_e500->gtlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel = get_tlb_esel_bit(vcpu); + + if (tlbsel == 0) { + esel &= vcpu_e500->gtlb_params[0].ways - 1; + esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); + } else { + esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; + } + + return esel; +} + /* Search the guest TLB for a matching entry. */ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t eaddr, int tlbsel, unsigned int pid, int as) { - int size = vcpu_e500->gtlb_size[tlbsel]; - int set_base; + int size = vcpu_e500->gtlb_params[tlbsel].entries; + unsigned int set_base, offset; int i; if (tlbsel == 0) { - int mask = size / KVM_E500_TLB0_WAY_NUM - 1; - set_base = (eaddr >> PAGE_SHIFT) & mask; - set_base *= KVM_E500_TLB0_WAY_NUM; - size = KVM_E500_TLB0_WAY_NUM; + set_base = gtlb0_set_base(vcpu_e500, eaddr); + size = vcpu_e500->gtlb_params[0].ways; } else { set_base = 0; } + offset = vcpu_e500->gtlb_offset[tlbsel]; + for (i = 0; i < size; i++) { - struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + set_base + i]; unsigned int tid; if (eaddr < get_tlb_eaddr(tlbe)) @@ -457,27 +507,55 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, return -1; } -static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, - struct tlbe *gtlbe, - pfn_t pfn) +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct kvm_book3e_206_tlb_entry *gtlbe, + pfn_t pfn) { - priv->pfn = pfn; - priv->flags = E500_TLB_VALID; + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; if (tlbe_is_writable(gtlbe)) - priv->flags |= E500_TLB_DIRTY; + ref->flags |= E500_TLB_DIRTY; } -static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { - if (priv->flags & E500_TLB_VALID) { - if (priv->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(priv->pfn); + if (ref->flags & E500_TLB_VALID) { + if (ref->flags & E500_TLB_DIRTY) + kvm_release_pfn_dirty(ref->pfn); else - kvm_release_pfn_clean(priv->pfn); + kvm_release_pfn_clean(ref->pfn); + + ref->flags = 0; + } +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel = 0; + int i; + + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } +} + +static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int stlbsel = 1; + int i; + + kvmppc_e500_id_table_reset_all(vcpu_e500); - priv->flags = 0; + for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->tlb_refs[stlbsel][i]; + kvmppc_e500_ref_release(ref); } + + clear_tlb_privs(vcpu_e500); } static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, @@ -488,59 +566,54 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, int tlbsel; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; - pidsel = (vcpu_e500->mas4 >> 16) & 0xf; - tsized = (vcpu_e500->mas4 >> 7) & 0x1f; + tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf; + tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) | MAS1_TID(vcpu_e500->pid[pidsel]) | MAS1_TSIZE(tsized); - vcpu_e500->mas2 = (eaddr & MAS2_EPN) - | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) + vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) + | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) | (get_cur_pid(vcpu) << 16) | (as ? MAS6_SAS : 0); - vcpu_e500->mas7 = 0; } -static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct tlbe *gtlbe, int tsize, - struct tlbe_priv *priv, - u64 gvaddr, struct tlbe *stlbe) +/* TID must be supplied by the caller */ +static inline void kvmppc_e500_setup_stlbe( + struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) { - pfn_t pfn = priv->pfn; - unsigned int stid; + pfn_t pfn = ref->pfn; - stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), - get_tlb_tid(gtlbe), - get_cur_pr(&vcpu_e500->vcpu), 0); + BUG_ON(!(ref->flags & E500_TLB_VALID)); /* Force TS=1 IPROT=0 for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(tsize) - | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; + stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) - | e500_shadow_mas3_attrib(gtlbe->mas3, + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) + | e500_shadow_mas3_attrib(gtlbe->mas7_3, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; } - static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, - struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) { struct kvm_memory_slot *slot; unsigned long pfn, hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; - struct tlbe_priv *priv; /* * Translate guest physical to true physical, acquiring @@ -621,12 +694,31 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pfn &= ~(tsize_pages - 1); break; } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); } up_read(¤t->mm->mmap_sem); } if (likely(!pfnmap)) { + unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", @@ -634,45 +726,52 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvm_release_pfn_clean(pfn); return; } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); } - /* Drop old priv and setup new one. */ - priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_priv_release(priv); - kvmppc_e500_priv_setup(priv, gtlbe, pfn); + /* Drop old ref and setup new one. */ + kvmppc_e500_ref_release(ref); + kvmppc_e500_ref_setup(ref, gtlbe, pfn); - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); + kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe); } /* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, struct tlbe *stlbe) +static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, + int esel, + struct kvm_book3e_206_tlb_entry *stlbe) { - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; + struct tlbe_ref *ref; - gtlbe = &vcpu_e500->gtlb_arch[0][esel]; + gtlbe = get_entry(vcpu_e500, 0, esel); + ref = &vcpu_e500->gtlb_priv[0][esel].ref; kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, esel, stlbe); - - return esel; + gtlbe, 0, stlbe, ref); } /* Caller must ensure that the specified guest TLB entry is safe to insert into * the shadow TLB. */ /* XXX for both one-one and one-to-many , for now use TLB1 */ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe) { + struct tlbe_ref *ref; unsigned int victim; - victim = vcpu_e500->gtlb_nv[1]++; + victim = vcpu_e500->host_tlb1_nv++; - if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) - vcpu_e500->gtlb_nv[1] = 0; + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); + ref = &vcpu_e500->tlb_refs[1][victim]; + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref); return victim; } @@ -689,7 +788,8 @@ static inline int kvmppc_e500_gtlbe_invalidate( struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); if (unlikely(get_tlb_iprot(gtlbe))) return -1; @@ -704,10 +804,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) int esel; if (value & MMUCSR0_TLB0FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); if (value & MMUCSR0_TLB1FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); /* Invalidate all vcpu id mappings */ @@ -732,7 +832,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) if (ia) { /* invalidate all entries */ - for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; + esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } else { ea &= 0xfffff000; @@ -752,18 +853,17 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int tlbsel, esel; - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - vcpu_e500->mas0 &= ~MAS0_NV(~0); - vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); + vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; return EMULATE_DONE; } @@ -771,10 +871,10 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int as = !!get_cur_sas(vcpu_e500); - unsigned int pid = get_cur_spid(vcpu_e500); + int as = !!get_cur_sas(vcpu); + unsigned int pid = get_cur_spid(vcpu); int esel, tlbsel; - struct tlbe *gtlbe = NULL; + struct kvm_book3e_206_tlb_entry *gtlbe = NULL; gva_t ea; ea = kvmppc_get_gpr(vcpu, rb); @@ -782,70 +882,90 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); if (esel >= 0) { - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); break; } } if (gtlbe) { - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; } else { int victim; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = vcpu_e500->mas4 >> 28 & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) + | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) - | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) - | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); - vcpu_e500->mas2 &= MAS2_EPN; - vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas7 = 0; + vcpu->arch.shared->mas1 = + (vcpu->arch.shared->mas6 & MAS6_SPID0) + | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); + vcpu->arch.shared->mas2 &= MAS2_EPN; + vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & + MAS2_ATTRIB_MASK; + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | + MAS3_U2 | MAS3_U3; } kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; } +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), + get_cur_pr(&vcpu_e500->vcpu), 0); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; int tlbsel, esel; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); if (get_tlb_v(gtlbe)) - kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); - gtlbe->mas1 = vcpu_e500->mas1; - gtlbe->mas2 = vcpu_e500->mas2; - gtlbe->mas3 = vcpu_e500->mas3; - gtlbe->mas7 = vcpu_e500->mas7; + gtlbe->mas1 = vcpu->arch.shared->mas1; + gtlbe->mas2 = vcpu->arch.shared->mas2; + gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; - trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, - gtlbe->mas3, gtlbe->mas7); + trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, + gtlbe->mas2, gtlbe->mas7_3); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { - struct tlbe stlbe; + struct kvm_book3e_206_tlb_entry stlbe; int stlbsel, sesel; u64 eaddr; u64 raddr; - preempt_disable(); switch (tlbsel) { case 0: /* TLB0 */ @@ -853,7 +973,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); stlbsel = 0; - sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + sesel = 0; /* unused */ break; @@ -874,8 +995,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) default: BUG(); } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); @@ -914,9 +1035,11 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, gva_t eaddr) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe = - &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; - u64 pgmask = get_tlb_bytes(gtlbe) - 1; + struct kvm_book3e_206_tlb_entry *gtlbe; + u64 pgmask; + + gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); + pgmask = get_tlb_bytes(gtlbe) - 1; return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } @@ -930,22 +1053,21 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); struct tlbe_priv *priv; - struct tlbe *gtlbe, stlbe; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; int tlbsel = tlbsel_of(index); int esel = esel_of(index); int stlbsel, sesel; - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); - preempt_disable(); switch (tlbsel) { case 0: stlbsel = 0; - sesel = esel; - priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; + sesel = 0; /* unused */ + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, - priv, eaddr, &stlbe); + &priv->ref, eaddr, &stlbe); break; case 1: { @@ -962,8 +1084,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, break; } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, @@ -993,85 +1114,279 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) { - struct tlbe *tlbe; + struct kvm_book3e_206_tlb_entry *tlbe; /* Insert large initial mapping for guest. */ - tlbe = &vcpu_e500->gtlb_arch[1][0]; + tlbe = get_entry(vcpu_e500, 1, 0); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); tlbe->mas2 = 0; - tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = &vcpu_e500->gtlb_arch[1][1]; + tlbe = get_entry(vcpu_e500, 1, 1); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; - tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int i; + + clear_tlb_refs(vcpu_e500); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); + + if (vcpu_e500->shared_tlb_pages) { + vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, + PAGE_SIZE))); + + for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { + set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); + put_page(vcpu_e500->shared_tlb_pages[i]); + } + + vcpu_e500->num_shared_tlb_pages = 0; + vcpu_e500->shared_tlb_pages = NULL; + } else { + kfree(vcpu_e500->gtlb_arch); + } + + vcpu_e500->gtlb_arch = NULL; +} + +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_params params; + char *virt; + struct page **pages; + struct tlbe_priv *privs[2] = {}; + size_t array_len; + u32 sets; + int num_pages, ret, i; + + if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)cfg->params, + sizeof(params))) + return -EFAULT; + + if (params.tlb_sizes[1] > 64) + return -EINVAL; + if (params.tlb_ways[1] != params.tlb_sizes[1]) + return -EINVAL; + if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0) + return -EINVAL; + if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) + return -EINVAL; + + if (!is_power_of_2(params.tlb_ways[0])) + return -EINVAL; + + sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); + if (!is_power_of_2(sets)) + return -EINVAL; + + array_len = params.tlb_sizes[0] + params.tlb_sizes[1]; + array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + + if (cfg->array_len < array_len) + return -EINVAL; + + num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - + cfg->array / PAGE_SIZE; + pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + if (ret < 0) + goto err_pages; + + if (ret != num_pages) { + num_pages = ret; + ret = -EFAULT; + goto err_put_page; + } + + virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); + if (!virt) + goto err_put_page; + + privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], + GFP_KERNEL); + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], + GFP_KERNEL); + + if (!privs[0] || !privs[1]) + goto err_put_page; + + free_gtlb(vcpu_e500); + + vcpu_e500->gtlb_priv[0] = privs[0]; + vcpu_e500->gtlb_priv[1] = privs[1]; + + vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) + (virt + (cfg->array & (PAGE_SIZE - 1))); + + vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; + vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + + vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if (params.tlb_sizes[0] <= 2048) + vcpu_e500->tlb0cfg |= params.tlb_sizes[0]; + vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb1cfg |= params.tlb_sizes[1]; + vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->shared_tlb_pages = pages; + vcpu_e500->num_shared_tlb_pages = num_pages; + + vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; + vcpu_e500->gtlb_params[0].sets = sets; + + vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; + vcpu_e500->gtlb_params[1].sets = 1; + + return 0; + +err_put_page: + kfree(privs[0]); + kfree(privs[1]); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + +err_pages: + kfree(pages); + return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *dirty) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + clear_tlb_refs(vcpu_e500); + return 0; } int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { - tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; - - vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_arch[0] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[0] == NULL) - goto err_out; - - vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; - vcpu_e500->gtlb_arch[1] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[1] == NULL) - goto err_out_guest0; - - vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_priv[0] == NULL) - goto err_out_guest1; - vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - - if (vcpu_e500->gtlb_priv[1] == NULL) - goto err_out_priv0; + int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); + int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case. + */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; + + vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; + + vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; + vcpu_e500->gtlb_params[0].sets = + KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_params[1].sets = 1; + + vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + if (!vcpu_e500->gtlb_arch) + return -ENOMEM; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; + + vcpu_e500->tlb_refs[0] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[0]) + goto err; + + vcpu_e500->tlb_refs[1] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[1]) + goto err; + + vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[0]) + goto err; + + vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[1]) + goto err; if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) - goto err_out_priv1; + goto err; /* Init TLB configuration register */ - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; return 0; -err_out_priv1: - kfree(vcpu_e500->gtlb_priv[1]); -err_out_priv0: - kfree(vcpu_e500->gtlb_priv[0]); -err_out_guest1: - kfree(vcpu_e500->gtlb_arch[1]); -err_out_guest0: - kfree(vcpu_e500->gtlb_arch[0]); -err_out: +err: + free_gtlb(vcpu_e500); + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { - int stlbsel, i; - - /* release all privs */ - for (stlbsel = 0; stlbsel < 2; stlbsel++) - for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { - struct tlbe_priv *priv = - &vcpu_e500->gtlb_priv[stlbsel][i]; - kvmppc_e500_priv_release(priv); - } - + free_gtlb(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); - kfree(vcpu_e500->gtlb_arch[1]); - kfree(vcpu_e500->gtlb_arch[0]); + + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 59b88e99a235..5c6d2d7bf058 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -20,13 +20,9 @@ #include <asm/tlb.h> #include <asm/kvm_e500.h> -#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ -#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) -#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) - -#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */ -#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT) -#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1) +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 #define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) #define KVM_E500_TLB1_SIZE 16 @@ -58,50 +54,54 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); /* TLB helper functions */ -static inline unsigned int get_tlb_size(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 7) & 0x1f; } -static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) { return tlbe->mas2 & 0xfffff000; } -static inline u64 get_tlb_bytes(const struct tlbe *tlbe) +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) { unsigned int pgsize = get_tlb_size(tlbe); return 1ULL << 10 << pgsize; } -static inline gva_t get_tlb_end(const struct tlbe *tlbe) +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) { u64 bytes = get_tlb_bytes(tlbe); return get_tlb_eaddr(tlbe) + bytes - 1; } -static inline u64 get_tlb_raddr(const struct tlbe *tlbe) +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) { - u64 rpn = tlbe->mas7; - return (rpn << 32) | (tlbe->mas3 & 0xfffff000); + return tlbe->mas7_3 & ~0xfffULL; } -static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 16) & 0xff; } -static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 12) & 0x1; } -static inline unsigned int get_tlb_v(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 31) & 0x1; } -static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 30) & 0x1; } @@ -121,59 +121,37 @@ static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) return !!(vcpu->arch.shared->msr & MSR_PR); } -static inline unsigned int get_cur_spid( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas6 >> 16) & 0xff; + return (vcpu->arch.shared->mas6 >> 16) & 0xff; } -static inline unsigned int get_cur_sas( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) { - return vcpu_e500->mas6 & 0x1; + return vcpu->arch.shared->mas6 & 0x1; } -static inline unsigned int get_tlb_tlbsel( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) { /* * Manual says that tlbsel has 2 bits wide. * Since we only have two TLBs, only lower bit is used. */ - return (vcpu_e500->mas0 >> 28) & 0x1; -} - -static inline unsigned int get_tlb_nv_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return vcpu_e500->mas0 & 0xfff; + return (vcpu->arch.shared->mas0 >> 28) & 0x1; } -static inline unsigned int get_tlb_esel_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas0 >> 16) & 0xfff; + return vcpu->arch.shared->mas0 & 0xfff; } -static inline unsigned int get_tlb_esel( - const struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel) +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) { - unsigned int esel = get_tlb_esel_bit(vcpu_e500); - - if (tlbsel == 0) { - esel &= KVM_E500_TLB0_WAY_NUM_MASK; - esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) - << KVM_E500_TLB0_WAY_NUM_BIT; - } else { - esel &= KVM_E500_TLB1_SIZE - 1; - } - - return esel; + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; } static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, - const struct tlbe *tlbe) + const struct kvm_book3e_206_tlb_entry *tlbe) { gpa_t gpa; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 141dce3c6810..968f40101883 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -69,54 +70,55 @@ #define OP_STH 44 #define OP_STHU 45 -#ifdef CONFIG_PPC_BOOK3S -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return 1; -} -#else -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.tcr & TCR_DIE; -} -#endif - void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; + unsigned long long dec_time; pr_debug("mtDEC: %x\n", vcpu->arch.dec); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + #ifdef CONFIG_PPC_BOOK3S /* mtdec lowers the interrupt line when positive. */ kvmppc_core_dequeue_dec(vcpu); /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); kvmppc_core_queue_dec(vcpu); return; } #endif - if (kvmppc_dec_enabled(vcpu)) { - /* The decrementer ticks at the same rate as the timebase, so - * that's how we convert the guest DEC value to the number of - * host ticks. */ - - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - dec_nsec = vcpu->arch.dec; - dec_nsec *= 1000; - dec_nsec /= tb_ticks_per_usec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); - vcpu->arch.dec_jiffies = get_tb(); - } else { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - } + +#ifdef CONFIG_BOOKE + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + if (vcpu->arch.dec == 0) + return; +#endif + + /* + * The decrementer ticks at the same rate as the timebase, so + * that's how we convert the guest DEC value to the number of + * host ticks. + */ + + dec_time = vcpu->arch.dec; + dec_time *= 1000; + do_div(dec_time, tb_ticks_per_usec); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); } u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) { u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE + if (vcpu->arch.dec < jd) + return 0; +#endif + return vcpu->arch.dec - jd; } @@ -159,7 +161,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_TRAP_64: kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); #endif advance = 0; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 607fbdf24b84..00d7e345b3fe 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,7 +39,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions); + !!(v->arch.pending_exceptions) || + v->requests; } int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) @@ -66,7 +67,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) vcpu->arch.magic_page_pa = param1; vcpu->arch.magic_page_ea = param2; - r2 = KVM_MAGIC_FEAT_SR; + r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; r = HC_EV_SUCCESS; break; @@ -171,8 +172,11 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + return kvmppc_core_init_vm(kvm); } @@ -208,17 +212,22 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ONE_REG: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: +#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -238,7 +247,26 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; + case KVM_CAP_SYNC_MMU: + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + break; #endif + case KVM_CAP_NR_VCPUS: + /* + * Recommending a number of CPUs is somewhat arbitrary; we + * return the number of present CPUs for -HV (since a host + * will have secondary threads "offline"), and for other KVM + * implementations just count online CPUs. + */ +#ifdef CONFIG_KVM_BOOK3S_64_HV + r = num_present_cpus(); +#else + r = num_online_cpus(); +#endif + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; default: r = 0; break; @@ -253,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -279,9 +317,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); - vcpu->arch.wqp = &vcpu->wq; - if (!IS_ERR(vcpu)) + if (!IS_ERR(vcpu)) { + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); + } return vcpu; } @@ -305,18 +344,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static void kvmppc_decrementer_func(unsigned long data) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - - kvmppc_core_queue_dec(vcpu); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } -} - /* * low level hrtimer wake routine. Because this runs in hardirq context * we schedule a tasklet to do the real work. @@ -431,20 +458,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { - case KVM_REG_GPR: + switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { + case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); break; - case KVM_REG_FPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #ifdef CONFIG_PPC_BOOK3S - case KVM_REG_QPR: - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_QPR: + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; - case KVM_REG_FQPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FQPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif default: @@ -553,8 +580,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->arch.hcall_needed = 0; } - kvmppc_core_deliver_interrupts(vcpu); - r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) @@ -563,6 +588,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + + me = get_cpu(); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); + vcpu->stat.halt_wakeup++; + } else if (cpu != me && cpu != -1) { + smp_send_reschedule(vcpu->cpu); + } + put_cpu(); +} + int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { @@ -571,13 +611,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) } kvmppc_core_queue_external(vcpu, irq); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (vcpu->cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } + kvm_vcpu_kick(vcpu); return 0; } @@ -599,6 +633,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: { + struct kvm_config_tlb cfg; + void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + + r = -EFAULT; + if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) + break; + + r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); + break; + } +#endif default: r = -EINVAL; break; @@ -648,6 +695,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + goto out; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } + +#ifdef CONFIG_KVM_E500 + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + break; + } +#endif + default: r = -EINVAL; } @@ -656,6 +729,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { u32 inst_lis = 0x3c000000; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index b135d3d397db..877186b7b1c3 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -118,11 +118,14 @@ TRACE_EVENT(kvm_book3s_exit, ), TP_fast_assign( + struct kvmppc_book3s_shadow_vcpu *svcpu; __entry->exit_nr = exit_nr; __entry->pc = kvmppc_get_pc(vcpu); __entry->dar = kvmppc_get_fault_dar(vcpu); __entry->msr = vcpu->arch.shared->msr; - __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; + svcpu = svcpu_get(vcpu); + __entry->srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); ), TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", @@ -337,6 +340,63 @@ TRACE_EVENT(kvm_book3s_slbmte, #endif /* CONFIG_PPC_BOOK3S */ + +/************************************************************************* + * Book3E trace points * + *************************************************************************/ + +#ifdef CONFIG_BOOKE + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +#endif + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ |