diff options
Diffstat (limited to 'arch/powerpc/kvm')
28 files changed, 2722 insertions, 722 deletions
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 50e7dbc7356c..3d7fd21c65f9 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu_44x->shadow_refs[i].gtlb_index = -1; vcpu->arch.cpu_type = KVM_CPU_440; + vcpu->arch.pvr = mfspr(SPRN_PVR); return 0; } diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index c8c61578fdfc..35ec0a8547da 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -27,12 +27,70 @@ #include "booke.h" #include "44x_tlb.h" +#define XOP_MFDCRX 259 #define XOP_MFDCR 323 +#define XOP_MTDCRX 387 #define XOP_MTDCR 451 #define XOP_TLBSX 914 #define XOP_ICCCI 966 #define XOP_TLBWE 978 +static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) +{ + /* emulate some access in kernel */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); + return EMULATE_DONE; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); + vcpu->run->dcr.is_write = 1; + vcpu->arch.dcr_is_write = 1; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } +} + +static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) +{ + /* The guest may access CPR0 registers to determine the timebase + * frequency, and it must know the real host frequency because it + * can directly access the timebase registers. + * + * It would be possible to emulate those accesses in userspace, + * but userspace can really only figure out the end frequency. + * We could decompose that into the factors that compute it, but + * that's tricky math, and it's easier to just report the real + * CPR0 values. 
+ */ + switch (dcrn) { + case DCRN_CPR0_CONFIG_ADDR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); + break; + case DCRN_CPR0_CONFIG_DATA: + local_irq_disable(); + mtdcr(DCRN_CPR0_CONFIG_ADDR, + vcpu->arch.cpr0_cfgaddr); + kvmppc_set_gpr(vcpu, rt, + mfdcr(DCRN_CPR0_CONFIG_DATA)); + local_irq_enable(); + break; + default: + vcpu->run->dcr.dcrn = dcrn; + vcpu->run->dcr.data = 0; + vcpu->run->dcr.is_write = 0; + vcpu->arch.dcr_is_write = 0; + vcpu->arch.io_gpr = rt; + vcpu->arch.dcr_needed = 1; + kvmppc_account_exit(vcpu, DCR_EXITS); + return EMULATE_DO_DCR; + } + + return EMULATE_DONE; +} + int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -50,55 +108,21 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, switch (get_xop(inst)) { case XOP_MFDCR: - /* The guest may access CPR0 registers to determine the timebase - * frequency, and it must know the real host frequency because it - * can directly access the timebase registers. - * - * It would be possible to emulate those accesses in userspace, - * but userspace can really only figure out the end frequency. - * We could decompose that into the factors that compute it, but - * that's tricky math, and it's easier to just report the real - * CPR0 values. 
- */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); - break; - case DCRN_CPR0_CONFIG_DATA: - local_irq_disable(); - mtdcr(DCRN_CPR0_CONFIG_ADDR, - vcpu->arch.cpr0_cfgaddr); - kvmppc_set_gpr(vcpu, rt, - mfdcr(DCRN_CPR0_CONFIG_DATA)); - local_irq_enable(); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = 0; - run->dcr.is_write = 0; - vcpu->arch.io_gpr = rt; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mfdcr(vcpu, rt, dcrn); + break; + case XOP_MFDCRX: + emulated = emulate_mfdcr(vcpu, rt, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_MTDCR: - /* emulate some access in kernel */ - switch (dcrn) { - case DCRN_CPR0_CONFIG_ADDR: - vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); - break; - default: - run->dcr.dcrn = dcrn; - run->dcr.data = kvmppc_get_gpr(vcpu, rs); - run->dcr.is_write = 1; - vcpu->arch.dcr_needed = 1; - kvmppc_account_exit(vcpu, DCR_EXITS); - emulated = EMULATE_DO_DCR; - } + emulated = emulate_mtdcr(vcpu, rs, dcrn); + break; + case XOP_MTDCRX: + emulated = emulate_mtdcr(vcpu, rs, + kvmppc_get_gpr(vcpu, ra)); break; case XOP_TLBWE: diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index f4dacb9c57fa..4730c953f435 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,6 +20,7 @@ config KVM bool select PREEMPT_NOTIFIERS select ANON_INODES + select HAVE_KVM_EVENTFD config KVM_BOOK3S_HANDLER bool @@ -36,6 +37,7 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR bool select KVM_MMIO + select MMU_NOTIFIER config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" @@ -123,6 +125,7 @@ config KVM_E500V2 depends on EXPERIMENTAL && E500 && !PPC_E500MC select KVM select KVM_MMIO + select MMU_NOTIFIER ---help--- Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. 
@@ -138,6 +141,7 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV + select MMU_NOTIFIER ---help--- Support running unmodified E500MC/E5500 (32-bit) guest kernels in virtual machines on E500MC/E5500 host processors. diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index c2a08636e6d4..1e473d46322c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ + eventfd.o) CFLAGS_44x_tlb.o := -I. CFLAGS_e500_tlb.o := -I. @@ -72,10 +73,12 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv_rmhandlers.o \ book3s_hv_rm_mmu.o \ book3s_64_vio_hv.o \ + book3s_hv_ras.o \ book3s_hv_builtin.o kvm-book3s_64-module-objs := \ ../../../virt/kvm/kvm_main.o \ + ../../../virt/kvm/eventfd.o \ powerpc.o \ emulate.o \ book3s.o \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3f2a8360c857..a4b645285240 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -476,6 +485,122 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return -ENOTSUPP; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + r = kvmppc_get_one_reg(vcpu, reg->id, &val); + + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + val = 
get_reg_val(reg->id, vcpu->arch.shared->dar); + break; + case KVM_REG_PPC_DSISR: + val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); + break; + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + val = get_reg_val(reg->id, vcpu->arch.fpr[i]); + break; + case KVM_REG_PPC_FPSCR: + val = get_reg_val(reg->id, vcpu->arch.fpscr); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0]; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + long int i; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + r = kvmppc_set_one_reg(vcpu, reg->id, &val); + + if (r == -EINVAL) { + r = 0; + switch (reg->id) { + case KVM_REG_PPC_DAR: + vcpu->arch.shared->dar = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_DSISR: + vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = reg->id - KVM_REG_PPC_FPR0; + vcpu->arch.fpr[i] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_FPSCR: + vcpu->arch.fpscr = set_reg_val(reg->id, val); + break; +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + + return r; +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index b0f625a33345..00e619bf608e 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { + if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); r = -EINVAL; @@ -254,6 +254,7 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); out: return r; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 4d72f9ebc554..ead58e317294 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); - if (is_error_pfn(hpaddr)) { + if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); r = -EINVAL; goto out; @@ -171,6 +171,7 @@ map_again: kvmppc_mmu_hpte_cache_map(vcpu, pte); } + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); out: return r; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d95d11322a15..8cc18abd6dde 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -24,6 +24,9 @@ #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> +#include <linux/srcu.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -40,6 +43,11 @@ /* Power architecture requires HPT is at least 256kB */ #define PPC_MIN_HPT_ORDER 18 +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret); +static void kvmppc_rmap_reset(struct kvm *kvm); + long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) { unsigned long hpt; @@ -137,10 +145,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) /* Set the entire HPT to 0, i.e. invalid HPTEs */ memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); /* - * Set the whole last_vcpu array to an invalid vcpu number. - * This ensures that each vcpu will flush its TLB on next entry. + * Reset all the reverse-mapping chains for all memslots */ - memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); + kvmppc_rmap_reset(kvm); + /* Ensure that each vcpu will flush its TLB on next entry. 
*/ + cpumask_setall(&kvm->arch.need_tlb_flush); *htab_orderp = order; err = 0; } else { @@ -184,6 +193,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long addr, hash; unsigned long psize; unsigned long hp0, hp1; + unsigned long idx_ret; long ret; struct kvm *kvm = vcpu->kvm; @@ -215,7 +225,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, hash = (hash << 3) + 7; hp_v = hp0 | ((addr >> 16) & ~0x7fUL); hp_r = hp1 | addr; - ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, + &idx_ret); if (ret != H_SUCCESS) { pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", addr, ret); @@ -260,7 +271,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) /* * This is called to get a reference to a guest page if there isn't - * one already in the kvm->arch.slot_phys[][] arrays. + * one already in the memslot->arch.slot_phys[] array. */ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, struct kvm_memory_slot *memslot, @@ -275,7 +286,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, struct vm_area_struct *vma; unsigned long pfn, i, npages; - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) return -EINVAL; if (physp[gfn - memslot->base_gfn]) @@ -353,15 +364,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, return err; } -/* - * We come here on a H_ENTER call from the guest when we are not - * using mmu notifiers and we don't have the requested page pinned - * already. 
- */ -long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel) +long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret) { - struct kvm *kvm = vcpu->kvm; unsigned long psize, gpa, gfn; struct kvm_memory_slot *memslot; long ret; @@ -389,8 +395,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, do_insert: /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ - vcpu->arch.pgdir = current->mm->pgd; - ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); + ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, + current->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -401,6 +407,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, } +/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. 
+ */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel) +{ + return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index, + pteh, ptel, &vcpu->arch.gpr[4]); +} + static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, gva_t eaddr) { @@ -570,7 +589,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; unsigned long *hptep, hpte[3], r; unsigned long mmu_seq, psize, pte_size; - unsigned long gfn, hva, pfn; + unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; unsigned long *rmap; struct revmap_entry *rev; @@ -608,15 +627,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Translate the logical address and get the page */ psize = hpte_page_size(hpte[0], r); - gfn = hpte_rpn(r, psize); + gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1)); + gfn = gpa >> PAGE_SHIFT; memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { - unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, dsisr & DSISR_ISSTORE); - } if (!kvm->arch.using_mmu_notifiers) return -EFAULT; /* should never get here */ @@ -710,7 +728,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Check if we might have been invalidated; let the guest retry if so */ ret = RESUME_GUEST; - if (mmu_notifier_retry(vcpu, mmu_seq)) { + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { unlock_rmap(rmap); goto out_unlock; } @@ -756,6 +774,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; } +static void kvmppc_rmap_reset(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int srcu_idx; + + srcu_idx = 
srcu_read_lock(&kvm->srcu); + slots = kvm->memslots; + kvm_for_each_memslot(memslot, slots) { + /* + * This assumes it is acceptable to lose reference and + * change bits across a reset. + */ + memset(memslot->arch.rmap, 0, + memslot->npages * sizeof(*memslot->arch.rmap)); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -850,7 +887,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, psize = hpte_page_size(hptep[0], ptel); if ((hptep[0] & HPTE_V_VALID) && hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= HPTE_V_ABSENT; + if (kvm->arch.using_mmu_notifiers) + hptep[0] |= HPTE_V_ABSENT; kvmppc_invalidate_hpte(kvm, hptep, i); /* Harvest R and C */ rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); @@ -877,6 +915,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return 0; } +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + unsigned long *rmapp; + unsigned long gfn; + unsigned long n; + + rmapp = memslot->arch.rmap; + gfn = memslot->base_gfn; + for (n = memslot->npages; n; --n) { + /* + * Testing the present bit without locking is OK because + * the memslot has been marked invalid already, and hence + * no new HPTEs referencing this page can be created, + * thus the present bit can't go from 0 to 1. 
+ */ + if (*rmapp & KVMPPC_RMAP_PRESENT) + kvm_unmap_rmapp(kvm, rmapp, gfn); + ++rmapp; + ++gfn; + } +} + static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long gfn) { @@ -1030,16 +1090,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) return ret; } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long *map) { unsigned long i; - unsigned long *rmapp, *map; + unsigned long *rmapp; preempt_disable(); rmapp = memslot->arch.rmap; - map = memslot->dirty_bitmap; for (i = 0; i < memslot->npages; ++i) { - if (kvm_test_clear_dirty(kvm, rmapp)) + if (kvm_test_clear_dirty(kvm, rmapp) && map) __set_bit_le(i, map); ++rmapp; } @@ -1057,20 +1117,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, unsigned long hva, psize, offset; unsigned long pa; unsigned long *physp; + int srcu_idx; + srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) - return NULL; + goto err; if (!kvm->arch.using_mmu_notifiers) { - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) - return NULL; + goto err; physp += gfn - memslot->base_gfn; pa = *physp; if (!pa) { if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0) - return NULL; + goto err; pa = *physp; } page = pfn_to_page(pa >> PAGE_SHIFT); @@ -1079,9 +1141,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, hva = gfn_to_hva_memslot(memslot, gfn); npages = get_user_pages_fast(hva, 1, 1, pages); if (npages < 1) - return NULL; + goto err; page = pages[0]; } + srcu_read_unlock(&kvm->srcu, srcu_idx); + psize = PAGE_SIZE; if (PageHuge(page)) { page = compound_head(page); @@ -1091,6 +1155,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (nb_ret) *nb_ret = psize - offset; return page_address(page) + offset; + 
+ err: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return NULL; } void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) @@ -1100,6 +1168,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) put_page(page); } +/* + * Functions for reading and writing the hash table via reads and + * writes on a file descriptor. + * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done. When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { + unsigned long index; + unsigned long flags; + struct kvm *kvm; + int first_pass; +}; + +#define HPTE_SIZE (2 * sizeof(unsigned long)) + +static long record_hpte(unsigned long flags, unsigned long *hptp, + unsigned long *hpte, struct revmap_entry *revp, + int want_valid, int first_pass) +{ + unsigned long v, r; + int ok = 1; + int valid, dirty; + + /* Unmodified entries are uninteresting except on the first pass */ + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + if (!first_pass && !dirty) + return 0; + + valid = 0; + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { + valid = 1; + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && + !(hptp[0] & HPTE_V_BOLTED)) + valid = 0; + } + if (valid != want_valid) + return 0; + + v = r = 0; + if (valid || dirty) { + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = hptp[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 
+ valid = 0; + r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + /* only clear modified if this is the right sort of entry */ + if (valid == want_valid && dirty) { + r &= ~HPTE_GR_MODIFIED; + revp->guest_rpte = r; + } + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hptp[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + if (!(valid == want_valid && (first_pass || dirty))) + ok = 0; + } + hpte[0] = v; + hpte[1] = r; + return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long *hptp; + struct revmap_entry *revp; + unsigned long i, nb, nw; + unsigned long __user *lbuf; + struct kvm_get_htab_header __user *hptr; + unsigned long flags; + int first_pass; + unsigned long hpte[2]; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + first_pass = ctx->first_pass; + flags = ctx->flags; + + i = ctx->index; + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + revp = kvm->arch.revmap + i; + lbuf = (unsigned long __user *)buf; + + nb = 0; + while (nb + sizeof(hdr) + HPTE_SIZE < count) { + /* Initialize header */ + hptr = (struct kvm_get_htab_header __user *)buf; + hdr.n_valid = 0; + hdr.n_invalid = 0; + nw = nb; + nb += sizeof(hdr); + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + + /* Skip uninteresting entries, i.e. 
clean on not-first pass */ + if (!first_pass) { + while (i < kvm->arch.hpt_npte && + !(revp->guest_rpte & HPTE_GR_MODIFIED)) { + ++i; + hptp += 2; + ++revp; + } + } + hdr.index = i; + + /* Grab a series of valid entries */ + while (i < kvm->arch.hpt_npte && + hdr.n_valid < 0xffff && + nb + HPTE_SIZE < count && + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { + /* valid entry, write it out */ + ++hdr.n_valid; + if (__put_user(hpte[0], lbuf) || + __put_user(hpte[1], lbuf + 1)) + return -EFAULT; + nb += HPTE_SIZE; + lbuf += 2; + ++i; + hptp += 2; + ++revp; + } + /* Now skip invalid entries while we can */ + while (i < kvm->arch.hpt_npte && + hdr.n_invalid < 0xffff && + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { + /* found an invalid entry */ + ++hdr.n_invalid; + ++i; + hptp += 2; + ++revp; + } + + if (hdr.n_valid || hdr.n_invalid) { + /* write back the header */ + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) + return -EFAULT; + nw = nb; + buf = (char __user *)lbuf; + } else { + nb = nw; + } + + /* Check if we've wrapped around the hash table */ + if (i >= kvm->arch.hpt_npte) { + i = 0; + ctx->first_pass = 0; + break; + } + } + + ctx->index = i; + + return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long i, j; + unsigned long v, r; + unsigned long __user *lbuf; + unsigned long *hptp; + unsigned long tmp[2]; + ssize_t nb; + long int err, ret; + int rma_setup; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* lock out vcpus from running while we're doing this */ + mutex_lock(&kvm->lock); + rma_setup = kvm->arch.rma_setup_done; + if (rma_setup) { + kvm->arch.rma_setup_done = 0; /* temporarily */ + /* order rma_setup_done vs. 
vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.rma_setup_done = 1; + mutex_unlock(&kvm->lock); + return -EBUSY; + } + } + + err = 0; + for (nb = 0; nb + sizeof(hdr) <= count; ) { + err = -EFAULT; + if (__copy_from_user(&hdr, buf, sizeof(hdr))) + break; + + err = 0; + if (nb + hdr.n_valid * HPTE_SIZE > count) + break; + + nb += sizeof(hdr); + buf += sizeof(hdr); + + err = -EINVAL; + i = hdr.index; + if (i >= kvm->arch.hpt_npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + break; + + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + lbuf = (unsigned long __user *)buf; + for (j = 0; j < hdr.n_valid; ++j) { + err = -EFAULT; + if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) + goto out; + err = -EINVAL; + if (!(v & HPTE_V_VALID)) + goto out; + lbuf += 2; + nb += HPTE_SIZE; + + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + err = -EIO; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, + tmp); + if (ret != H_SUCCESS) { + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " + "r=%lx\n", ret, i, v, r); + goto out; + } + if (!rma_setup && is_vrma_hpte(v)) { + unsigned long psize = hpte_page_size(v, r); + unsigned long senc = slb_pgsize_encoding(psize); + unsigned long lpcr; + + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); + kvm->arch.lpcr = lpcr; + rma_setup = 1; + } + ++i; + hptp += 2; + } + + for (j = 0; j < hdr.n_invalid; ++j) { + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + ++i; + hptp += 2; + } + err = 0; + } + + out: + /* Order HPTE updates vs. 
rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = rma_setup; + mutex_unlock(&kvm->lock); + + if (err) + return err; + return nb; +} + +static int kvm_htab_release(struct inode *inode, struct file *filp) +{ + struct kvm_htab_ctx *ctx = filp->private_data; + + filp->private_data = NULL; + if (!(ctx->flags & KVM_GET_HTAB_WRITE)) + atomic_dec(&ctx->kvm->arch.hpte_mod_interest); + kvm_put_kvm(ctx->kvm); + kfree(ctx); + return 0; +} + +static struct file_operations kvm_htab_fops = { + .read = kvm_htab_read, + .write = kvm_htab_write, + .llseek = default_llseek, + .release = kvm_htab_release, +}; + +int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) +{ + int ret; + struct kvm_htab_ctx *ctx; + int rwflag; + + /* reject flags we don't recognize */ + if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) + return -EINVAL; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + kvm_get_kvm(kvm); + ctx->kvm = kvm; + ctx->index = ghf->start_index; + ctx->flags = ghf->flags; + ctx->first_pass = 1; + + rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY; + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); + if (ret < 0) { + kvm_put_kvm(kvm); + return ret; + } + + if (rwflag == O_RDONLY) { + mutex_lock(&kvm->slots_lock); + atomic_inc(&kvm->arch.hpte_mod_interest); + /* make sure kvmppc_do_h_enter etc. 
see the increment */ + synchronize_srcu_expedited(&kvm->srcu); + mutex_unlock(&kvm->slots_lock); + } + + return ret; +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index b9a989dc76cc..d31a716f7f2b 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -22,6 +22,7 @@ #include <asm/kvm_book3s.h> #include <asm/reg.h> #include <asm/switch_to.h> +#include <asm/time.h> #define OP_19_XOP_RFID 18 #define OP_19_XOP_RFI 50 @@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) (mfmsr() & MSR_HV)) vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; break; + case SPRN_PURR: + to_book3s(vcpu)->purr_offset = spr_val - get_tb(); + break; + case SPRN_SPURR: + to_book3s(vcpu)->spurr_offset = spr_val - get_tb(); + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: @@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_CTRLF: case SPRN_CTRLT: case SPRN_L2CR: + case SPRN_DSCR: case SPRN_MMCR0_GEKKO: case SPRN_MMCR1_GEKKO: case SPRN_PMC1_GEKKO: @@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = to_book3s(vcpu)->hid[5]; break; case SPRN_CFAR: - case SPRN_PURR: + case SPRN_DSCR: *spr_val = 0; break; + case SPRN_PURR: + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; + break; + case SPRN_SPURR: + *spr_val = get_tb() + to_book3s(vcpu)->spurr_offset; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index a150817d6d4c..7057a02f0906 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); #endif -#ifdef CONFIG_VSX 
-EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); -#endif #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 721d4603a235..71d0c90b62bf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -30,6 +30,7 @@ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/page-flags.h> +#include <linux/srcu.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -46,6 +47,7 @@ #include <asm/page.h> #include <asm/hvcall.h> #include <asm/switch_to.h> +#include <asm/smp.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -55,25 +57,77 @@ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +/* Used to indicate that a guest page fault needs to be handled */ +#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) + +/* Used as a "null" value for timebase values */ +#define TB_NIL (~(u64)0) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +/* + * We use the vcpu_load/put functions to measure stolen time. + * Stolen time is counted as time when either the vcpu is able to + * run as part of a virtual core, but the task running the vcore + * is preempted or sleeping, or when the vcpu needs something done + * in the kernel by the task running the vcpu, but that task is + * preempted or sleeping. Those two things have to be counted + * separately, since one of the vcpu tasks will take on the job + * of running the core, and the other vcpu tasks in the vcore will + * sleep waiting for it to do that, but that sleep shouldn't count + * as stolen time. + * + * Hence we accumulate stolen time when the vcpu can run as part of + * a vcore using vc->stolen_tb, and the stolen time when the vcpu + * needs its task to do other things in the kernel (for example, + * service a page fault) in busy_stolen. We don't accumulate + * stolen time for a vcore when it is inactive, or for a vcpu + * when it is in state RUNNING or NOTREADY. 
NOTREADY is a bit of + * a misnomer; it means that the vcpu task is not executing in + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in + * the kernel. We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. + * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the arch.tbacct_lock + * of the vcpu that has taken responsibility for running the vcore + * (i.e. vc->runner). The stolen times are measured in units of + * timebase ticks. (Note that the != TB_NIL checks below are + * purely defensive; they should never fail.) + */ + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.kvm_vcore = vc; - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + spin_lock(&vcpu->arch.tbacct_lock); + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && + vc->preempt_tb != TB_NIL) { vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && + vcpu->arch.busy_preempt != TB_NIL) { + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; + spin_lock(&vcpu->arch.tbacct_lock); if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->preempt_tb = mftb(); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt = mftb(); + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) @@ -142,6 +196,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) vpa->yield_count = 
1; } +static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, + unsigned long addr, unsigned long len) +{ + /* check address is cacheline aligned */ + if (addr & (L1_CACHE_BYTES - 1)) + return -EINVAL; + spin_lock(&vcpu->arch.vpa_update_lock); + if (v->next_gpa != addr || v->len != len) { + v->next_gpa = addr; + v->len = addr ? len : 0; + v->update_pending = 1; + } + spin_unlock(&vcpu->arch.vpa_update_lock); + return 0; +} + /* Length for a per-processor buffer is passed in at offset 4 in the buffer */ struct reg_vpa { u32 dummy; @@ -317,10 +387,16 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) { + if (!(vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending)) + return; + spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); - init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + if (vcpu->arch.vpa.pinned_addr) + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); } if (vcpu->arch.dtl.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); @@ -332,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vpa_update_lock); } +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. + */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ + u64 p; + + /* + * If we are the task running the vcore, then since we hold + * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb + * can't be updated, so we don't need the tbacct_lock. + * If the vcore is inactive, it can't become active (since we + * hold the vcore lock), so the vcpu load/put functions won't + * update stolen_tb/preempt_tb, and we don't need tbacct_lock. 
+ */ + if (vc->vcore_state != VCORE_INACTIVE && + vc->runner->arch.run_task != current) { + spin_lock(&vc->runner->arch.tbacct_lock); + p = vc->stolen_tb; + if (vc->preempt_tb != TB_NIL) + p += now - vc->preempt_tb; + spin_unlock(&vc->runner->arch.tbacct_lock); + } else { + p = vc->stolen_tb; + } + return p; +} + static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { struct dtl_entry *dt; struct lppaca *vpa; - unsigned long old_stolen; + unsigned long stolen; + unsigned long core_stolen; + u64 now; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; - old_stolen = vcpu->arch.stolen_logged; - vcpu->arch.stolen_logged = vc->stolen_tb; + now = mftb(); + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock(&vcpu->arch.tbacct_lock); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock(&vcpu->arch.tbacct_lock); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = mftb(); - dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; + dt->timebase = now; + dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; ++dt; @@ -366,13 +479,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; struct kvm_vcpu *tvcpu; + int idx; switch (req) { case H_ENTER: + idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6), kvmppc_get_gpr(vcpu, 7)); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case H_CEDE: break; @@ -429,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PERFMON: r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* 
+ * Deliver a machine check interrupt to the guest. + * We have to do this, even if the host has handled the + * machine check, because machine checks use SRR0/1 and + * the interrupt might have trashed guest state in them. + */ + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_MACHINE_CHECK); + r = RESUME_GUEST; + break; case BOOK3S_INTERRUPT_PROGRAM: { ulong flags; @@ -470,12 +597,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - r = kvmppc_book3s_hv_page_fault(run, vcpu, - vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + r = RESUME_PAGE_FAULT; break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - r = kvmppc_book3s_hv_page_fault(run, vcpu, - kvmppc_get_pc(vcpu), 0); + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = 0; + r = RESUME_PAGE_FAULT; break; /* * This occurs if the guest executes an illegal instruction. @@ -535,36 +662,174 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; + long int i; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = put_user(0, (u64 __user *)reg->addr); + *val = get_reg_val(id, 0); + break; + case KVM_REG_PPC_DABR: + *val = get_reg_val(id, vcpu->arch.dabr); + break; + case KVM_REG_PPC_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr); + break; + case KVM_REG_PPC_PURR: + *val = get_reg_val(id, vcpu->arch.purr); + break; + case KVM_REG_PPC_SPURR: + *val = get_reg_val(id, vcpu->arch.spurr); + break; + case KVM_REG_PPC_AMR: + *val = get_reg_val(id, vcpu->arch.amr); + break; + case KVM_REG_PPC_UAMOR: + *val = get_reg_val(id, vcpu->arch.uamor); + break; + case KVM_REG_PPC_MMCR0 ... 
KVM_REG_PPC_MMCRA: + i = id - KVM_REG_PPC_MMCR0; + *val = get_reg_val(id, vcpu->arch.mmcr[i]); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + *val = get_reg_val(id, vcpu->arch.pmc[i]); + break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + /* VSX => FP reg i is stored in arch.vsr[2*i] */ + long int i = id - KVM_REG_PPC_FPR0; + *val = get_reg_val(id, vcpu->arch.vsr[2 * i]); + } else { + /* let generic code handle it */ + r = -EINVAL; + } + break; + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = id - KVM_REG_PPC_VSR0; + val->vsxval[0] = vcpu->arch.vsr[2 * i]; + val->vsxval[1] = vcpu->arch.vsr[2 * i + 1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ + case KVM_REG_PPC_VPA_ADDR: + spin_lock(&vcpu->arch.vpa_update_lock); + *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_SLB: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; + val->vpaval.length = vcpu->arch.slb_shadow.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_DTL: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.dtl.next_gpa; + val->vpaval.length = vcpu->arch.dtl.len; + spin_unlock(&vcpu->arch.vpa_update_lock); break; default: + r = -EINVAL; break; } return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; + long int i; + unsigned long addr, len; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - { - u64 hior; /* Only allow this to be set to zero */ - r = get_user(hior, (u64 __user *)reg->addr); - if (!r && (hior != 0)) + if (set_reg_val(id, *val)) r = -EINVAL; break; - } + case KVM_REG_PPC_DABR: + vcpu->arch.dabr 
= set_reg_val(id, *val); + break; + case KVM_REG_PPC_DSCR: + vcpu->arch.dscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PURR: + vcpu->arch.purr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPURR: + vcpu->arch.spurr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_AMR: + vcpu->arch.amr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_UAMOR: + vcpu->arch.uamor = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: + i = id - KVM_REG_PPC_MMCR0; + vcpu->arch.mmcr[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + vcpu->arch.pmc[i] = set_reg_val(id, *val); + break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + /* VSX => FP reg i is stored in arch.vsr[2*i] */ + long int i = id - KVM_REG_PPC_FPR0; + vcpu->arch.vsr[2 * i] = set_reg_val(id, *val); + } else { + /* let generic code handle it */ + r = -EINVAL; + } + break; + case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + long int i = id - KVM_REG_PPC_VSR0; + vcpu->arch.vsr[2 * i] = val->vsxval[0]; + vcpu->arch.vsr[2 * i + 1] = val->vsxval[1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ + case KVM_REG_PPC_VPA_ADDR: + addr = set_reg_val(id, *val); + r = -EINVAL; + if (!addr && (vcpu->arch.slb_shadow.next_gpa || + vcpu->arch.dtl.next_gpa)) + break; + r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); + break; + case KVM_REG_PPC_VPA_SLB: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && !vcpu->arch.vpa.next_gpa) + break; + r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); + break; + case KVM_REG_PPC_VPA_DTL: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && (len < sizeof(struct dtl_entry) || + !vcpu->arch.vpa.next_gpa)) + break; + len -= len % sizeof(struct dtl_entry); + r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); + break; default: + r = -EINVAL; break; } @@ -599,20 +864,18 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto free_vcpu; vcpu->arch.shared = &vcpu->arch.shregs; - vcpu->arch.last_cpu = -1; vcpu->arch.mmcr[0] = MMCR0_FC; vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ vcpu->arch.pvr = mfspr(SPRN_PVR); kvmppc_set_pvr(vcpu, vcpu->arch.pvr); spin_lock_init(&vcpu->arch.vpa_update_lock); + spin_lock_init(&vcpu->arch.tbacct_lock); + vcpu->arch.busy_preempt = TB_NIL; kvmppc_mmu_book3s_hv_init(vcpu); - /* - * We consider the vcpu stopped until we see the first run ioctl for it. 
- */ - vcpu->arch.state = KVMPPC_VCPU_STOPPED; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -624,9 +887,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); - vcore->preempt_tb = mftb(); + vcore->preempt_tb = TB_NIL; } kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; } mutex_unlock(&kvm->lock); @@ -637,7 +901,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; - vcpu->arch.stolen_logged = vcore->stolen_tb; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -697,17 +960,18 @@ extern void xics_wake_cpu(int cpu); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) { - struct kvm_vcpu *v; + u64 now; if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; + spin_lock(&vcpu->arch.tbacct_lock); + now = mftb(); + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - + vcpu->arch.stolen_logged; + vcpu->arch.busy_preempt = now; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + spin_unlock(&vcpu->arch.tbacct_lock); --vc->n_runnable; - ++vc->n_busy; - /* decrement the physical thread id of each following vcpu */ - v = vcpu; - list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) - --v->arch.ptid; list_del(&vcpu->arch.run_list); } @@ -720,6 +984,7 @@ static int kvmppc_grab_hwthread(int cpu) /* Ensure the thread won't go into the kernel if it wakes */ tpaca->kvm_hstate.hwthread_req = 1; + tpaca->kvm_hstate.kvm_vcpu = NULL; /* * If the thread is already executing in the kernel (e.g. 
handling @@ -769,7 +1034,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { - kvmppc_grab_hwthread(cpu); xics_wake_cpu(cpu); ++vc->n_woken; } @@ -795,7 +1059,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) /* * Check that we are on thread 0 and that any other threads in - * this core are off-line. + * this core are off-line. Then grab the threads so they can't + * enter the kernel. */ static int on_primary_thread(void) { @@ -807,6 +1072,17 @@ static int on_primary_thread(void) while (++thr < threads_per_core) if (cpu_online(cpu + thr)) return 0; + + /* Grab all hw threads so they can't go into the kernel */ + for (thr = 1; thr < threads_per_core; ++thr) { + if (kvmppc_grab_hwthread(cpu + thr)) { + /* Couldn't grab one; let the others go */ + do { + kvmppc_release_hwthread(cpu + thr); + } while (--thr > 0); + return 0; + } + } return 1; } @@ -814,21 +1090,24 @@ static int on_primary_thread(void) * Run a set of guest threads on a physical core. * Called with vc->lock held. 
*/ -static int kvmppc_run_core(struct kvmppc_vcore *vc) +static void kvmppc_run_core(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; int ptid, i, need_vpa_update; + int srcu_idx; + struct kvm_vcpu *vcpus_to_update[threads_per_core]; /* don't start if any threads have a signal pending */ need_vpa_update = 0; list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { if (signal_pending(vcpu->arch.run_task)) - return 0; - need_vpa_update |= vcpu->arch.vpa.update_pending | - vcpu->arch.slb_shadow.update_pending | - vcpu->arch.dtl.update_pending; + return; + if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpus_to_update[need_vpa_update++] = vcpu; } /* @@ -838,7 +1117,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; - vc->vcore_state = VCORE_RUNNING; + vc->vcore_state = VCORE_STARTING; vc->in_guest = 0; vc->napping_threads = 0; @@ -848,24 +1127,12 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) */ if (need_vpa_update) { spin_unlock(&vc->lock); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - kvmppc_update_vpas(vcpu); + for (i = 0; i < need_vpa_update; ++i) + kvmppc_update_vpas(vcpus_to_update[i]); spin_lock(&vc->lock); } /* - * Make sure we are running on thread 0, and that - * secondary threads are offline. - * XXX we should also block attempts to bring any - * secondary threads online. - */ - if (threads_per_core > 1 && !on_primary_thread()) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - vcpu->arch.ret = -EBUSY; - goto out; - } - - /* * Assign physical thread IDs, first to non-ceded vcpus * and then to ceded ones. 
*/ @@ -879,28 +1146,36 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) } } if (!vcpu0) - return 0; /* nothing to run */ + goto out; /* nothing to run; should never happen */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; - vc->stolen_tb += mftb() - vc->preempt_tb; + /* + * Make sure we are running on thread 0, and that + * secondary threads are offline. + */ + if (threads_per_core > 1 && !on_primary_thread()) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; + } + vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); kvmppc_create_dtl_entry(vcpu, vc); } - /* Grab any remaining hw threads so they can't go into the kernel */ - for (i = ptid; i < threads_per_core; ++i) - kvmppc_grab_hwthread(vc->pcpu + i); + vc->vcore_state = VCORE_RUNNING; preempt_disable(); spin_unlock(&vc->lock); kvm_guest_enter(); + + srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); + __kvmppc_vcore_entry(NULL, vcpu0); - for (i = 0; i < threads_per_core; ++i) - kvmppc_release_hwthread(vc->pcpu + i); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -909,10 +1184,14 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); + for (i = 0; i < threads_per_core; ++i) + kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); + srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx); + /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); kvm_guest_exit(); @@ -920,6 +1199,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) preempt_enable(); kvm_resched(vcpu); + spin_lock(&vc->lock); now = get_tb(); list_for_each_entry(vcpu, 
&vc->runnable_threads, arch.run_list) { /* cancel pending dec exception if dec is positive */ @@ -943,10 +1223,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) } } - spin_lock(&vc->lock); out: vc->vcore_state = VCORE_INACTIVE; - vc->preempt_tb = mftb(); list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -954,8 +1232,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) wake_up(&vcpu->arch.cpu_run); } } - - return 1; } /* @@ -979,20 +1255,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { DEFINE_WAIT(wait); - struct kvm_vcpu *v; - int all_idle = 1; prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); vc->vcore_state = VCORE_SLEEPING; spin_unlock(&vc->lock); - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { - if (!v->arch.ceded || v->arch.pending_exceptions) { - all_idle = 0; - break; - } - } - if (all_idle) - schedule(); + schedule(); finish_wait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; @@ -1001,13 +1268,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded; - int prev_state; struct kvmppc_vcore *vc; struct kvm_vcpu *v, *vn; kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; + kvmppc_update_vpas(vcpu); /* * Synchronize with other threads in this virtual core @@ -1017,8 +1284,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - prev_state = vcpu->arch.state; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -1027,33 +1295,26 @@ static int 
kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * If the vcore is already running, we may be able to start * this thread straight away and have it join in. */ - if (prev_state == KVMPPC_VCPU_STOPPED) { + if (!signal_pending(current)) { if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); + } else if (vc->vcore_state == VCORE_SLEEPING) { + wake_up(&vc->wq); } - } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) - --vc->n_busy; + } while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + if (vc->vcore_state != VCORE_INACTIVE) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); spin_lock(&vc->lock); continue; } - vc->runner = vcpu; - n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - n_ceded += v->arch.ceded; - if (n_ceded == vc->n_runnable) - kvmppc_vcore_blocked(vc); - else - kvmppc_run_core(vc); - list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { kvmppc_core_prepare_to_enter(v); @@ -1065,22 +1326,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) wake_up(&v->arch.cpu_run); } } + if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + break; + vc->runner = vcpu; + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); vc->runner = NULL; } - if (signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); - spin_lock(&vc->lock); - } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - kvmppc_remove_runnable(vc, vcpu); - vcpu->stat.signal_exits++; - 
kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { + /* Wake up some vcpu to run the core */ + v = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&v->arch.cpu_run); } spin_unlock(&vc->lock); @@ -1090,6 +1369,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + int srcu_idx; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1120,6 +1400,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { r = kvmppc_run_vcpu(run, vcpu); @@ -1128,10 +1409,16 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); kvmppc_core_prepare_to_enter(vcpu); + } else if (r == RESUME_PAGE_FAULT) { + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } } while (r == RESUME_GUEST); out: + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); return r; } @@ -1273,7 +1560,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); - r 
= kvmppc_hv_get_dirty_log(kvm, memslot); + r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); if (r) goto out; @@ -1287,67 +1574,88 @@ out: return r; } -static unsigned long slb_pgsize_encoding(unsigned long psize) +static void unpin_slot(struct kvm_memory_slot *memslot) { - unsigned long senc = 0; + unsigned long *physp; + unsigned long j, npages, pfn; + struct page *page; - if (psize > 0x1000) { - senc = SLB_VSID_L; - if (psize == 0x10000) - senc |= SLB_VSID_LP_01; + physp = memslot->arch.slot_phys; + npages = memslot->npages; + if (!physp) + return; + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + SetPageDirty(page); + put_page(page); + } +} + +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } + if (!dont || free->arch.slot_phys != dont->arch.slot_phys) { + unpin_slot(free); + vfree(free->arch.slot_phys); + free->arch.slot_phys = NULL; } - return senc; +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + slot->arch.slot_phys = NULL; + + return 0; } int kvmppc_core_prepare_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) { - unsigned long npages; unsigned long *phys; - /* Allocate a slot_phys array */ - phys = kvm->arch.slot_phys[mem->slot]; - if (!kvm->arch.using_mmu_notifiers && !phys) { - npages = mem->memory_size >> PAGE_SHIFT; - phys = vzalloc(npages * sizeof(unsigned long)); + /* Allocate a slot_phys array if needed */ + phys = memslot->arch.slot_phys; + if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) { + phys = vzalloc(memslot->npages 
* sizeof(unsigned long)); if (!phys) return -ENOMEM; - kvm->arch.slot_phys[mem->slot] = phys; - kvm->arch.slot_npages[mem->slot] = npages; + memslot->arch.slot_phys = phys; } return 0; } -static void unpin_slot(struct kvm *kvm, int slot_id) +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) { - unsigned long *physp; - unsigned long j, npages, pfn; - struct page *page; + unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; - physp = kvm->arch.slot_phys[slot_id]; - npages = kvm->arch.slot_npages[slot_id]; - if (physp) { - spin_lock(&kvm->arch.slot_phys_lock); - for (j = 0; j < npages; j++) { - if (!(physp[j] & KVMPPC_GOT_PAGE)) - continue; - pfn = physp[j] >> PAGE_SHIFT; - page = pfn_to_page(pfn); - SetPageDirty(page); - put_page(page); - } - kvm->arch.slot_phys[slot_id] = NULL; - spin_unlock(&kvm->arch.slot_phys_lock); - vfree(physp); + if (npages && old.npages) { + /* + * If modifying a memslot, reset all the rmap dirty bits. + * If this is a new memslot, we don't need to do anything + * since the rmap array starts out as all zeroes, + * i.e. no pages are dirty. 
+ */ + memslot = id_to_memslot(kvm->memslots, mem->slot); + kvmppc_hv_get_dirty_log(kvm, memslot, NULL); } } -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ -} - static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -1362,6 +1670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) unsigned long rmls; unsigned long *physp; unsigned long i, npages; + int srcu_idx; mutex_lock(&kvm->lock); if (kvm->arch.rma_setup_done) @@ -1377,12 +1686,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } /* Look up the memslot for guest physical address 0 */ + srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, 0); /* We must have some memory at 0 by now */ err = -EINVAL; if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) - goto out; + goto out_srcu; /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; @@ -1406,14 +1716,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) err = -EPERM; if (cpu_has_feature(CPU_FTR_ARCH_201)) { pr_err("KVM: CPU requires an RMO\n"); - goto out; + goto out_srcu; } /* We can handle 4k, 64k or 16M pages in the VRMA */ err = -EINVAL; if (!(psize == 0x1000 || psize == 0x10000 || psize == 0x1000000)) - goto out; + goto out_srcu; /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); @@ -1436,7 +1746,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) err = -EINVAL; if (rmls < 0) { pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); - goto out; + goto out_srcu; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; @@ -1465,17 +1775,24 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Initialize phys addrs of pages in RMO */ npages = ri->npages; porder = __ilog2(npages); - physp = kvm->arch.slot_phys[memslot->id]; - spin_lock(&kvm->arch.slot_phys_lock); - for (i = 0; i < npages; ++i) - physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; - 
spin_unlock(&kvm->arch.slot_phys_lock); + physp = memslot->arch.slot_phys; + if (physp) { + if (npages > memslot->npages) + npages = memslot->npages; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + + porder; + spin_unlock(&kvm->arch.slot_phys_lock); + } } /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ smp_wmb(); kvm->arch.rma_setup_done = 1; err = 0; + out_srcu: + srcu_read_unlock(&kvm->srcu, srcu_idx); out: mutex_unlock(&kvm->lock); return err; @@ -1496,6 +1813,13 @@ int kvmppc_core_init_vm(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + /* + * Since we don't flush the TLB when tearing down a VM, + * and this lpid might have previously been used, + * make sure we flush on each core before running the new VM. + */ + cpumask_setall(&kvm->arch.need_tlb_flush); + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); kvm->arch.rma = NULL; @@ -1523,16 +1847,19 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); spin_lock_init(&kvm->arch.slot_phys_lock); + + /* + * Don't allow secondary CPU threads to come online + * while any KVM VMs exist. 
+ */ + inhibit_secondary_onlining(); + return 0; } void kvmppc_core_destroy_vm(struct kvm *kvm) { - unsigned long i; - - if (!kvm->arch.using_mmu_notifiers) - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - unpin_slot(kvm, i); + uninhibit_secondary_onlining(); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fb4eac290fef..ec0a9e5de100 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type) linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); for (i = 0; i < count; ++i) { linear = alloc_bootmem_align(size, size); - pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, - size >> 20); + pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, + size >> 20); linear_info[i].base_virt = linear; linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; linear_info[i].npages = npages; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c new file mode 100644 index 000000000000..35f3cf0269b3 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -0,0 +1,144 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2012 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/kernel.h> +#include <asm/opal.h> + +/* SRR1 bits for machine check on POWER7 */ +#define SRR1_MC_LDSTERR (1ul << (63-42)) +#define SRR1_MC_IFETCH_SH (63-45) +#define SRR1_MC_IFETCH_MASK 0x7 +#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ +#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ +#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ +#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ + +/* DSISR bits for machine check on POWER7 */ +#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ +#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ +#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ +#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ +#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ + +/* POWER7 SLB flush and reload */ +static void reload_slb(struct kvm_vcpu *vcpu) +{ + struct slb_shadow *slb; + unsigned long i, n; + + /* First clear out SLB */ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + + /* Do they have an SLB shadow buffer registered? 
*/ + slb = vcpu->arch.slb_shadow.pinned_addr; + if (!slb) + return; + + /* Sanity check */ + n = min_t(u32, slb->persistent, SLB_MIN_SIZE); + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) + return; + + /* Load up the SLB from that */ + for (i = 0; i < n; ++i) { + unsigned long rb = slb->save_area[i].esid; + unsigned long rs = slb->save_area[i].vsid; + + rb = (rb & ~0xFFFul) | i; /* insert entry number */ + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); + } +} + +/* POWER7 TLB flush */ +static void flush_tlb_power7(struct kvm_vcpu *vcpu) +{ + unsigned long i, rb; + + rb = TLBIEL_INVAL_SET_LPID; + for (i = 0; i < POWER7_TLB_SETS; ++i) { + asm volatile("tlbiel %0" : : "r" (rb)); + rb += 1 << TLBIEL_INVAL_SET_SHIFT; + } +} + +/* + * On POWER7, see if we can handle a machine check that occurred inside + * the guest in real mode, without switching to the host partition. + * + * Returns: 0 => exit guest, 1 => deliver machine check to guest + */ +static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +{ + unsigned long srr1 = vcpu->arch.shregs.msr; + struct opal_machine_check_event *opal_evt; + long handled = 1; + + if (srr1 & SRR1_MC_LDSTERR) { + /* error on load/store */ + unsigned long dsisr = vcpu->arch.shregs.dsisr; + + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { + /* flush and reload SLB; flushes D-ERAT too */ + reload_slb(vcpu); + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); + } + if (dsisr & DSISR_MC_TLB_MULTI) { + flush_tlb_power7(vcpu); + dsisr &= ~DSISR_MC_TLB_MULTI; + } + /* Any other errors we don't understand? 
*/ + if (dsisr & 0xffffffffUL) + handled = 0; + } + + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { + case 0: + break; + case SRR1_MC_IFETCH_SLBPAR: + case SRR1_MC_IFETCH_SLBMULTI: + case SRR1_MC_IFETCH_SLBPARMULTI: + reload_slb(vcpu); + break; + case SRR1_MC_IFETCH_TLBMULTI: + flush_tlb_power7(vcpu); + break; + default: + handled = 0; + } + + /* + * See if OPAL has already handled the condition. + * We assume that if the condition is recovered then OPAL + * will have generated an error log event that we will pick + * up and log later. + */ + opal_evt = local_paca->opal_mc_evt; + if (opal_evt->version == OpalMCE_V1 && + (opal_evt->severity == OpalMCE_SEV_NO_ERROR || + opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED)) + handled = 1; + + if (handled) + opal_evt->in_use = 0; + + return handled; +} + +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +{ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return kvmppc_realmode_mc_power7(vcpu); + + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fb0e821622d4..19c93bae1aea 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x) return __va(addr); } +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ +static int global_invalidates(struct kvm *kvm, unsigned long flags) +{ + int global; + + /* + * If there is only one vcore, and it's currently running, + * we can use tlbiel as long as we mark all other physical + * cores as potentially having stale TLB entries for this lpid. + * If we're not using MMU notifiers, we never take pages away + * from the guest, so we can use tlbiel if requested. + * Otherwise, don't use tlbiel. 
+ */ + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) + global = 0; + else if (kvm->arch.using_mmu_notifiers) + global = 1; + else + global = !(flags & H_LOCAL); + + if (!global) { + /* any other core might now have stale TLB entries... */ + smp_wmb(); + cpumask_setall(&kvm->arch.need_tlb_flush); + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, + &kvm->arch.need_tlb_flush); + } + + return global; +} + /* * Add this HPTE into the chain for the real page. * Must be called with the chain locked; it unlocks the chain. @@ -59,13 +90,24 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, head->back = pte_index; } else { rev->forw = rev->back = pte_index; - i = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; } - smp_wmb(); - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ + unlock_rmap(rmap); } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); +/* + * Note modification of an HPTE; set the HPTE modified bit + * if anyone is interested. 
+ */ +static inline void note_hpte_modification(struct kvm *kvm, + struct revmap_entry *rev) +{ + if (atomic_read(&kvm->arch.hpte_mod_interest)) + rev->guest_rpte |= HPTE_GR_MODIFIED; +} + /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, @@ -81,7 +123,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, ptel = rev->guest_rpte |= rcbits; gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + if (!memslot) return; rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); @@ -103,14 +145,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; unsigned int shift; - ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); if (!ptep) return __pte(0); if (shift) @@ -130,15 +172,15 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) hpte[0] = hpte_v; } -long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, - long pte_index, unsigned long pteh, unsigned long ptel) +long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel, + pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) { - struct kvm *kvm = vcpu->kvm; unsigned long i, pa, gpa, gfn, psize; unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; - unsigned long g_ptel = ptel; + unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned long *physp, pte_size; unsigned long is_io; @@ -147,13 +189,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, 
unsigned long flags, unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; - bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel &= ~HPTE_GR_RESERVED; + g_ptel = ptel; /* used later to detect if we might have been invalidated */ mmu_seq = kvm->mmu_notifier_seq; @@ -183,7 +226,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, rmap = &memslot->arch.rmap[slot_fn]; if (!kvm->arch.using_mmu_notifiers) { - physp = kvm->arch.slot_phys[memslot->id]; + physp = memslot->arch.slot_phys; if (!physp) return H_PARAMETER; physp += slot_fn; @@ -201,7 +244,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; - pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); + pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); if (pte_present(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ @@ -210,6 +253,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa = pte_pfn(pte) << PAGE_SHIFT; } } + if (pte_size < psize) return H_PARAMETER; if (pa && pte_size > psize) @@ -287,8 +331,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, rev = &kvm->arch.revmap[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); - if (rev) + if (rev) { rev->guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } /* Link HPTE into reverse-map chain */ if (pteh & HPTE_V_VALID) { @@ -297,7 +343,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, lock_rmap(rmap); /* Check for pending invalidations under the rmap chain lock */ if (kvm->arch.using_mmu_notifiers && - mmu_notifier_retry(vcpu, mmu_seq)) { + mmu_notifier_retry(kvm, mmu_seq)) { /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; @@ 
-318,10 +364,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - vcpu->arch.gpr[4] = pte_index; + *pte_idx_ret = pte_index; return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_h_enter); +EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); + +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, + vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); +} #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) @@ -343,11 +396,10 @@ static inline int try_lock_tlbie(unsigned int *lock) return old == 0; } -long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va) +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) { - struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; struct revmap_entry *rev; @@ -369,7 +421,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, if (v & HPTE_V_VALID) { hpte[0] &= ~HPTE_V_VALID; rb = compute_tlbie_rb(v, hpte[1], pte_index); - if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { + if (global_invalidates(kvm, flags)) { while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); asm volatile("ptesync" : : : "memory"); @@ -385,13 +437,22 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, /* Read PTE low word after tlbie to get final R/C values */ remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } - r = rev->guest_rpte; + r = rev->guest_rpte & ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); - vcpu->arch.gpr[4] = v; - vcpu->arch.gpr[5] = r; + hpret[0] = v; + hpret[1] = r; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned 
long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, + &vcpu->arch.gpr[4]); +} long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { @@ -459,6 +520,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) args[j] = ((0x80 | flags) << 56) + pte_index; rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); if (!(hp[0] & HPTE_V_VALID)) { /* insert R and C bits from PTE */ @@ -534,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; v = hpte[0]; bits = (flags << 55) & HPTE_R_PP0; bits |= (flags << 48) & HPTE_R_KEY_HI; @@ -548,6 +608,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; + note_hpte_modification(kvm, rev); } r = (hpte[1] & ~mask) | bits; @@ -555,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, if (v & HPTE_V_VALID) { rb = compute_tlbie_rb(v, r, pte_index); hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { + if (global_invalidates(kvm, flags)) { while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); asm volatile("ptesync" : : : "memory"); @@ -568,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, asm volatile("tlbiel %0" : : "r" (rb)); asm volatile("ptesync" : : : "memory"); } + /* + * If the host has this page as readonly but the guest + * wants to make it read/write, reduce the permissions. + * Checking the host permissions involves finding the + * memslot and then the Linux PTE for the page. 
+ */ + if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) { + unsigned long psize, gfn, hva; + struct kvm_memory_slot *memslot; + pgd_t *pgdir = vcpu->arch.pgdir; + pte_t pte; + + psize = hpte_page_size(v, r); + gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + if (memslot) { + hva = __gfn_to_hva_memslot(memslot, gfn); + pte = lookup_linux_pte(pgdir, hva, 1, &psize); + if (pte_present(pte) && !pte_write(pte)) + r = hpte_make_readonly(r); + } + } } hpte[1] = r; eieio(); @@ -599,8 +682,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; } - if (v & HPTE_V_VALID) + if (v & HPTE_V_VALID) { r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + r &= ~HPTE_GR_RESERVED; + } vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 74a24bbb9637..10b6c358dd77 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -27,6 +27,7 @@ #include <asm/asm-offsets.h> #include <asm/exception-64s.h> #include <asm/kvm_book3s_asm.h> +#include <asm/mmu-hash64.h> /***************************************************************************** * * @@ -134,8 +135,11 @@ kvm_start_guest: 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + /* reload vcpu pointer after clearing the IPI */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ - beq cr1,kvm_no_guest + beq kvm_no_guest /* were we napping due to cede? 
*/ lbz r0,HSTATE_NAPPING(r13) @@ -310,7 +314,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync + + /* See if we need to flush the TLB */ + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ + srdi r6,r6,6 /* doubleword number */ + sldi r6,r6,3 /* address offset */ + add r6,r6,r9 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ li r0,1 + sld r0,r0,r7 + ld r7,0(r6) + and. r7,r7,r0 + beq 22f +23: ldarx r7,0,r6 /* if set, clear the bit */ + andc r7,r7,r0 + stdcx. r7,0,r6 + bne 23b + li r6,128 /* and flush the TLB */ + mtctr r6 + li r7,0x800 /* IS field = 0b10 */ + ptesync +28: tlbiel r7 + addi r7,r7,0x1000 + bdnz 28b + ptesync + +22: li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ b 10f @@ -333,36 +363,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mr r9,r4 blt hdec_soon - /* - * Invalidate the TLB if we could possibly have stale TLB - * entries for this partition on this core due to the use - * of tlbiel. - * XXX maybe only need this on primary thread? - */ - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ - lwz r5,VCPU_VCPUID(r4) - lhz r6,PACAPACAINDEX(r13) - rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ - lhz r8,VCPU_LAST_CPU(r4) - sldi r7,r6,1 /* see if this is the same vcpu */ - add r7,r7,r9 /* as last ran on this pcpu */ - lhz r0,KVM_LAST_VCPU(r7) - cmpw r6,r8 /* on the same cpu core as last time? */ - bne 3f - cmpw r0,r5 /* same vcpu as this core last ran? 
*/ - beq 1f -3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ - sth r5,KVM_LAST_VCPU(r7) - li r6,128 - mtctr r6 - li r7,0x800 /* IS field = 0b10 */ - ptesync -2: tlbiel r7 - addi r7,r7,0x1000 - bdnz 2b - ptesync -1: - /* Save purr/spurr */ mfspr r5,SPRN_PURR mfspr r6,SPRN_SPURR @@ -679,8 +679,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -nohpte_cont: -hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC mftb r6 @@ -701,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: + /* Save guest CTRL register, set runlatch to 1 */ 6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) @@ -1113,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) /* * For external and machine check interrupts, we need * to call the Linux handler to process the interrupt. - * We do that by jumping to the interrupt vector address - * which we have in r12. The [h]rfid at the end of the + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the * handler will return to the book3s_hv_interrupts.S code. * For other interrupts we do the rfid to get back - * to the book3s_interrupts.S code here. + * to the book3s_hv_interrupts.S code here. 
*/ ld r8, HSTATE_VMHANDLER(r13) ld r7, HSTATE_HOST_MSR(r13) + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL +BEGIN_FTR_SECTION beq 11f - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* RFI into the highmem handler, or branch to interrupt handler */ -12: mfmsr r6 - mtctr r12 + mfmsr r6 li r0, MSR_RI andc r6, r6, r0 mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - beqctr + beqa 0x500 /* external interrupt (PPC970) */ + beq cr1, 13f /* machine check */ RFI -11: -BEGIN_FTR_SECTION - b 12b -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) - mtspr SPRN_HSRR0, r8 + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 mtspr SPRN_HSRR1, r7 ba 0x500 +13: b machine_check_fwnmi + /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing @@ -1177,7 +1184,7 @@ kvmppc_hdsi: cmpdi r3, 0 /* retry the instruction */ beq 6f cmpdi r3, -1 /* handle in kernel mode */ - beq nohpte_cont + beq guest_exit_cont cmpdi r3, -2 /* MMIO emulation; need instr word */ beq 2f @@ -1191,6 +1198,7 @@ kvmppc_hdsi: li r10, BOOK3S_INTERRUPT_DATA_STORAGE li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ rotldi r11, r11, 63 +fast_interrupt_c_return: 6: ld r7, VCPU_CTR(r9) lwz r8, VCPU_XER(r9) mtctr r7 @@ -1223,7 +1231,7 @@ kvmppc_hdsi: /* Unset guest mode. 
*/ li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) - b nohpte_cont + b guest_exit_cont /* * Similarly for an HISI, reflect it to the guest as an ISI unless @@ -1249,9 +1257,9 @@ kvmppc_hisi: ld r11, VCPU_MSR(r9) li r12, BOOK3S_INTERRUPT_H_INST_STORAGE cmpdi r3, 0 /* retry the instruction */ - beq 6f + beq fast_interrupt_c_return cmpdi r3, -1 /* handle in kernel mode */ - beq nohpte_cont + beq guest_exit_cont /* Synthesize an ISI for the guest */ mr r11, r3 @@ -1260,12 +1268,7 @@ kvmppc_hisi: li r10, BOOK3S_INTERRUPT_INST_STORAGE li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ rotldi r11, r11, 63 -6: ld r7, VCPU_CTR(r9) - lwz r8, VCPU_XER(r9) - mtctr r7 - mtxer r8 - mr r4, r9 - b fast_guest_return + b fast_interrupt_c_return 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ ld r5, KVM_VRMA_SLB_V(r6) @@ -1281,14 +1284,14 @@ kvmppc_hisi: hcall_try_real_mode: ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR - bne hcall_real_cont + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table - bge hcall_real_cont + bge guest_exit_cont LOAD_REG_ADDR(r4, hcall_real_table) lwzx r3,r3,r4 cmpwi r3,0 - beq hcall_real_cont + beq guest_exit_cont add r3,r3,r4 mtctr r3 mr r3,r9 /* get vcpu pointer */ @@ -1309,7 +1312,7 @@ hcall_real_fallback: li r12,BOOK3S_INTERRUPT_SYSCALL ld r9, HSTATE_KVM_VCPU(r13) - b hcall_real_cont + b guest_exit_cont .globl hcall_real_table hcall_real_table: @@ -1568,6 +1571,21 @@ kvm_cede_exit: li r3,H_TOO_HARD blr + /* Try to handle a machine check in real mode */ +machine_check_realmode: + mr r3, r9 /* get vcpu pointer */ + bl .kvmppc_realmode_machine_check + nop + cmpdi r3, 0 /* continue exiting from guest? */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq mc_cont + /* If not, deliver a machine check. 
SRR0/1 are already set */ + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 + b fast_interrupt_c_return + secondary_too_late: ld r5,HSTATE_KVM_VCORE(r13) HMT_LOW @@ -1587,6 +1605,10 @@ secondary_too_late: .endr secondary_nap: + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + lwsync /* Clear any pending IPI - assume we're a secondary thread */ ld r5, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR @@ -1612,8 +1634,6 @@ secondary_nap: kvm_no_guest: li r0, KVM_HWTHREAD_IN_NAP stb r0, HSTATE_HWTHREAD_STATE(r13) - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 41cb0017e757..2c86b0d63714 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) hlist_del_init_rcu(&pte->list_vpte); hlist_del_init_rcu(&pte->list_vpte_long); - if (pte->pte.may_write) - kvm_release_pfn_dirty(pte->pfn); - else - kvm_release_pfn_clean(pte->pfn); - spin_unlock(&vcpu3s->mmu_lock); vcpu3s->hpte_cache_count--; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 05c28f59f77f..28d38adeca73 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE -#define __hard_irq_disable local_irq_disable -#define __hard_irq_enable local_irq_enable #endif void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -66,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; svcpu_put(svcpu); #endif - + vcpu->cpu = smp_processor_id(); #ifdef CONFIG_PPC_BOOK3S_32 
current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; #endif @@ -83,17 +81,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) svcpu_put(svcpu); #endif - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + vcpu->cpu = -1; +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + int r = 1; /* Indicate we want to get back into the guest */ + + /* We misuse TLB_FLUSH to indicate that we want to clear + all shadow cache entries */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return r; +} + +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; } +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) { ulong smsr = vcpu->arch.shared->msr; /* Guest MSR values */ - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ @@ -379,10 +431,7 @@ int 
kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, static inline int get_fpr_index(int i) { -#ifdef CONFIG_VSX - i *= 2; -#endif - return i; + return i * TS_FPRWIDTH; } /* Give up external provider (FPU, Altivec, VSX) */ @@ -396,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) u64 *thread_fpr = (u64*)t->fpr; int i; - if (!(vcpu->arch.guest_owned_ext & msr)) + /* + * VSX instructions can access FP and vector registers, so if + * we are giving up VSX, make sure we give up FP and VMX as well. + */ + if (msr & MSR_VSX) + msr |= MSR_FP | MSR_VEC; + + msr &= vcpu->arch.guest_owned_ext; + if (!msr) return; #ifdef DEBUG_EXT printk(KERN_INFO "Giving up ext 0x%lx\n", msr); #endif - switch (msr) { - case MSR_FP: + if (msr & MSR_FP) { + /* + * Note that on CPUs with VSX, giveup_fpu stores + * both the traditional FP registers and the added VSX + * registers into thread.fpr[]. + */ giveup_fpu(current); for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; vcpu->arch.fpscr = t->fpscr.val; - break; - case MSR_VEC: + +#ifdef CONFIG_VSX + if (cpu_has_feature(CPU_FTR_VSX)) + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; +#endif + } + #ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { giveup_altivec(current); memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); vcpu->arch.vscr = t->vscr; -#endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - __giveup_vsx(current); - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; -#endif - break; - default: - BUG(); } +#endif - vcpu->arch.guest_owned_ext &= ~msr; - current->thread.regs->msr &= ~msr; + vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); kvmppc_recalc_shadow_msr(vcpu); } @@ -490,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, return RESUME_GUEST; } - /* We already own the ext */ - if (vcpu->arch.guest_owned_ext & msr) { - 
return RESUME_GUEST; + if (msr == MSR_VSX) { + /* No VSX? Give an illegal instruction interrupt */ +#ifdef CONFIG_VSX + if (!cpu_has_feature(CPU_FTR_VSX)) +#endif + { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } + + /* + * We have to load up all the FP and VMX registers before + * we can let the guest use VSX instructions. + */ + msr = MSR_FP | MSR_VEC | MSR_VSX; } + /* See if we already own all the ext(s) needed */ + msr &= ~vcpu->arch.guest_owned_ext; + if (!msr) + return RESUME_GUEST; + #ifdef DEBUG_EXT printk(KERN_INFO "Loading up ext 0x%lx\n", msr); #endif current->thread.regs->msr |= msr; - switch (msr) { - case MSR_FP: + if (msr & MSR_FP) { for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; - +#ifdef CONFIG_VSX + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; +#endif t->fpscr.val = vcpu->arch.fpscr; t->fpexc_mode = 0; kvmppc_load_up_fpu(); - break; - case MSR_VEC: + } + + if (msr & MSR_VEC) { #ifdef CONFIG_ALTIVEC memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); t->vscr = vcpu->arch.vscr; t->vrsave = -1; kvmppc_load_up_altivec(); #endif - break; - case MSR_VSX: -#ifdef CONFIG_VSX - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; - kvmppc_load_up_vsx(); -#endif - break; - default: - BUG(); } vcpu->arch.guest_owned_ext |= msr; - kvmppc_recalc_shadow_msr(vcpu); return RESUME_GUEST; @@ -540,18 +606,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { int r = RESUME_HOST; + int s; vcpu->stat.sum_exits++; run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; - /* We get here with MSR.EE=0, so enable it to be a nice citizen */ - __hard_irq_enable(); + /* We get here with MSR.EE=1 */ + + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); - trace_kvm_book3s_exit(exit_nr, vcpu); - preempt_enable(); - kvm_resched(vcpu); 
switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: { @@ -802,7 +868,6 @@ program_interrupt: } } - preempt_disable(); if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other @@ -814,20 +879,13 @@ program_interrupt: * and if we really did time things so badly, then we just exit * again due to a host external interrupt. */ - __hard_irq_disable(); - if (signal_pending(current)) { - __hard_irq_enable(); -#ifdef EXIT_DEBUG - printk(KERN_EMERG "KVM: Going back to host\n"); -#endif - vcpu->stat.signal_exits++; - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; + local_irq_disable(); + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { + local_irq_enable(); + r = s; } else { - /* In case an interrupt came in that was triggered - * from userspace (like DEC), we need to check what - * to inject now! */ - kvmppc_core_prepare_to_enter(vcpu); + kvmppc_lazy_ee_enable(); } } @@ -899,34 +957,59 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } -int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = copy_to_user((u64 __user *)(long)reg->addr, - &to_book3s(vcpu)->hior, sizeof(u64)); + *val = get_reg_val(id, to_book3s(vcpu)->hior); break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: { + long int i = id - KVM_REG_PPC_VSR0; + + if (!cpu_has_feature(CPU_FTR_VSX)) { + r = -ENXIO; + break; + } + val->vsxval[0] = vcpu->arch.fpr[i]; + val->vsxval[1] = vcpu->arch.vsr[i]; + break; + } +#endif /* CONFIG_VSX */ default: + r = -EINVAL; break; } return r; } -int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { - int r = -EINVAL; + int r = 0; - switch (reg->id) { + switch (id) { case KVM_REG_PPC_HIOR: - r = copy_from_user(&to_book3s(vcpu)->hior, - (u64 __user *)(long)reg->addr, sizeof(u64)); - if (!r) - to_book3s(vcpu)->hior_explicit = true; + to_book3s(vcpu)->hior = set_reg_val(id, *val); + to_book3s(vcpu)->hior_explicit = true; + break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: { + long int i = id - KVM_REG_PPC_VSR0; + + if (!cpu_has_feature(CPU_FTR_VSX)) { + r = -ENXIO; + break; + } + vcpu->arch.fpr[i] = val->vsxval[0]; + vcpu->arch.vsr[i] = val->vsxval[1]; break; + } +#endif /* CONFIG_VSX */ default: + r = -EINVAL; break; } @@ -1020,8 +1103,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; - preempt_disable(); - /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1029,21 +1110,16 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) goto out; } - kvmppc_core_prepare_to_enter(vcpu); - /* * Interrupts could be timers for the guest which we have to inject * again, so let's postpone them until we're in the guest and if we * really did time things so badly, then we just exit again due to * a host external interrupt. 
*/ - __hard_irq_disable(); - - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { - __hard_irq_enable(); - kvm_run->exit_reason = KVM_EXIT_INTR; - ret = -EINTR; + local_irq_disable(); + ret = kvmppc_prepare_to_enter(vcpu); + if (ret <= 0) { + local_irq_enable(); goto out; } @@ -1070,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Save VSX state in stack */ used_vsr = current->thread.used_vsr; if (used_vsr && (current->thread.regs->msr & MSR_VSX)) - __giveup_vsx(current); + __giveup_vsx(current); #endif /* Remember the MSR with disabled extensions */ @@ -1080,20 +1156,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_FP) kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); - kvm_guest_enter(); + kvmppc_lazy_ee_enable(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); - kvm_guest_exit(); - - current->thread.regs->msr = ext_msr; + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. 
*/ /* Make sure we save the guest FPU/Altivec/VSX state */ - kvmppc_giveup_ext(vcpu, MSR_FP); - kvmppc_giveup_ext(vcpu, MSR_VEC); - kvmppc_giveup_ext(vcpu, MSR_VSX); + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + + current->thread.regs->msr = ext_msr; - /* Restore FPU state from stack */ + /* Restore FPU/VSX state from stack */ memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); current->thread.fpscr.val = fpscr; current->thread.fpexc_mode = fpexc_mode; @@ -1113,7 +1188,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif out: - preempt_enable(); + vcpu->mode = OUTSIDE_GUEST_MODE; return ret; } @@ -1181,14 +1256,31 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info) } #endif /* CONFIG_PPC64 */ +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem) { return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { } diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 9ecf6e35cd8d..8f7633e3afb8 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -170,20 +170,21 @@ kvmppc_handler_skip_ins: * Call kvmppc_handler_trampoline_enter in real mode * * On entry, r4 contains the guest shadow MSR + * MSR.EE has to be 0 when calling this function */ _GLOBAL(kvmppc_entry_trampoline) mfmsr r5 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) toreal(r7) - li r9, MSR_RI - ori r9, r9, MSR_EE - andc r9, r5, r9 /* Clear EE and RI in MSR 
value */ li r6, MSR_IR | MSR_DR - ori r6, r6, MSR_EE - andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ - MTMSR_EERI(r9) /* Clear EE and RI in MSR */ - mtsrr0 r7 /* before we set srr0/1 */ + andc r6, r5, r6 /* Clear DR and IR in MSR value */ + /* + * Set EE in HOST_MSR so that it's enabled when we get into our + * C exit handler function + */ + ori r5, r5, MSR_EE + mtsrr0 r7 mtsrr1 r6 RFI @@ -233,8 +234,5 @@ define_load_up(fpu) #ifdef CONFIG_ALTIVEC define_load_up(altivec) #endif -#ifdef CONFIG_VSX -define_load_up(vsx) -#endif #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d25a097c852b..69f114015780 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -36,9 +36,11 @@ #include <asm/dbell.h> #include <asm/hw_irq.h> #include <asm/irq.h> +#include <asm/time.h> #include "timing.h" #include "booke.h" +#include "trace.h" unsigned long kvmppc_booke_handlers; @@ -62,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { NULL } }; @@ -120,6 +123,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) } #endif +static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) + /* We always treat the FP bit as enabled from the host + perspective, so only need to adjust the shadow MSR */ + vcpu->arch.shadow_msr &= ~MSR_FP; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; +#endif +} + /* * Helper function for "full" MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. 
@@ -136,11 +149,13 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); + kvmppc_vcpu_sync_fpu(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) { + trace_kvm_booke_queue_irqprio(vcpu, priority); set_bit(priority, &vcpu->arch.pending_exceptions); } @@ -206,6 +221,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); } +static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) { #ifdef CONFIG_KVM_BOOKE_HV @@ -287,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, bool crit; bool keep_irq = false; enum int_class int_class; + ulong new_msr = vcpu->arch.shared->msr; /* Truncate crit indicators in 32 bit mode */ if (!(vcpu->arch.shared->msr & MSR_SF)) { @@ -325,6 +351,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, msr_mask = MSR_CE | MSR_ME | MSR_DE; int_class = INT_CLASS_NONCRIT; break; + case BOOKE_IRQPRIO_WATCHDOG: case BOOKE_IRQPRIO_CRITICAL: case BOOKE_IRQPRIO_DBELL_CRIT: allowed = vcpu->arch.shared->msr & MSR_CE; @@ -381,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, set_guest_esr(vcpu, vcpu->arch.queued_esr); if (update_dear == true) set_guest_dear(vcpu, vcpu->arch.queued_dear); - kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); + + new_msr &= msr_mask; +#if defined(CONFIG_64BIT) + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + new_msr |= MSR_CM; +#endif + kvmppc_set_msr(vcpu, new_msr); if (!keep_irq) clear_bit(priority, &vcpu->arch.pending_exceptions); @@ -404,12 +437,121 @@ static int 
kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } +/* + * Return the number of jiffies until the next timeout. If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ + u64 tb, wdt_tb, wdt_ticks = 0; + u64 nr_jiffies = 0; + u32 period = TCR_GET_WP(vcpu->arch.tcr); + + wdt_tb = 1ULL << (63 - period); + tb = get_tb(); + /* + * The watchdog timeout will happen when the TB bit corresponding + * to the watchdog toggles from 0 to 1. + */ + if (tb & wdt_tb) + wdt_ticks = wdt_tb; + + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + + /* Convert timebase ticks to jiffies */ + nr_jiffies = wdt_ticks; + + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) + nr_jiffies++; + + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ + unsigned long nr_jiffies; + unsigned long flags; + + /* + * If TSR_ENW and TSR_WIS are not set then no need to exit to + * userspace, so clear the KVM_REQ_WATCHDOG request. + */ + if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) + clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); + nr_jiffies = watchdog_next_timeout(vcpu); + /* + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * then do not run the watchdog timer as this can break timer APIs.
+ */ + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); + else + del_timer(&vcpu->arch.wdt_timer); + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + u32 tsr, new_tsr; + int final; + + do { + new_tsr = tsr = vcpu->arch.tsr; + final = 0; + + /* Time out event */ + if (tsr & TSR_ENW) { + if (tsr & TSR_WIS) + final = 1; + else + new_tsr = tsr | TSR_WIS; + } else { + new_tsr = tsr | TSR_ENW; + } + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + + if (new_tsr & TSR_WIS) { + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * If this is final watchdog expiry and some action is required + * then exit to userspace. + */ + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && + vcpu->arch.watchdog_enabled) { + smp_wmb(); + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * Stop running the watchdog timer after final expiration to + * prevent the host from being flooded with timers if the + * guest sets a short period. + * Timers will resume when TSR/TCR is updated next time. 
+ */ + if (!final) + arm_next_watchdog(vcpu); +} + static void update_timer_ints(struct kvm_vcpu *vcpu) { if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) kvmppc_core_queue_dec(vcpu); else kvmppc_core_dequeue_dec(vcpu); + + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) + kvmppc_core_queue_watchdog(vcpu); + else + kvmppc_core_dequeue_watchdog(vcpu); } static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) @@ -417,13 +559,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned int priority; - if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { - smp_mb(); - update_timer_ints(vcpu); - } - } - priority = __ffs(*pending); while (priority < BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -459,37 +594,20 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) return r; } -/* - * Common checks before entering the guest world. Call with interrupts - * disabled. 
- * - * returns !0 if a signal is pending and check_signal is true - */ -static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) { - int r = 0; + int r = 1; /* Indicate we want to get back into the guest */ - WARN_ON_ONCE(!irqs_disabled()); - while (true) { - if (need_resched()) { - local_irq_enable(); - cond_resched(); - local_irq_disable(); - continue; - } - - if (signal_pending(current)) { - r = 1; - break; - } - - if (kvmppc_core_prepare_to_enter(vcpu)) { - /* interrupts got enabled in between, so we - are back at square 1 */ - continue; - } + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) + update_timer_ints(vcpu); +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_core_flush_tlb(vcpu); +#endif - break; + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; + r = 0; } return r; @@ -497,7 +615,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int ret; + int ret, s; #ifdef CONFIG_PPC_FPU unsigned int fpscr; int fpexc_mode; @@ -510,11 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { - kvm_run->exit_reason = KVM_EXIT_INTR; - ret = -EINTR; + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { + local_irq_enable(); + ret = s; goto out; } + kvmppc_lazy_ee_enable(); kvm_guest_enter(); @@ -542,6 +662,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. 
*/ + #ifdef CONFIG_PPC_FPU kvmppc_save_guest_fp(vcpu); @@ -557,10 +680,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) current->thread.fpexc_mode = fpexc_mode; #endif - kvm_guest_exit(); - out: - local_irq_enable(); + vcpu->mode = OUTSIDE_GUEST_MODE; return ret; } @@ -668,6 +789,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { int r = RESUME_HOST; + int s; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -677,6 +799,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_enable(); + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); + run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; @@ -971,10 +1096,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (!(r & RESUME_HOST)) { local_irq_disable(); - if (kvmppc_prepare_to_enter(vcpu)) { - run->exit_reason = KVM_EXIT_INTR; - r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); - kvmppc_account_exit(vcpu, SIGNAL_EXITS); + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { + local_irq_enable(); + r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); + } else { + kvmppc_lazy_ee_enable(); } } @@ -1011,6 +1138,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + /* setup watchdog timer once */ + spin_lock_init(&vcpu->arch.wdt_lock); + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, + (unsigned long)vcpu); + + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + del_timer_sync(&vcpu->arch.wdt_timer); +} + int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; @@ -1106,7 +1248,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { + u32 old_tsr = vcpu->arch.tsr; + vcpu->arch.tsr = sregs->u.e.tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | 
TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } @@ -1221,12 +1369,70 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - return -EINVAL; + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + case KVM_REG_PPC_IAC2: + case KVM_REG_PPC_IAC3: + case KVM_REG_PPC_IAC4: { + int iac = reg->id - KVM_REG_PPC_IAC1; + r = copy_to_user((u64 __user *)(long)reg->addr, + &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); + break; + } + case KVM_REG_PPC_DAC1: + case KVM_REG_PPC_DAC2: { + int dac = reg->id - KVM_REG_PPC_DAC1; + r = copy_to_user((u64 __user *)(long)reg->addr, + &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: + r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); + break; +#endif + default: + break; + } + return r; } int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { - return -EINVAL; + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_IAC1: + case KVM_REG_PPC_IAC2: + case KVM_REG_PPC_IAC3: + case KVM_REG_PPC_IAC4: { + int iac = reg->id - KVM_REG_PPC_IAC1; + r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], + (u64 __user *)(long)reg->addr, sizeof(u64)); + break; + } + case KVM_REG_PPC_DAC1: + case KVM_REG_PPC_DAC2: { + int dac = reg->id - KVM_REG_PPC_DAC1; + r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], + (u64 __user *)(long)reg->addr, sizeof(u64)); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: { + u32 new_epcr; + r = get_user(new_epcr, (u32 __user *)(long)reg->addr); + if (r == 0) + kvmppc_set_epcr(vcpu, new_epcr); + break; + } +#endif + default: + break; + } + return r; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) @@ -1253,20 +1459,50 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return -ENOTSUPP; } +void kvmppc_core_free_memslot(struct kvm_memory_slot *free, 
+ struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_userspace_memory_region *mem) { return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) { +#if defined(CONFIG_64BIT) + vcpu->arch.epcr = new_epcr; +#ifdef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; +#endif +#endif } void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) { vcpu->arch.tcr = new_tcr; + arm_next_watchdog(vcpu); update_timer_ints(vcpu); } @@ -1281,6 +1517,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) { clear_bits(tsr_bits, &vcpu->arch.tsr); + + /* + * We may have stopped the watchdog due to + * being stuck on final expiration. 
+ */ + if (tsr_bits & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); } @@ -1298,12 +1542,14 @@ void kvmppc_decrementer_func(unsigned long data) void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + vcpu->cpu = smp_processor_id(); current->thread.kvm_vcpu = vcpu; } void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) { current->thread.kvm_vcpu = NULL; + vcpu->cpu = -1; } int __init kvmppc_booke_init(void) diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index ba61974c1e20..e9b88e433f64 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 12834bb608ab..4685b8cf2249 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) vcpu->arch.csrr1 = spr_val; break; case SPRN_DBCR0: - vcpu->arch.dbcr0 = spr_val; + vcpu->arch.dbg_reg.dbcr0 = spr_val; break; case SPRN_DBCR1: - vcpu->arch.dbcr1 = spr_val; + vcpu->arch.dbg_reg.dbcr1 = spr_val; break; case SPRN_DBSR: vcpu->arch.dbsr &= ~spr_val; @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) kvmppc_clr_tsr_bits(vcpu, spr_val); break; case SPRN_TCR: + /* + * WRC is a 2-bit field that is supposed to preserve its + * value once written to non-zero. 
+ */ + if (vcpu->arch.tcr & TCR_WRC_MASK) { + spr_val &= ~TCR_WRC_MASK; + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; + } kvmppc_set_tcr(vcpu, spr_val); break; @@ -229,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_IVOR15: vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; break; - + case SPRN_MCSR: + vcpu->arch.mcsr &= ~spr_val; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + kvmppc_set_epcr(vcpu, spr_val); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +#endif + break; +#endif default: emulated = EMULATE_FAIL; } @@ -258,10 +276,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = vcpu->arch.csrr1; break; case SPRN_DBCR0: - *spr_val = vcpu->arch.dbcr0; + *spr_val = vcpu->arch.dbg_reg.dbcr0; break; case SPRN_DBCR1: - *spr_val = vcpu->arch.dbcr1; + *spr_val = vcpu->arch.dbg_reg.dbcr1; break; case SPRN_DBSR: *spr_val = vcpu->arch.dbsr; @@ -321,6 +339,14 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) case SPRN_IVOR15: *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; break; + case SPRN_MCSR: + *spr_val = vcpu->arch.mcsr; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + *spr_val = vcpu->arch.epcr; + break; +#endif default: emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 099fe8272b57..e8ed7d659c55 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -16,6 +16,7 @@ * * Author: Varun Sethi <varun.sethi@freescale.com> * Author: Scott Wood <scotwood@freescale.com> + * Author: Mihai Caraman <mihai.caraman@freescale.com> * * This file is derived from arch/powerpc/kvm/booke_interrupts.S */ @@ -30,31 +31,33 @@ #include <asm/bitsperlong.h> #include <asm/thread_info.h> +#ifdef CONFIG_64BIT +#include <asm/exception-64e.h> +#else #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ - 
-#define GET_VCPU(vcpu, thread) \ - PPC_LL vcpu, THREAD_KVM_VCPU(thread) +#endif #define LONGBYTES (BITS_PER_LONG / 8) #define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) /* The host stack layout: */ -#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ -#define HOST_CALLEE_LR (1 * LONGBYTES) -#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ +#define HOST_R1 0 /* Implied by stwu. */ +#define HOST_CALLEE_LR PPC_LR_STKOFF +#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES) /* * r2 is special: it holds 'current', and it made nonvolatile in the * kernel with the -ffixed-r2 gcc option. */ -#define HOST_R2 (3 * LONGBYTES) -#define HOST_CR (4 * LONGBYTES) -#define HOST_NV_GPRS (5 * LONGBYTES) +#define HOST_R2 (HOST_RUN + LONGBYTES) +#define HOST_CR (HOST_R2 + LONGBYTES) +#define HOST_NV_GPRS (HOST_CR + LONGBYTES) #define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) #define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) #define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) #define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ -#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ +/* LR in caller stack frame. 
*/ +#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF) #define NEED_EMU 0x00000001 /* emulation -- save nv regs */ #define NEED_DEAR 0x00000002 /* save faulting DEAR */ @@ -201,12 +204,128 @@ b kvmppc_resume_host .endm +#ifdef CONFIG_64BIT +/* Exception types */ +#define EX_GEN 1 +#define EX_GDBELL 2 +#define EX_DBG 3 +#define EX_MC 4 +#define EX_CRIT 5 +#define EX_TLB 6 + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags + _GLOBAL(kvmppc_handler_\intno\()_\srr1) + mr r11, r4 + /* + * Get vcpu from Paca: paca->__current.thread->kvm_vcpu + */ + PPC_LL r4, PACACURRENT(r13) + PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) + stw r10, VCPU_CR(r4) + PPC_STL r11, VCPU_GPR(R4)(r4) + PPC_STL r5, VCPU_GPR(R5)(r4) + .if \type == EX_CRIT + PPC_LL r5, (\paca_ex + EX_R13)(r13) + .else + mfspr r5, \scratch + .endif + PPC_STL r6, VCPU_GPR(R6)(r4) + PPC_STL r8, VCPU_GPR(R8)(r4) + PPC_STL r9, VCPU_GPR(R9)(r4) + PPC_STL r5, VCPU_GPR(R13)(r4) + PPC_LL r6, (\paca_ex + \ex_r10)(r13) + PPC_LL r8, (\paca_ex + \ex_r11)(r13) + PPC_STL r3, VCPU_GPR(R3)(r4) + PPC_STL r7, VCPU_GPR(R7)(r4) + PPC_STL r12, VCPU_GPR(R12)(r4) + PPC_STL r6, VCPU_GPR(R10)(r4) + PPC_STL r8, VCPU_GPR(R11)(r4) + mfctr r5 + PPC_STL r5, VCPU_CTR(r4) + mfspr r5, \srr0 + mfspr r6, \srr1 + kvm_handler_common \intno, \srr0, \flags +.endm + +#define EX_PARAMS(type) \ + EX_##type, \ + SPRN_SPRG_##type##_SCRATCH, \ + PACA_EX##type, \ + EX_R10, \ + EX_R11 + +#define EX_PARAMS_TLB \ + EX_TLB, \ + SPRN_SPRG_GEN_SCRATCH, \ + PACA_EXTLB, \ + EX_TLB_R10, \ + EX_TLB_R11 + +kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ + SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, 
EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,NEED_ESR +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ + SPRN_CSRR0, SPRN_CSRR1, 0 +/* + * Only bolted TLB miss exception handlers are supported for now + */ +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ + SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ + SPRN_DSRR0, SPRN_DSRR1, 0 
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +#else /* * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h */ .macro kvm_handler intno srr0, srr1, flags _GLOBAL(kvmppc_handler_\intno\()_\srr1) - GET_VCPU(r11, r10) + PPC_LL r11, THREAD_KVM_VCPU(r10) PPC_STL r3, VCPU_GPR(R3)(r11) mfspr r3, SPRN_SPRG_RSCRATCH0 PPC_STL r4, VCPU_GPR(R4)(r11) @@ -233,7 +352,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) .macro kvm_lvl_handler intno scratch srr0, srr1, flags _GLOBAL(kvmppc_handler_\intno\()_\srr1) mfspr r10, SPRN_SPRG_THREAD - GET_VCPU(r11, r10) + PPC_LL r11, THREAD_KVM_VCPU(r10) PPC_STL r3, VCPU_GPR(R3)(r11) mfspr r3, \scratch PPC_STL r4, VCPU_GPR(R4)(r11) @@ -295,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 - +#endif /* Registers: * SPRG_SCRATCH0: guest r10 diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index aa8b81428bf4..c70d37ed770a 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -27,8 +27,7 @@ #define E500_TLB_NUM 2 #define E500_TLB_VALID 1 -#define E500_TLB_DIRTY 2 -#define E500_TLB_BITMAP 4 +#define E500_TLB_BITMAP 2 struct tlbe_ref { pfn_t pfn; @@ -130,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value); int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); -int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); void 
kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); @@ -155,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) { - return tlbe->mas2 & 0xfffff000; + return tlbe->mas2 & MAS2_EPN; } static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e04b0ef55ce0..e78f353a836a 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, int ra = get_ra(inst); int rb = get_rb(inst); int rt = get_rt(inst); + gva_t ea; switch (get_op(inst)) { case 31: @@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case XOP_TLBSX: - emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); break; - case XOP_TLBILX: - emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); + case XOP_TLBILX: { + int type = rt & 0x3; + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea); break; + } case XOP_TLBIVAX: - emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; default: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index ff38b664195d..cf3f18012371 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -304,17 +304,13 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, ref->flags = E500_TLB_VALID; if (tlbe_is_writable(gtlbe)) - ref->flags |= E500_TLB_DIRTY; + kvm_set_pfn_dirty(pfn); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { if (ref->flags & E500_TLB_VALID) { - if (ref->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(ref->pfn); - else - 
kvm_release_pfn_clean(ref->pfn); - + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); ref->flags = 0; } } @@ -357,6 +353,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) clear_tlb_privs(vcpu_e500); } +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + clear_tlb_refs(vcpu_e500); + clear_tlb1_bitmap(vcpu_e500); +} + static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, unsigned int eaddr, int as) { @@ -412,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, struct tlbe_ref *ref) { struct kvm_memory_slot *slot; - unsigned long pfn, hva; + unsigned long pfn = 0; /* silence GCC warning */ + unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; @@ -521,7 +525,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(slot, gfn); - if (is_error_pfn(pfn)) { + if (is_error_noslot_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); return; @@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* Clear i-cache for new pages */ kvmppc_mmu_flush_icache(pfn); + + /* Drop refcount on page, so that mmu notifiers can clear it */ + kvm_release_pfn_clean(pfn); } /* XXX only map the one-one case, for now use TLB0 */ @@ -682,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) return EMULATE_DONE; } -int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); unsigned int ia; int esel, tlbsel; - gva_t ea; - - ea = ((ra) ? 
kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); ia = (ea >> 2) & 0x1; @@ -716,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) } static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, - int pid, int rt) + int pid, int type) { struct kvm_book3e_206_tlb_entry *tlbe; int tid, esel; @@ -725,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { tlbe = get_entry(vcpu_e500, tlbsel, esel); tid = get_tlb_tid(tlbe); - if (rt == 0 || tid == pid) { + if (type == 0 || tid == pid) { inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } @@ -733,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, } static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, - int ra, int rb) + gva_t ea) { int tlbsel, esel; - gva_t ea; - - ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb); - if (ra) - ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); @@ -752,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, } } -int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int pid = get_cur_spid(vcpu); - if (rt == 0 || rt == 1) { - tlbilx_all(vcpu_e500, 0, pid, rt); - tlbilx_all(vcpu_e500, 1, pid, rt); - } else if (rt == 3) { - tlbilx_one(vcpu_e500, pid, ra, rb); + if (type == 0 || type == 1) { + tlbilx_all(vcpu_e500, 0, pid, type); + tlbilx_all(vcpu_e500, 1, pid, type); + } else if (type == 3) { + tlbilx_one(vcpu_e500, pid, ea); } return EMULATE_DONE; @@ -786,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) return EMULATE_DONE; } -int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int 
rb) +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int as = !!get_cur_sas(vcpu); unsigned int pid = get_cur_spid(vcpu); int esel, tlbsel; struct kvm_book3e_206_tlb_entry *gtlbe = NULL; - gva_t ea; - - ea = kvmppc_get_gpr(vcpu, rb); for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); @@ -875,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 = vcpu->arch.shared->mas1; gtlbe->mas2 = vcpu->arch.shared->mas2; + if (!(vcpu->arch.shared->msr & MSR_CM)) + gtlbe->mas2 &= 0xffffffffUL; gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, @@ -1039,8 +1037,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, sesel = 0; /* unused */ priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, - &priv->ref, eaddr, &stlbe); + /* Only triggers after clear_tlb_refs */ + if (unlikely(!(priv->ref.flags & E500_TLB_VALID))) + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + else + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, + &priv->ref, eaddr, &stlbe); break; case 1: { @@ -1060,6 +1062,49 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. 
This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) { int i; @@ -1081,6 +1126,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) } vcpu_e500->num_shared_tlb_pages = 0; + + kfree(vcpu_e500->shared_tlb_pages); vcpu_e500->shared_tlb_pages = NULL; } else { kfree(vcpu_e500->gtlb_arch); @@ -1178,21 +1225,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, } virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); - if (!virt) + if (!virt) { + ret = -ENOMEM; goto err_put_page; + } privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], GFP_KERNEL); privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], GFP_KERNEL); - if (!privs[0] || !privs[1]) - goto err_put_page; + if (!privs[0] || !privs[1]) { + ret = -ENOMEM; + goto err_privs; + } g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], GFP_KERNEL); - if (!g2h_bitmap) - goto err_put_page; + if (!g2h_bitmap) { + ret = -ENOMEM; + goto err_privs; + } free_gtlb(vcpu_e500); @@ -1232,10 +1285,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, kvmppc_recalc_tlb1map_range(vcpu_e500); return 0; -err_put_page: +err_privs: kfree(privs[0]); kfree(privs[1]); +err_put_page: for (i = 0; i < num_pages; i++) put_page(pages[i]); @@ -1332,7 +1386,7 @@ 
int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500->gtlb_priv[1]) goto err; - vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * vcpu_e500->gtlb_params[1].entries, GFP_KERNEL); if (!vcpu_e500->g2h_tlb1_map) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index ee04abaefe23..b0855e5d8905 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) return vcpu->arch.dec - jd; } +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); + + switch (sprn) { + case SPRN_SRR0: + vcpu->arch.shared->srr0 = spr_val; + break; + case SPRN_SRR1: + vcpu->arch.shared->srr1 = spr_val; + break; + + /* XXX We need to context-switch the timebase for + * watchdog and FIT. */ + case SPRN_TBWL: break; + case SPRN_TBWU: break; + + case SPRN_MSSSR0: break; + + case SPRN_DEC: + vcpu->arch.dec = spr_val; + kvmppc_emulate_dec(vcpu); + break; + + case SPRN_SPRG0: + vcpu->arch.shared->sprg0 = spr_val; + break; + case SPRN_SPRG1: + vcpu->arch.shared->sprg1 = spr_val; + break; + case SPRN_SPRG2: + vcpu->arch.shared->sprg2 = spr_val; + break; + case SPRN_SPRG3: + vcpu->arch.shared->sprg3 = spr_val; + break; + + default: + emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, + spr_val); + if (emulated == EMULATE_FAIL) + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); + break; + } + + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + + return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = 0; + + switch (sprn) { + case SPRN_SRR0: + spr_val = vcpu->arch.shared->srr0; + break; + case SPRN_SRR1: + spr_val = vcpu->arch.shared->srr1; + break; + case SPRN_PVR: + spr_val = vcpu->arch.pvr; + 
break; + case SPRN_PIR: + spr_val = vcpu->vcpu_id; + break; + case SPRN_MSSSR0: + spr_val = 0; + break; + + /* Note: mftb and TBRL/TBWL are user-accessible, so + * the guest can always access the real TB anyways. + * In fact, we probably will never see these traps. */ + case SPRN_TBWL: + spr_val = get_tb() >> 32; + break; + case SPRN_TBWU: + spr_val = get_tb(); + break; + + case SPRN_SPRG0: + spr_val = vcpu->arch.shared->sprg0; + break; + case SPRN_SPRG1: + spr_val = vcpu->arch.shared->sprg1; + break; + case SPRN_SPRG2: + spr_val = vcpu->arch.shared->sprg2; + break; + case SPRN_SPRG3: + spr_val = vcpu->arch.shared->sprg3; + break; + /* Note: SPRG4-7 are user-readable, so we don't get + * a trap. */ + + case SPRN_DEC: + spr_val = kvmppc_get_dec(vcpu, get_tb()); + break; + default: + emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); + } + break; + } + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, rt, spr_val); + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + + return emulated; +} + /* XXX to do: * lhax * lhaux @@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) int sprn = get_sprn(inst); enum emulation_result emulated = EMULATE_DONE; int advance = 1; - ulong spr_val = 0; /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); @@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MFSPR: - switch (sprn) { - case SPRN_SRR0: - spr_val = vcpu->arch.shared->srr0; - break; - case SPRN_SRR1: - spr_val = vcpu->arch.shared->srr1; - break; - case SPRN_PVR: - spr_val = vcpu->arch.pvr; - break; - case SPRN_PIR: - spr_val = vcpu->vcpu_id; - break; - case SPRN_MSSSR0: - spr_val = 0; - break; - - /* Note: mftb and TBRL/TBWL are user-accessible, so - * the guest can always access the real TB 
anyways. - * In fact, we probably will never see these traps. */ - case SPRN_TBWL: - spr_val = get_tb() >> 32; - break; - case SPRN_TBWU: - spr_val = get_tb(); - break; - - case SPRN_SPRG0: - spr_val = vcpu->arch.shared->sprg0; - break; - case SPRN_SPRG1: - spr_val = vcpu->arch.shared->sprg1; - break; - case SPRN_SPRG2: - spr_val = vcpu->arch.shared->sprg2; - break; - case SPRN_SPRG3: - spr_val = vcpu->arch.shared->sprg3; - break; - /* Note: SPRG4-7 are user-readable, so we don't get - * a trap. */ - - case SPRN_DEC: - spr_val = kvmppc_get_dec(vcpu, get_tb()); - break; - default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); - if (unlikely(emulated == EMULATE_FAIL)) { - printk(KERN_INFO "mfspr: unknown spr " - "0x%x\n", sprn); - } - break; - } - kvmppc_set_gpr(vcpu, rt, spr_val); - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); break; case OP_31_XOP_STHX: @@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MTSPR: - spr_val = kvmppc_get_gpr(vcpu, rs); - switch (sprn) { - case SPRN_SRR0: - vcpu->arch.shared->srr0 = spr_val; - break; - case SPRN_SRR1: - vcpu->arch.shared->srr1 = spr_val; - break; - - /* XXX We need to context-switch the timebase for - * watchdog and FIT. 
*/ - case SPRN_TBWL: break; - case SPRN_TBWU: break; - - case SPRN_MSSSR0: break; - - case SPRN_DEC: - vcpu->arch.dec = spr_val; - kvmppc_emulate_dec(vcpu); - break; - - case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = spr_val; - break; - case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = spr_val; - break; - case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = spr_val; - break; - case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = spr_val; - break; - - default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); - if (emulated == EMULATE_FAIL) - printk(KERN_INFO "mtspr: unknown spr " - "0x%x\n", sprn); - break; - } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; case OP_31_XOP_DCBI: diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4d213b8b0fb5..70739a089560 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -30,6 +30,7 @@ #include <asm/kvm_ppc.h> #include <asm/tlbflush.h> #include <asm/cputhreads.h> +#include <asm/irqflags.h> #include "timing.h" #include "../mm/mmu_decl.h" @@ -38,8 +39,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions) || + return !!(v->arch.pending_exceptions) || v->requests; } @@ -48,6 +48,85 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return 1; } +#ifndef CONFIG_KVM_BOOK3S_64_HV +/* + * Common checks before entering the guest world. Call with interrupts + * disabled. 
+ * + * returns: + * + * == 1 if we're ready to go into guest state + * <= 0 if we need to go back to the host with return value + */ +int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r = 1; + + WARN_ON_ONCE(!irqs_disabled()); + while (true) { + if (need_resched()) { + local_irq_enable(); + cond_resched(); + local_irq_disable(); + continue; + } + + if (signal_pending(current)) { + kvmppc_account_exit(vcpu, SIGNAL_EXITS); + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + break; + } + + vcpu->mode = IN_GUEST_MODE; + + /* + * Reading vcpu->requests must happen after setting vcpu->mode, + * so we don't miss a request because the requester sees + * OUTSIDE_GUEST_MODE and assumes we'll be checking requests + * before next entering the guest (and thus doesn't IPI). + */ + smp_mb(); + + if (vcpu->requests) { + /* Make sure we process requests preemptable */ + local_irq_enable(); + trace_kvm_check_requests(vcpu); + r = kvmppc_core_check_requests(vcpu); + local_irq_disable(); + if (r > 0) + continue; + break; + } + + if (kvmppc_core_prepare_to_enter(vcpu)) { + /* interrupts got enabled in between, so we + are back at square 1 */ + continue; + } + +#ifdef CONFIG_PPC64 + /* lazy EE magic */ + hard_irq_disable(); + if (lazy_irq_pending()) { + /* Got an interrupt in between, try again */ + local_irq_enable(); + local_irq_disable(); + kvm_guest_exit(); + continue; + } + + trace_hardirqs_on(); +#endif + + kvm_guest_enter(); + break; + } + + return r; +} +#endif /* CONFIG_KVM_BOOK3S_64_HV */ + int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { int nr = kvmppc_get_gpr(vcpu, 11); @@ -67,18 +146,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) } switch (nr) { - case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: + case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): { vcpu->arch.magic_page_pa = param1; vcpu->arch.magic_page_ea = param2; r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; - r = HC_EV_SUCCESS; + r = EV_SUCCESS; break; } - case HC_VENDOR_KVM | KVM_HC_FEATURES: - r = 
HC_EV_SUCCESS; + case KVM_HCALL_TOKEN(KVM_HC_FEATURES): + r = EV_SUCCESS; #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) /* XXX Missing magic page on 44x */ r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); @@ -86,8 +165,13 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) /* Second return value is in r4 */ break; + case EV_HCALL_TOKEN(EV_IDLE): + r = EV_SUCCESS; + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; default: - r = HC_EV_UNIMPLEMENTED; + r = EV_UNIMPLEMENTED; break; } @@ -220,6 +304,7 @@ int kvm_dev_ioctl_check_extension(long ext) switch (ext) { #ifdef CONFIG_BOOKE case KVM_CAP_PPC_BOOKE_SREGS: + case KVM_CAP_PPC_BOOKE_WATCHDOG: #else case KVM_CAP_PPC_SEGSTATE: case KVM_CAP_PPC_HIOR: @@ -229,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: case KVM_CAP_ONE_REG: + case KVM_CAP_IOEVENTFD: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV @@ -260,10 +346,22 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; +#endif case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_BOOK3S_64_HV r = cpu_has_feature(CPU_FTR_ARCH_206) ? 
1 : 0; +#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) + r = 1; +#else + r = 0; + break; +#endif +#ifdef CONFIG_KVM_BOOK3S_64_HV + case KVM_CAP_PPC_HTAB_FD: + r = 1; break; #endif + break; case KVM_CAP_NR_VCPUS: /* * Recommending a number of CPUs is somewhat arbitrary; we @@ -302,19 +400,12 @@ long kvm_arch_dev_ioctl(struct file *filp, void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - if (!dont || free->arch.rmap != dont->arch.rmap) { - vfree(free->arch.rmap); - free->arch.rmap = NULL; - } + kvmppc_core_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); - if (!slot->arch.rmap) - return -ENOMEM; - - return 0; + return kvmppc_core_create_memslot(slot, npages); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -323,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - return kvmppc_core_prepare_memory_region(kvm, mem); + return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, @@ -331,7 +422,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot old, int user_alloc) { - kvmppc_core_commit_memory_region(kvm, mem); + kvmppc_core_commit_memory_region(kvm, mem, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -341,6 +432,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + kvmppc_core_flush_memslot(kvm, slot); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) @@ -354,6 +446,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ @@ 
-390,6 +487,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { + int ret; + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; @@ -398,13 +497,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); #endif - - return 0; + ret = kvmppc_subarch_vcpu_init(vcpu); + return ret; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -420,7 +520,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); #endif kvmppc_core_vcpu_load(vcpu, cpu); - vcpu->cpu = smp_processor_id(); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -429,7 +528,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #ifdef CONFIG_BOOKE vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); #endif - vcpu->cpu = -1; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, @@ -527,6 +625,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_is_write = 0; vcpu->arch.mmio_sign_extend = 0; + if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data)) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } @@ -536,8 +641,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, { int r; - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); vcpu->arch.mmio_sign_extend = 1; + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); return r; } @@ -575,6 +680,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } } + if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, 
&run->mmio.data)) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + return EMULATE_DO_MMIO; } @@ -649,6 +761,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_WATCHDOG: + r = 0; + vcpu->arch.watchdog_enabled = true; + break; +#endif #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) case KVM_CAP_SW_TLB: { struct kvm_config_tlb cfg; @@ -751,9 +869,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { + u32 inst_nop = 0x60000000; +#ifdef CONFIG_KVM_BOOKE_HV + u32 inst_sc1 = 0x44000022; + pvinfo->hcall[0] = inst_sc1; + pvinfo->hcall[1] = inst_nop; + pvinfo->hcall[2] = inst_nop; + pvinfo->hcall[3] = inst_nop; +#else u32 inst_lis = 0x3c000000; u32 inst_ori = 0x60000000; - u32 inst_nop = 0x60000000; u32 inst_sc = 0x44000002; u32 inst_imm_mask = 0xffff; @@ -770,6 +895,9 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); pvinfo->hcall[2] = inst_sc; pvinfo->hcall[3] = inst_nop; +#endif + + pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; return 0; } @@ -832,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm *kvm = filp->private_data; + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } #endif /* CONFIG_KVM_BOOK3S_64_HV */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index ddb6a2149d44..e326489a5420 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -31,6 +31,126 @@ TRACE_EVENT(kvm_ppc_instr, __entry->inst, __entry->pc, __entry->emulate) ); +#ifdef CONFIG_PPC_BOOK3S +#define kvm_trace_symbol_exit \ + {0x100, 
"SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} +#else +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} +#endif + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) +#ifdef CONFIG_KVM_BOOK3S_PR + __field( unsigned long, srr1 ) +#endif + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( +#ifdef CONFIG_KVM_BOOK3S_PR + struct kvmppc_book3s_shadow_vcpu *svcpu; +#endif + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; +#ifdef CONFIG_KVM_BOOK3S_PR + svcpu = svcpu_get(vcpu); + __entry->srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); +#endif + 
__entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" +#ifdef CONFIG_KVM_BOOK3S_PR + " | srr1=0x%lx" +#endif + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, +#ifdef CONFIG_KVM_BOOK3S_PR + __entry->srr1, +#endif + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), TP_ARGS(stlb_index), @@ -98,41 +218,31 @@ TRACE_EVENT(kvm_gtlb_write, __entry->word1, __entry->word2) ); - -/************************************************************************* - * Book3S trace points * - *************************************************************************/ - -#ifdef CONFIG_KVM_BOOK3S_PR - -TRACE_EVENT(kvm_book3s_exit, - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), - TP_ARGS(exit_nr, vcpu), +TRACE_EVENT(kvm_check_requests, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( - __field( unsigned int, exit_nr ) - __field( unsigned long, pc ) - __field( unsigned long, msr ) - __field( unsigned long, dar ) - __field( unsigned long, srr1 ) + __field( __u32, cpu_nr ) + __field( __u32, requests ) ), TP_fast_assign( - struct kvmppc_book3s_shadow_vcpu *svcpu; - __entry->exit_nr = exit_nr; - __entry->pc = kvmppc_get_pc(vcpu); - __entry->dar = kvmppc_get_fault_dar(vcpu); - __entry->msr = vcpu->arch.shared->msr; - svcpu = svcpu_get(vcpu); - __entry->srr1 = svcpu->shadow_srr1; - svcpu_put(svcpu); + __entry->cpu_nr = vcpu->vcpu_id; + __entry->requests = vcpu->requests; ), - TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", - __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, - __entry->srr1) + TP_printk("vcpu=%x 
requests=%x", + __entry->cpu_nr, __entry->requests) ); + +/************************************************************************* + * Book3S trace points * + *************************************************************************/ + +#ifdef CONFIG_KVM_BOOK3S_PR + TRACE_EVENT(kvm_book3s_reenter, TP_PROTO(int r, struct kvm_vcpu *vcpu), TP_ARGS(r, vcpu), @@ -395,6 +505,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write, __entry->mas2, __entry->mas7_3) ); +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%x pending=%lx", + __entry->cpu_nr, __entry->priority, __entry->pending) +); + #endif #endif /* _TRACE_KVM_H */ |