summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/api.txt47
-rw-r--r--Documentation/virtual/kvm/devices/s390_flic.txt91
-rw-r--r--arch/arm/include/asm/kvm_arm.h4
-rw-r--r--arch/arm/include/asm/kvm_asm.h4
-rw-r--r--arch/arm/include/asm/kvm_host.h9
-rw-r--r--arch/arm/include/asm/kvm_mmu.h30
-rw-r--r--arch/arm/kernel/asm-offsets.c1
-rw-r--r--arch/arm/kvm/coproc.c84
-rw-r--r--arch/arm/kvm/coproc.h14
-rw-r--r--arch/arm/kvm/coproc_a15.c2
-rw-r--r--arch/arm/kvm/coproc_a7.c2
-rw-r--r--arch/arm/kvm/guest.c1
-rw-r--r--arch/arm/kvm/interrupts_head.S21
-rw-r--r--arch/arm/kvm/mmu.c110
-rw-r--r--arch/arm64/include/asm/kvm_arm.h3
-rw-r--r--arch/arm64/include/asm/kvm_asm.h3
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h22
-rw-r--r--arch/arm64/kvm/sys_regs.c99
-rw-r--r--arch/arm64/kvm/sys_regs.h2
-rw-r--r--arch/ia64/kvm/kvm-ia64.c1
-rw-r--r--arch/mips/include/asm/kvm_host.h417
-rw-r--r--arch/mips/kvm/kvm_mips_emul.c40
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h5
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h12
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h2
-rw-r--r--arch/powerpc/include/asm/reg.h1
-rw-r--r--arch/powerpc/include/asm/tm.h4
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c9
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c28
-rw-r--r--arch/powerpc/kvm/book3s_hv.c159
-rw-r--r--arch/powerpc/kvm/book3s_hv_interrupts.S22
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c6
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S187
-rw-r--r--arch/powerpc/kvm/book3s_rtas.c7
-rw-r--r--arch/s390/include/asm/irq.h1
-rw-r--r--arch/s390/include/asm/kvm_host.h98
-rw-r--r--arch/s390/include/asm/pgtable.h2
-rw-r--r--arch/s390/include/asm/processor.h1
-rw-r--r--arch/s390/include/uapi/asm/kvm.h43
-rw-r--r--arch/s390/kernel/irq.c1
-rw-r--r--arch/s390/kvm/Kconfig4
-rw-r--r--arch/s390/kvm/Makefile2
-rw-r--r--arch/s390/kvm/diag.c84
-rw-r--r--arch/s390/kvm/interrupt.c704
-rw-r--r--arch/s390/kvm/irq.h22
-rw-r--r--arch/s390/kvm/kvm-s390.c212
-rw-r--r--arch/s390/kvm/kvm-s390.h7
-rw-r--r--arch/s390/kvm/priv.c7
-rw-r--r--arch/s390/kvm/sigp.c157
-rw-r--r--arch/s390/kvm/trace.h46
-rw-r--r--arch/s390/mm/fault.c26
-rw-r--r--arch/x86/include/asm/kvm_host.h18
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/xsave.h2
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h1
-rw-r--r--arch/x86/kernel/kvm.c1
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kvm/cpuid.c37
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/mmu.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h7
-rw-r--r--arch/x86/kvm/svm.c84
-rw-r--r--arch/x86/kvm/vmx.c334
-rw-r--r--arch/x86/kvm/x86.c145
-rw-r--r--arch/x86/kvm/x86.h5
-rw-r--r--drivers/s390/kvm/virtio_ccw.c323
-rw-r--r--include/linux/kvm_host.h20
-rw-r--r--include/uapi/linux/kvm.h83
-rw-r--r--virt/kvm/Kconfig4
-rw-r--r--virt/kvm/async_pf.c27
-rw-r--r--virt/kvm/eventfd.c8
-rw-r--r--virt/kvm/ioapic.c108
-rw-r--r--virt/kvm/kvm_main.c12
74 files changed, 3265 insertions, 838 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6cd63a9010fb..c24211da4120 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -586,8 +586,8 @@ struct kvm_fpu {
4.24 KVM_CREATE_IRQCHIP
-Capability: KVM_CAP_IRQCHIP
-Architectures: x86, ia64, ARM, arm64
+Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390)
+Architectures: x86, ia64, ARM, arm64, s390
Type: vm ioctl
Parameters: none
Returns: 0 on success, -1 on error
@@ -596,7 +596,10 @@ Creates an interrupt controller model in the kernel. On x86, creates a virtual
ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
only go to the IOAPIC. On ia64, a IOSAPIC is created. On ARM/arm64, a GIC is
-created.
+created. On s390, a dummy irq routing table is created.
+
+Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled
+before KVM_CREATE_IRQCHIP can be used.
4.25 KVM_IRQ_LINE
@@ -612,6 +615,20 @@ On some architectures it is required that an interrupt controller model has
been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered
interrupts require the level to be set to 1 and then back to 0.
+On real hardware, interrupt pins can be active-low or active-high. This
+does not matter for the level field of struct kvm_irq_level: 1 always
+means active (asserted), 0 means inactive (deasserted).
+
+x86 allows the operating system to program the interrupt polarity
+(active-low/active-high) for level-triggered interrupts, and KVM used
+to consider the polarity. However, due to bitrot in the handling of
+active-low interrupts, the above convention is now valid on x86 too.
+This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace
+should not present interrupts to the guest as active-low unless this
+capability is present (or unless it is not using the in-kernel irqchip,
+of course).
+
+
ARM/arm64 can signal an interrupt either at the CPU level, or at the
in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
use PPIs designated for specific cpus. The irq field is interpreted
@@ -628,7 +645,7 @@ The irq_type field has the following values:
(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs)
-In both cases, level is used to raise/lower the line.
+In both cases, level is used to assert/deassert the line.
struct kvm_irq_level {
union {
@@ -918,9 +935,9 @@ documentation when it pops into existence).
4.37 KVM_ENABLE_CAP
-Capability: KVM_CAP_ENABLE_CAP
+Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM
Architectures: ppc, s390
-Type: vcpu ioctl
+Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM)
Parameters: struct kvm_enable_cap (in)
Returns: 0 on success; -1 on error
@@ -951,6 +968,8 @@ function properly, this is the place to put them.
__u8 pad[64];
};
+The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl
+for vm-wide capabilities.
4.38 KVM_GET_MP_STATE
@@ -1320,7 +1339,7 @@ KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
4.52 KVM_SET_GSI_ROUTING
Capability: KVM_CAP_IRQ_ROUTING
-Architectures: x86 ia64
+Architectures: x86 ia64 s390
Type: vm ioctl
Parameters: struct kvm_irq_routing (in)
Returns: 0 on success, -1 on error
@@ -1343,6 +1362,7 @@ struct kvm_irq_routing_entry {
union {
struct kvm_irq_routing_irqchip irqchip;
struct kvm_irq_routing_msi msi;
+ struct kvm_irq_routing_s390_adapter adapter;
__u32 pad[8];
} u;
};
@@ -1350,6 +1370,7 @@ struct kvm_irq_routing_entry {
/* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
No flags are specified so far, the corresponding field must be set to zero.
@@ -1365,6 +1386,14 @@ struct kvm_irq_routing_msi {
__u32 pad;
};
+struct kvm_irq_routing_s390_adapter {
+ __u64 ind_addr;
+ __u64 summary_addr;
+ __u64 ind_offset;
+ __u32 summary_offset;
+ __u32 adapter_id;
+};
+
4.53 KVM_ASSIGN_SET_MSIX_NR
@@ -2566,6 +2595,10 @@ executed a memory-mapped I/O instruction which could not be satisfied
by kvm. The 'data' member contains the written data if 'is_write' is
true, and should be filled by application code otherwise.
+The 'data' member contains, in its first 'len' bytes, the value as it would
+appear if the VCPU performed a load or store of the appropriate width directly
+to the byte array.
+
NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR,
KVM_EXIT_PAPR and KVM_EXIT_EPR the corresponding
operations are complete (and guest state is consistent) only after userspace
diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
new file mode 100644
index 000000000000..4ceef53164b0
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/s390_flic.txt
@@ -0,0 +1,91 @@
+FLIC (floating interrupt controller)
+====================================
+
+FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some
+machine check interruptions. All interrupts are stored in a per-vm list of
+pending interrupts. FLIC performs operations on this list.
+
+Only one FLIC instance may be instantiated.
+
+FLIC provides support to
+- add interrupts (KVM_DEV_FLIC_ENQUEUE)
+- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS)
+- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
+- enable/disable for the guest transparent async page faults
+- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*)
+
+Groups:
+ KVM_DEV_FLIC_ENQUEUE
+ Passes a buffer and length into the kernel which are then injected into
+ the list of pending interrupts.
+ attr->addr contains the pointer to the buffer and attr->attr contains
+ the length of the buffer.
+ The format of the data structure kvm_s390_irq as it is copied from userspace
+ is defined in usr/include/linux/kvm.h.
+
+ KVM_DEV_FLIC_GET_ALL_IRQS
+ Copies all floating interrupts into a buffer provided by userspace.
+ When the buffer is too small it returns -ENOMEM, which is the indication
+ for userspace to try again with a bigger buffer.
+ All interrupts remain pending, i.e. are not deleted from the list of
+ currently pending interrupts.
+ attr->addr contains the userspace address of the buffer into which all
+ interrupt data will be copied.
+ attr->attr contains the size of the buffer in bytes.
+
+ KVM_DEV_FLIC_CLEAR_IRQS
+ Simply deletes all elements from the list of currently pending floating
+ interrupts. No interrupts are injected into the guest.
+
+ KVM_DEV_FLIC_APF_ENABLE
+ Enables async page faults for the guest. So in case of a major page fault
+ the host is allowed to handle this async and continues the guest.
+
+ KVM_DEV_FLIC_APF_DISABLE_WAIT
+ Disables async page faults for the guest and waits until already pending
+ async page faults are done. This is necessary to trigger a completion interrupt
+ for every init interrupt before migrating the interrupt list.
+
+ KVM_DEV_FLIC_ADAPTER_REGISTER
+ Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter
+ describing the adapter to register:
+
+struct kvm_s390_io_adapter {
+ __u32 id;
+ __u8 isc;
+ __u8 maskable;
+ __u8 swap;
+ __u8 pad;
+};
+
+ id contains the unique id for the adapter, isc the I/O interruption subclass
+ to use, maskable whether this adapter may be masked (interrupts turned off)
+ and swap whether the indicators need to be byte swapped.
+
+
+ KVM_DEV_FLIC_ADAPTER_MODIFY
+ Modifies attributes of an existing I/O adapter interrupt source. Takes
+ a kvm_s390_io_adapter_req specifiying the adapter and the operation:
+
+struct kvm_s390_io_adapter_req {
+ __u32 id;
+ __u8 type;
+ __u8 mask;
+ __u16 pad0;
+ __u64 addr;
+};
+
+ id specifies the adapter and type the operation. The supported operations
+ are:
+
+ KVM_S390_IO_ADAPTER_MASK
+ mask or unmask the adapter, as specified in mask
+
+ KVM_S390_IO_ADAPTER_MAP
+ perform a gmap translation for the guest address provided in addr,
+ pin a userspace page for the translated address and add it to the
+ list of mappings
+
+ KVM_S390_IO_ADAPTER_UNMAP
+ release a userspace page for the translated address specified in addr
+ from the list of mappings
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 1d3153c7eb41..816db0bf2dd8 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -55,6 +55,7 @@
* The bits we set in HCR:
* TAC: Trap ACTLR
* TSC: Trap SMC
+ * TVM: Trap VM ops (until MMU and caches are on)
* TSW: Trap cache operations by set/way
* TWI: Trap WFI
* TWE: Trap WFE
@@ -68,8 +69,7 @@
*/
#define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \
HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \
- HCR_TWE | HCR_SWIO | HCR_TIDCP)
-#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
+ HCR_TVM | HCR_TWE | HCR_SWIO | HCR_TIDCP)
/* System Control Register (SCTLR) bits */
#define SCTLR_TE (1 << 30)
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 661da11f76f4..53b3c4a50d5c 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -48,7 +48,9 @@
#define c13_TID_URO 26 /* Thread ID, User R/O */
#define c13_TID_PRIV 27 /* Thread ID, Privileged */
#define c14_CNTKCTL 28 /* Timer Control Register (PL1) */
-#define NR_CP15_REGS 29 /* Number of regs (incl. invalid) */
+#define c10_AMAIR0 29 /* Auxilary Memory Attribute Indirection Reg0 */
+#define c10_AMAIR1 30 /* Auxilary Memory Attribute Indirection Reg1 */
+#define NR_CP15_REGS 31 /* Number of regs (incl. invalid) */
#define ARM_EXCEPTION_RESET 0
#define ARM_EXCEPTION_UNDEFINED 1
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 098f7dd6d564..09af14999c9b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -101,6 +101,12 @@ struct kvm_vcpu_arch {
/* The CPU type we expose to the VM */
u32 midr;
+ /* HYP trapping configuration */
+ u32 hcr;
+
+ /* Interrupt related fields */
+ u32 irq_lines; /* IRQ and FIQ levels */
+
/* Exception Information */
struct kvm_vcpu_fault_info fault;
@@ -128,9 +134,6 @@ struct kvm_vcpu_arch {
/* IO related fields */
struct kvm_decode mmio_decode;
- /* Interrupt related fields */
- u32 irq_lines; /* IRQ and FIQ levels */
-
/* Cache some mmu pages needed inside spinlock regions */
struct kvm_mmu_memory_cache mmu_page_cache;
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 2d122adcdb22..5c7aa3c1519f 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -114,11 +114,34 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
pmd_val(*pmd) |= L_PMD_S2_RDWR;
}
+/* Open coded p*d_addr_end that can deal with 64bit addresses */
+#define kvm_pgd_addr_end(addr, end) \
+({ u64 __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
+ (__boundary - 1 < (end) - 1)? __boundary: (end); \
+})
+
+#define kvm_pud_addr_end(addr,end) (end)
+
+#define kvm_pmd_addr_end(addr, end) \
+({ u64 __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
+ (__boundary - 1 < (end) - 1)? __boundary: (end); \
+})
+
struct kvm;
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
- unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
+ return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+ unsigned long size)
+{
+ if (!vcpu_has_cache_enabled(vcpu))
+ kvm_flush_dcache_to_poc((void *)hva, size);
+
/*
* If we are going to insert an instruction page and the icache is
* either VIPT or PIPT, there is a potential problem where the host
@@ -139,9 +162,10 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
}
}
-#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l))
#define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x))
+void stage2_flush_vm(struct kvm *kvm);
+
#endif /* !__ASSEMBLY__ */
#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index ded041711beb..85598b5d1efd 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -174,6 +174,7 @@ int main(void)
DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_pc));
DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_cpsr));
+ DEFINE(VCPU_HCR, offsetof(struct kvm_vcpu, arch.hcr));
DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines));
DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.fault.hsr));
DEFINE(VCPU_HxFAR, offsetof(struct kvm_vcpu, arch.fault.hxfar));
diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
index 78c0885d6501..c58a35116f63 100644
--- a/arch/arm/kvm/coproc.c
+++ b/arch/arm/kvm/coproc.c
@@ -23,6 +23,7 @@
#include <asm/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <trace/events/kvm.h>
@@ -205,6 +206,44 @@ done:
}
/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ BUG_ON(!p->is_write);
+
+ vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
+ if (p->is_64bit)
+ vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
+
+ return true;
+}
+
+/*
+ * SCTLR accessor. Only called as long as HCR_TVM is set. If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ *
+ * Used by the cpu-specific code.
+ */
+bool access_sctlr(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r)
+{
+ access_vm_reg(vcpu, p, r);
+
+ if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */
+ vcpu->arch.hcr &= ~HCR_TVM;
+ stage2_flush_vm(vcpu->kvm);
+ }
+
+ return true;
+}
+
+/*
* We could trap ID_DFR0 and tell the guest we don't support performance
* monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was
* NAKed, so it will read the PMCR anyway.
@@ -261,33 +300,36 @@ static const struct coproc_reg cp15_regs[] = {
{ CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32,
NULL, reset_val, c1_CPACR, 0x00000000 },
- /* TTBR0/TTBR1: swapped by interrupt.S. */
- { CRm64( 2), Op1( 0), is64, NULL, reset_unknown64, c2_TTBR0 },
- { CRm64( 2), Op1( 1), is64, NULL, reset_unknown64, c2_TTBR1 },
-
- /* TTBCR: swapped by interrupt.S. */
+ /* TTBR0/TTBR1/TTBCR: swapped by interrupt.S. */
+ { CRm64( 2), Op1( 0), is64, access_vm_reg, reset_unknown64, c2_TTBR0 },
+ { CRn(2), CRm( 0), Op1( 0), Op2( 0), is32,
+ access_vm_reg, reset_unknown, c2_TTBR0 },
+ { CRn(2), CRm( 0), Op1( 0), Op2( 1), is32,
+ access_vm_reg, reset_unknown, c2_TTBR1 },
{ CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32,
- NULL, reset_val, c2_TTBCR, 0x00000000 },
+ access_vm_reg, reset_val, c2_TTBCR, 0x00000000 },
+ { CRm64( 2), Op1( 1), is64, access_vm_reg, reset_unknown64, c2_TTBR1 },
+
/* DACR: swapped by interrupt.S. */
{ CRn( 3), CRm( 0), Op1( 0), Op2( 0), is32,
- NULL, reset_unknown, c3_DACR },
+ access_vm_reg, reset_unknown, c3_DACR },
/* DFSR/IFSR/ADFSR/AIFSR: swapped by interrupt.S. */
{ CRn( 5), CRm( 0), Op1( 0), Op2( 0), is32,
- NULL, reset_unknown, c5_DFSR },
+ access_vm_reg, reset_unknown, c5_DFSR },
{ CRn( 5), CRm( 0), Op1( 0), Op2( 1), is32,
- NULL, reset_unknown, c5_IFSR },
+ access_vm_reg, reset_unknown, c5_IFSR },
{ CRn( 5), CRm( 1), Op1( 0), Op2( 0), is32,
- NULL, reset_unknown, c5_ADFSR },
+ access_vm_reg, reset_unknown, c5_ADFSR },
{ CRn( 5), CRm( 1), Op1( 0), Op2( 1), is32,
- NULL, reset_unknown, c5_AIFSR },
+ access_vm_reg, reset_unknown, c5_AIFSR },
/* DFAR/IFAR: swapped by interrupt.S. */
{ CRn( 6), CRm( 0), Op1( 0), Op2( 0), is32,
- NULL, reset_unknown, c6_DFAR },
+ access_vm_reg, reset_unknown, c6_DFAR },
{ CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32,
- NULL, reset_unknown, c6_IFAR },
+ access_vm_reg, reset_unknown, c6_IFAR },
/* PAR swapped by interrupt.S */
{ CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR },
@@ -324,9 +366,15 @@ static const struct coproc_reg cp15_regs[] = {
/* PRRR/NMRR (aka MAIR0/MAIR1): swapped by interrupt.S. */
{ CRn(10), CRm( 2), Op1( 0), Op2( 0), is32,
- NULL, reset_unknown, c10_PRRR},
+ access_vm_reg, reset_unknown, c10_PRRR},
{ CRn(10), CRm( 2), Op1( 0), Op2( 1), is32,
- NULL, reset_unknown, c10_NMRR},
+ access_vm_reg, reset_unknown, c10_NMRR},
+
+ /* AMAIR0/AMAIR1: swapped by interrupt.S. */
+ { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32,
+ access_vm_reg, reset_unknown, c10_AMAIR0},
+ { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32,
+ access_vm_reg, reset_unknown, c10_AMAIR1},
/* VBAR: swapped by interrupt.S. */
{ CRn(12), CRm( 0), Op1( 0), Op2( 0), is32,
@@ -334,7 +382,7 @@ static const struct coproc_reg cp15_regs[] = {
/* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */
{ CRn(13), CRm( 0), Op1( 0), Op2( 1), is32,
- NULL, reset_val, c13_CID, 0x00000000 },
+ access_vm_reg, reset_val, c13_CID, 0x00000000 },
{ CRn(13), CRm( 0), Op1( 0), Op2( 2), is32,
NULL, reset_unknown, c13_TID_URW },
{ CRn(13), CRm( 0), Op1( 0), Op2( 3), is32,
@@ -443,7 +491,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
struct coproc_params params;
- params.CRm = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
+ params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf;
params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf;
params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0);
params.is_64bit = true;
@@ -451,7 +499,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 16) & 0xf;
params.Op2 = 0;
params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf;
- params.CRn = 0;
+ params.CRm = 0;
return emulate_cp15(vcpu, &params);
}
diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h
index 0461d5c8d3de..1a44bbe39643 100644
--- a/arch/arm/kvm/coproc.h
+++ b/arch/arm/kvm/coproc.h
@@ -58,8 +58,8 @@ static inline void print_cp_instr(const struct coproc_params *p)
{
/* Look, we even formatted it for you to paste into the table! */
if (p->is_64bit) {
- kvm_pr_unimpl(" { CRm(%2lu), Op1(%2lu), is64, func_%s },\n",
- p->CRm, p->Op1, p->is_write ? "write" : "read");
+ kvm_pr_unimpl(" { CRm64(%2lu), Op1(%2lu), is64, func_%s },\n",
+ p->CRn, p->Op1, p->is_write ? "write" : "read");
} else {
kvm_pr_unimpl(" { CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32,"
" func_%s },\n",
@@ -135,13 +135,13 @@ static inline int cmp_reg(const struct coproc_reg *i1,
return -1;
if (i1->CRn != i2->CRn)
return i1->CRn - i2->CRn;
- if (i1->is_64 != i2->is_64)
- return i2->is_64 - i1->is_64;
if (i1->CRm != i2->CRm)
return i1->CRm - i2->CRm;
if (i1->Op1 != i2->Op1)
return i1->Op1 - i2->Op1;
- return i1->Op2 - i2->Op2;
+ if (i1->Op2 != i2->Op2)
+ return i1->Op2 - i2->Op2;
+ return i2->is_64 - i1->is_64;
}
@@ -153,4 +153,8 @@ static inline int cmp_reg(const struct coproc_reg *i1,
#define is64 .is_64 = true
#define is32 .is_64 = false
+bool access_sctlr(struct kvm_vcpu *vcpu,
+ const struct coproc_params *p,
+ const struct coproc_reg *r);
+
#endif /* __ARM_KVM_COPROC_LOCAL_H__ */
diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c
index bb0cac1410cc..e6f4ae48bda9 100644
--- a/arch/arm/kvm/coproc_a15.c
+++ b/arch/arm/kvm/coproc_a15.c
@@ -34,7 +34,7 @@
static const struct coproc_reg a15_regs[] = {
/* SCTLR: swapped by interrupt.S. */
{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
- NULL, reset_val, c1_SCTLR, 0x00C50078 },
+ access_sctlr, reset_val, c1_SCTLR, 0x00C50078 },
};
static struct kvm_coproc_target_table a15_target_table = {
diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c
index 1df767331588..17fc7cd479d3 100644
--- a/arch/arm/kvm/coproc_a7.c
+++ b/arch/arm/kvm/coproc_a7.c
@@ -37,7 +37,7 @@
static const struct coproc_reg a7_regs[] = {
/* SCTLR: swapped by interrupt.S. */
{ CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32,
- NULL, reset_val, c1_SCTLR, 0x00C50878 },
+ access_sctlr, reset_val, c1_SCTLR, 0x00C50878 },
};
static struct kvm_coproc_target_table a7_target_table = {
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 2786eae10c0d..b23a59c1c522 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -38,6 +38,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.hcr = HCR_GUEST_MASK;
return 0;
}
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 6f18695a09cb..76af93025574 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -303,13 +303,17 @@ vcpu .req r0 @ vcpu pointer always in r0
mrc p15, 0, r2, c14, c1, 0 @ CNTKCTL
mrrc p15, 0, r4, r5, c7 @ PAR
+ mrc p15, 0, r6, c10, c3, 0 @ AMAIR0
+ mrc p15, 0, r7, c10, c3, 1 @ AMAIR1
.if \store_to_vcpu == 0
- push {r2,r4-r5}
+ push {r2,r4-r7}
.else
str r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
add r12, vcpu, #CP15_OFFSET(c7_PAR)
strd r4, r5, [r12]
+ str r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+ str r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
.endif
.endm
@@ -322,15 +326,19 @@ vcpu .req r0 @ vcpu pointer always in r0
*/
.macro write_cp15_state read_from_vcpu
.if \read_from_vcpu == 0
- pop {r2,r4-r5}
+ pop {r2,r4-r7}
.else
ldr r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
add r12, vcpu, #CP15_OFFSET(c7_PAR)
ldrd r4, r5, [r12]
+ ldr r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
+ ldr r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
.endif
mcr p15, 0, r2, c14, c1, 0 @ CNTKCTL
mcrr p15, 0, r4, r5, c7 @ PAR
+ mcr p15, 0, r6, c10, c3, 0 @ AMAIR0
+ mcr p15, 0, r7, c10, c3, 1 @ AMAIR1
.if \read_from_vcpu == 0
pop {r2-r12}
@@ -597,17 +605,14 @@ vcpu .req r0 @ vcpu pointer always in r0
/* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */
.macro configure_hyp_role operation
- mrc p15, 4, r2, c1, c1, 0 @ HCR
- bic r2, r2, #HCR_VIRT_EXCP_MASK
- ldr r3, =HCR_GUEST_MASK
.if \operation == vmentry
- orr r2, r2, r3
+ ldr r2, [vcpu, #VCPU_HCR]
ldr r3, [vcpu, #VCPU_IRQ_LINES]
orr r2, r2, r3
.else
- bic r2, r2, r3
+ mov r2, #0
.endif
- mcr p15, 4, r2, c1, c1, 0
+ mcr p15, 4, r2, c1, c1, 0 @ HCR
.endm
.macro load_vcpu
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 7789857d1470..80bb1e6c2c29 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -144,8 +144,9 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
while (addr < end) {
pgd = pgdp + pgd_index(addr);
pud = pud_offset(pgd, addr);
+ pte = NULL;
if (pud_none(*pud)) {
- addr = pud_addr_end(addr, end);
+ addr = kvm_pud_addr_end(addr, end);
continue;
}
@@ -155,13 +156,13 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
* move on.
*/
clear_pud_entry(kvm, pud, addr);
- addr = pud_addr_end(addr, end);
+ addr = kvm_pud_addr_end(addr, end);
continue;
}
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
- addr = pmd_addr_end(addr, end);
+ addr = kvm_pmd_addr_end(addr, end);
continue;
}
@@ -174,12 +175,12 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
/*
* If the pmd entry is to be cleared, walk back up the ladder
*/
- if (kvm_pmd_huge(*pmd) || page_empty(pte)) {
+ if (kvm_pmd_huge(*pmd) || (pte && page_empty(pte))) {
clear_pmd_entry(kvm, pmd, addr);
- next = pmd_addr_end(addr, end);
+ next = kvm_pmd_addr_end(addr, end);
if (page_empty(pmd) && !page_empty(pud)) {
clear_pud_entry(kvm, pud, addr);
- next = pud_addr_end(addr, end);
+ next = kvm_pud_addr_end(addr, end);
}
}
@@ -187,6 +188,99 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
}
}
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+ kvm_flush_dcache_to_poc((void*)hva, PAGE_SIZE);
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pmd_t *pmd;
+ phys_addr_t next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = kvm_pmd_addr_end(addr, end);
+ if (!pmd_none(*pmd)) {
+ if (kvm_pmd_huge(*pmd)) {
+ hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+ kvm_flush_dcache_to_poc((void*)hva, PMD_SIZE);
+ } else {
+ stage2_flush_ptes(kvm, pmd, addr, next);
+ }
+ }
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pud_t *pud;
+ phys_addr_t next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = kvm_pud_addr_end(addr, end);
+ if (!pud_none(*pud)) {
+ if (pud_huge(*pud)) {
+ hva_t hva = gfn_to_hva(kvm, addr >> PAGE_SHIFT);
+ kvm_flush_dcache_to_poc((void*)hva, PUD_SIZE);
+ } else {
+ stage2_flush_pmds(kvm, pud, addr, next);
+ }
+ }
+ } while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+ phys_addr_t next;
+ pgd_t *pgd;
+
+ pgd = kvm->arch.pgd + pgd_index(addr);
+ do {
+ next = kvm_pgd_addr_end(addr, end);
+ stage2_flush_puds(kvm, pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+void stage2_flush_vm(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ slots = kvm_memslots(kvm);
+ kvm_for_each_memslot(memslot, slots)
+ stage2_flush_memslot(kvm, memslot);
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
/**
* free_boot_hyp_pgd - free HYP boot page tables
*
@@ -715,7 +809,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_set_s2pmd_writable(&new_pmd);
kvm_set_pfn_dirty(pfn);
}
- coherent_icache_guest_page(kvm, hva & PMD_MASK, PMD_SIZE);
+ coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
pte_t new_pte = pfn_pte(pfn, PAGE_S2);
@@ -723,7 +817,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_set_s2pte_writable(&new_pte);
kvm_set_pfn_dirty(pfn);
}
- coherent_icache_guest_page(kvm, hva, PAGE_SIZE);
+ coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, false);
}
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 21ef48d32ff2..3d6903006a8a 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -62,6 +62,7 @@
* RW: 64bit by default, can be overriden for 32bit VMs
* TAC: Trap ACTLR
* TSC: Trap SMC
+ * TVM: Trap VM ops (until M+C set in SCTLR_EL1)
* TSW: Trap cache operations by set/way
* TWE: Trap WFE
* TWI: Trap WFI
@@ -74,7 +75,7 @@
* SWIO: Turn set/way invalidates into set/way clean+invalidate
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
- HCR_BSU_IS | HCR_FB | HCR_TAC | \
+ HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
HCR_AMO | HCR_IMO | HCR_FMO | \
HCR_SWIO | HCR_TIDCP | HCR_RW)
#define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index b25763bc0ec4..9fcd54b1e16d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -79,7 +79,8 @@
#define c13_TID_URW (TPIDR_EL0 * 2) /* Thread ID, User R/W */
#define c13_TID_URO (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
#define c13_TID_PRIV (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR0 (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR1 (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
#define c14_CNTKCTL (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
#define NR_CP15_REGS (NR_SYS_REGS * 2)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7f1f9408ff66..7d29847a893b 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -106,7 +106,6 @@ static inline bool kvm_is_write_fault(unsigned long esr)
return true;
}
-static inline void kvm_clean_dcache_area(void *addr, size_t size) {}
static inline void kvm_clean_pgd(pgd_t *pgd) {}
static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
static inline void kvm_clean_pte(pte_t *pte) {}
@@ -122,11 +121,25 @@ static inline void kvm_set_s2pmd_writable(pmd_t *pmd)
pmd_val(*pmd) |= PMD_S2_RDWR;
}
+#define kvm_pgd_addr_end(addr, end) pgd_addr_end(addr, end)
+#define kvm_pud_addr_end(addr, end) pud_addr_end(addr, end)
+#define kvm_pmd_addr_end(addr, end) pmd_addr_end(addr, end)
+
struct kvm;
-static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
- unsigned long size)
+#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
+
+static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
+ return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+}
+
+static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
+ unsigned long size)
+{
+ if (!vcpu_has_cache_enabled(vcpu))
+ kvm_flush_dcache_to_poc((void *)hva, size);
+
if (!icache_is_aliasing()) { /* PIPT */
flush_icache_range(hva, hva + size);
} else if (!icache_is_aivivt()) { /* non ASID-tagged VIVT */
@@ -135,8 +148,9 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, hva_t hva,
}
}
-#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
#define kvm_virt_to_phys(x) __virt_to_phys((unsigned long)(x))
+void stage2_flush_vm(struct kvm *kvm);
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 02e9d09e1d80..03244582bc55 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -27,6 +27,7 @@
#include <asm/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <trace/events/kvm.h>
@@ -121,6 +122,48 @@ done:
}
/*
+ * Generic accessor for VM registers. Only called as long as HCR_TVM
+ * is set.
+ */
+static bool access_vm_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ unsigned long val;
+
+ BUG_ON(!p->is_write);
+
+ val = *vcpu_reg(vcpu, p->Rt);
+ if (!p->is_aarch32) {
+ vcpu_sys_reg(vcpu, r->reg) = val;
+ } else {
+ vcpu_cp15(vcpu, r->reg) = val & 0xffffffffUL;
+ if (!p->is_32bit)
+ vcpu_cp15(vcpu, r->reg + 1) = val >> 32;
+ }
+ return true;
+}
+
+/*
+ * SCTLR_EL1 accessor. Only called as long as HCR_TVM is set. If the
+ * guest enables the MMU, we stop trapping the VM sys_regs and leave
+ * it in complete control of the caches.
+ */
+static bool access_sctlr(struct kvm_vcpu *vcpu,
+ const struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ access_vm_reg(vcpu, p, r);
+
+ if (vcpu_has_cache_enabled(vcpu)) { /* MMU+Caches enabled? */
+ vcpu->arch.hcr_el2 &= ~HCR_TVM;
+ stage2_flush_vm(vcpu->kvm);
+ }
+
+ return true;
+}
+
+/*
* We could trap ID_DFR0 and tell the guest we don't support performance
* monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was
* NAKed, so it will read the PMCR anyway.
@@ -185,32 +228,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
NULL, reset_mpidr, MPIDR_EL1 },
/* SCTLR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000),
- NULL, reset_val, SCTLR_EL1, 0x00C50078 },
+ access_sctlr, reset_val, SCTLR_EL1, 0x00C50078 },
/* CPACR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010),
NULL, reset_val, CPACR_EL1, 0 },
/* TTBR0_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000),
- NULL, reset_unknown, TTBR0_EL1 },
+ access_vm_reg, reset_unknown, TTBR0_EL1 },
/* TTBR1_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001),
- NULL, reset_unknown, TTBR1_EL1 },
+ access_vm_reg, reset_unknown, TTBR1_EL1 },
/* TCR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010),
- NULL, reset_val, TCR_EL1, 0 },
+ access_vm_reg, reset_val, TCR_EL1, 0 },
/* AFSR0_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000),
- NULL, reset_unknown, AFSR0_EL1 },
+ access_vm_reg, reset_unknown, AFSR0_EL1 },
/* AFSR1_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001),
- NULL, reset_unknown, AFSR1_EL1 },
+ access_vm_reg, reset_unknown, AFSR1_EL1 },
/* ESR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000),
- NULL, reset_unknown, ESR_EL1 },
+ access_vm_reg, reset_unknown, ESR_EL1 },
/* FAR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000),
- NULL, reset_unknown, FAR_EL1 },
+ access_vm_reg, reset_unknown, FAR_EL1 },
/* PAR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000),
NULL, reset_unknown, PAR_EL1 },
@@ -224,17 +267,17 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* MAIR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
- NULL, reset_unknown, MAIR_EL1 },
+ access_vm_reg, reset_unknown, MAIR_EL1 },
/* AMAIR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000),
- NULL, reset_amair_el1, AMAIR_EL1 },
+ access_vm_reg, reset_amair_el1, AMAIR_EL1 },
/* VBAR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000),
NULL, reset_val, VBAR_EL1, 0 },
/* CONTEXTIDR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001),
- NULL, reset_val, CONTEXTIDR_EL1, 0 },
+ access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
/* TPIDR_EL1 */
{ Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100),
NULL, reset_unknown, TPIDR_EL1 },
@@ -305,14 +348,32 @@ static const struct sys_reg_desc sys_reg_descs[] = {
NULL, reset_val, FPEXC32_EL2, 0x70 },
};
-/* Trapped cp15 registers */
+/*
+ * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
+ * depending on the way they are accessed (as a 32bit or a 64bit
+ * register).
+ */
static const struct sys_reg_desc cp15_regs[] = {
+ { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+ { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_sctlr, NULL, c1_SCTLR },
+ { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+ { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, c2_TTBR1 },
+ { Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, c2_TTBCR },
+ { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, c3_DACR },
+ { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, c5_DFSR },
+ { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, c5_IFSR },
+ { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, c5_ADFSR },
+ { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, c5_AIFSR },
+ { Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, c6_DFAR },
+ { Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, c6_IFAR },
+
/*
* DC{C,I,CI}SW operations:
*/
{ Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw },
{ Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw },
{ Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
+
{ Op1( 0), CRn( 9), CRm(12), Op2( 0), pm_fake },
{ Op1( 0), CRn( 9), CRm(12), Op2( 1), pm_fake },
{ Op1( 0), CRn( 9), CRm(12), Op2( 2), pm_fake },
@@ -326,6 +387,14 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1( 0), CRn( 9), CRm(14), Op2( 0), pm_fake },
{ Op1( 0), CRn( 9), CRm(14), Op2( 1), pm_fake },
{ Op1( 0), CRn( 9), CRm(14), Op2( 2), pm_fake },
+
+ { Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
+ { Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
+ { Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, c10_AMAIR0 },
+ { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 },
+ { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+
+ { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
};
/* Target specific emulation tables */
@@ -437,6 +506,8 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
u32 hsr = kvm_vcpu_get_hsr(vcpu);
int Rt2 = (hsr >> 10) & 0xf;
+ params.is_aarch32 = true;
+ params.is_32bit = false;
params.CRm = (hsr >> 1) & 0xf;
params.Rt = (hsr >> 5) & 0xf;
params.is_write = ((hsr & 1) == 0);
@@ -480,6 +551,8 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
struct sys_reg_params params;
u32 hsr = kvm_vcpu_get_hsr(vcpu);
+ params.is_aarch32 = true;
+ params.is_32bit = true;
params.CRm = (hsr >> 1) & 0xf;
params.Rt = (hsr >> 5) & 0xf;
params.is_write = ((hsr & 1) == 0);
@@ -549,6 +622,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
struct sys_reg_params params;
unsigned long esr = kvm_vcpu_get_hsr(vcpu);
+ params.is_aarch32 = false;
+ params.is_32bit = false;
params.Op0 = (esr >> 20) & 3;
params.Op1 = (esr >> 14) & 0x7;
params.CRn = (esr >> 10) & 0xf;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index d50d3722998e..d411e251412c 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -30,6 +30,8 @@ struct sys_reg_params {
u8 Op2;
u8 Rt;
bool is_write;
+ bool is_aarch32;
+ bool is_32bit; /* Only valid if is_aarch32 is true */
};
struct sys_reg_desc {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 53f44bee9ebb..6a4309bb821a 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -199,6 +199,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQCHIP:
case KVM_CAP_MP_STATE:
case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index a995fce87791..060aaa6348d7 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -30,16 +30,16 @@
/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR 0x0
+#define KVM_GUEST_COMMPAGE_ADDR 0x0
#define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
-#define KVM_GUEST_KUSEG 0x00000000UL
-#define KVM_GUEST_KSEG0 0x40000000UL
-#define KVM_GUEST_KSEG23 0x60000000UL
-#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0x60000000)
-#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff)
+#define KVM_GUEST_KUSEG 0x00000000UL
+#define KVM_GUEST_KSEG0 0x40000000UL
+#define KVM_GUEST_KSEG23 0x60000000UL
+#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0x60000000)
+#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff)
#define KVM_GUEST_CKSEG0ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG0)
#define KVM_GUEST_CKSEG1ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
@@ -52,17 +52,17 @@
#define KVM_GUEST_KSEG1ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG1)
#define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
-#define KVM_INVALID_PAGE 0xdeadbeef
-#define KVM_INVALID_INST 0xdeadbeef
-#define KVM_INVALID_ADDR 0xdeadbeef
+#define KVM_INVALID_PAGE 0xdeadbeef
+#define KVM_INVALID_INST 0xdeadbeef
+#define KVM_INVALID_ADDR 0xdeadbeef
-#define KVM_MALTA_GUEST_RTC_ADDR 0xb8000070UL
+#define KVM_MALTA_GUEST_RTC_ADDR 0xb8000070UL
-#define GUEST_TICKS_PER_JIFFY (40000000/HZ)
-#define MS_TO_NS(x) (x * 1E6L)
+#define GUEST_TICKS_PER_JIFFY (40000000/HZ)
+#define MS_TO_NS(x) (x * 1E6L)
-#define CAUSEB_DC 27
-#define CAUSEF_DC (_ULCAST_(1) << 27)
+#define CAUSEB_DC 27
+#define CAUSEF_DC (_ULCAST_(1) << 27)
struct kvm;
struct kvm_run;
@@ -126,8 +126,8 @@ struct kvm_arch {
int commpage_tlb;
};
-#define N_MIPS_COPROC_REGS 32
-#define N_MIPS_COPROC_SEL 8
+#define N_MIPS_COPROC_REGS 32
+#define N_MIPS_COPROC_SEL 8
struct mips_coproc {
unsigned long reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];
@@ -139,124 +139,124 @@ struct mips_coproc {
/*
* Coprocessor 0 register names
*/
-#define MIPS_CP0_TLB_INDEX 0
-#define MIPS_CP0_TLB_RANDOM 1
-#define MIPS_CP0_TLB_LOW 2
-#define MIPS_CP0_TLB_LO0 2
-#define MIPS_CP0_TLB_LO1 3
-#define MIPS_CP0_TLB_CONTEXT 4
-#define MIPS_CP0_TLB_PG_MASK 5
-#define MIPS_CP0_TLB_WIRED 6
-#define MIPS_CP0_HWRENA 7
-#define MIPS_CP0_BAD_VADDR 8
-#define MIPS_CP0_COUNT 9
-#define MIPS_CP0_TLB_HI 10
-#define MIPS_CP0_COMPARE 11
-#define MIPS_CP0_STATUS 12
-#define MIPS_CP0_CAUSE 13
-#define MIPS_CP0_EXC_PC 14
-#define MIPS_CP0_PRID 15
-#define MIPS_CP0_CONFIG 16
-#define MIPS_CP0_LLADDR 17
-#define MIPS_CP0_WATCH_LO 18
-#define MIPS_CP0_WATCH_HI 19
-#define MIPS_CP0_TLB_XCONTEXT 20
-#define MIPS_CP0_ECC 26
-#define MIPS_CP0_CACHE_ERR 27
-#define MIPS_CP0_TAG_LO 28
-#define MIPS_CP0_TAG_HI 29
-#define MIPS_CP0_ERROR_PC 30
-#define MIPS_CP0_DEBUG 23
-#define MIPS_CP0_DEPC 24
-#define MIPS_CP0_PERFCNT 25
-#define MIPS_CP0_ERRCTL 26
-#define MIPS_CP0_DATA_LO 28
-#define MIPS_CP0_DATA_HI 29
-#define MIPS_CP0_DESAVE 31
-
-#define MIPS_CP0_CONFIG_SEL 0
-#define MIPS_CP0_CONFIG1_SEL 1
-#define MIPS_CP0_CONFIG2_SEL 2
-#define MIPS_CP0_CONFIG3_SEL 3
+#define MIPS_CP0_TLB_INDEX 0
+#define MIPS_CP0_TLB_RANDOM 1
+#define MIPS_CP0_TLB_LOW 2
+#define MIPS_CP0_TLB_LO0 2
+#define MIPS_CP0_TLB_LO1 3
+#define MIPS_CP0_TLB_CONTEXT 4
+#define MIPS_CP0_TLB_PG_MASK 5
+#define MIPS_CP0_TLB_WIRED 6
+#define MIPS_CP0_HWRENA 7
+#define MIPS_CP0_BAD_VADDR 8
+#define MIPS_CP0_COUNT 9
+#define MIPS_CP0_TLB_HI 10
+#define MIPS_CP0_COMPARE 11
+#define MIPS_CP0_STATUS 12
+#define MIPS_CP0_CAUSE 13
+#define MIPS_CP0_EXC_PC 14
+#define MIPS_CP0_PRID 15
+#define MIPS_CP0_CONFIG 16
+#define MIPS_CP0_LLADDR 17
+#define MIPS_CP0_WATCH_LO 18
+#define MIPS_CP0_WATCH_HI 19
+#define MIPS_CP0_TLB_XCONTEXT 20
+#define MIPS_CP0_ECC 26
+#define MIPS_CP0_CACHE_ERR 27
+#define MIPS_CP0_TAG_LO 28
+#define MIPS_CP0_TAG_HI 29
+#define MIPS_CP0_ERROR_PC 30
+#define MIPS_CP0_DEBUG 23
+#define MIPS_CP0_DEPC 24
+#define MIPS_CP0_PERFCNT 25
+#define MIPS_CP0_ERRCTL 26
+#define MIPS_CP0_DATA_LO 28
+#define MIPS_CP0_DATA_HI 29
+#define MIPS_CP0_DESAVE 31
+
+#define MIPS_CP0_CONFIG_SEL 0
+#define MIPS_CP0_CONFIG1_SEL 1
+#define MIPS_CP0_CONFIG2_SEL 2
+#define MIPS_CP0_CONFIG3_SEL 3
/* Config0 register bits */
-#define CP0C0_M 31
-#define CP0C0_K23 28
-#define CP0C0_KU 25
-#define CP0C0_MDU 20
-#define CP0C0_MM 17
-#define CP0C0_BM 16
-#define CP0C0_BE 15
-#define CP0C0_AT 13
-#define CP0C0_AR 10
-#define CP0C0_MT 7
-#define CP0C0_VI 3
-#define CP0C0_K0 0
+#define CP0C0_M 31
+#define CP0C0_K23 28
+#define CP0C0_KU 25
+#define CP0C0_MDU 20
+#define CP0C0_MM 17
+#define CP0C0_BM 16
+#define CP0C0_BE 15
+#define CP0C0_AT 13
+#define CP0C0_AR 10
+#define CP0C0_MT 7
+#define CP0C0_VI 3
+#define CP0C0_K0 0
/* Config1 register bits */
-#define CP0C1_M 31
-#define CP0C1_MMU 25
-#define CP0C1_IS 22
-#define CP0C1_IL 19
-#define CP0C1_IA 16
-#define CP0C1_DS 13
-#define CP0C1_DL 10
-#define CP0C1_DA 7
-#define CP0C1_C2 6
-#define CP0C1_MD 5
-#define CP0C1_PC 4
-#define CP0C1_WR 3
-#define CP0C1_CA 2
-#define CP0C1_EP 1
-#define CP0C1_FP 0
+#define CP0C1_M 31
+#define CP0C1_MMU 25
+#define CP0C1_IS 22
+#define CP0C1_IL 19
+#define CP0C1_IA 16
+#define CP0C1_DS 13
+#define CP0C1_DL 10
+#define CP0C1_DA 7
+#define CP0C1_C2 6
+#define CP0C1_MD 5
+#define CP0C1_PC 4
+#define CP0C1_WR 3
+#define CP0C1_CA 2
+#define CP0C1_EP 1
+#define CP0C1_FP 0
/* Config2 Register bits */
-#define CP0C2_M 31
-#define CP0C2_TU 28
-#define CP0C2_TS 24
-#define CP0C2_TL 20
-#define CP0C2_TA 16
-#define CP0C2_SU 12
-#define CP0C2_SS 8
-#define CP0C2_SL 4
-#define CP0C2_SA 0
+#define CP0C2_M 31
+#define CP0C2_TU 28
+#define CP0C2_TS 24
+#define CP0C2_TL 20
+#define CP0C2_TA 16
+#define CP0C2_SU 12
+#define CP0C2_SS 8
+#define CP0C2_SL 4
+#define CP0C2_SA 0
/* Config3 Register bits */
-#define CP0C3_M 31
-#define CP0C3_ISA_ON_EXC 16
-#define CP0C3_ULRI 13
-#define CP0C3_DSPP 10
-#define CP0C3_LPA 7
-#define CP0C3_VEIC 6
-#define CP0C3_VInt 5
-#define CP0C3_SP 4
-#define CP0C3_MT 2
-#define CP0C3_SM 1
-#define CP0C3_TL 0
+#define CP0C3_M 31
+#define CP0C3_ISA_ON_EXC 16
+#define CP0C3_ULRI 13
+#define CP0C3_DSPP 10
+#define CP0C3_LPA 7
+#define CP0C3_VEIC 6
+#define CP0C3_VInt 5
+#define CP0C3_SP 4
+#define CP0C3_MT 2
+#define CP0C3_SM 1
+#define CP0C3_TL 0
/* Have config1, Cacheable, noncoherent, write-back, write allocate*/
-#define MIPS_CONFIG0 \
+#define MIPS_CONFIG0 \
((1 << CP0C0_M) | (0x3 << CP0C0_K0))
/* Have config2, no coprocessor2 attached, no MDMX support attached,
no performance counters, watch registers present,
no code compression, EJTAG present, no FPU, no watch registers */
-#define MIPS_CONFIG1 \
-((1 << CP0C1_M) | \
- (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) | \
- (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) | \
+#define MIPS_CONFIG1 \
+((1 << CP0C1_M) | \
+ (0 << CP0C1_C2) | (0 << CP0C1_MD) | (0 << CP0C1_PC) | \
+ (0 << CP0C1_WR) | (0 << CP0C1_CA) | (1 << CP0C1_EP) | \
(0 << CP0C1_FP))
/* Have config3, no tertiary/secondary caches implemented */
-#define MIPS_CONFIG2 \
+#define MIPS_CONFIG2 \
((1 << CP0C2_M))
/* No config4, no DSP ASE, no large physaddr (PABITS),
no external interrupt controller, no vectored interrupts,
no 1kb pages, no SmartMIPS ASE, no trace logic */
-#define MIPS_CONFIG3 \
-((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) | \
- (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) | \
+#define MIPS_CONFIG3 \
+((0 << CP0C3_M) | (0 << CP0C3_DSPP) | (0 << CP0C3_LPA) | \
+ (0 << CP0C3_VEIC) | (0 << CP0C3_VInt) | (0 << CP0C3_SP) | \
(0 << CP0C3_SM) | (0 << CP0C3_TL))
/* MMU types, the first four entries have the same layout as the
@@ -274,36 +274,36 @@ enum mips_mmu_types {
/*
* Trap codes
*/
-#define T_INT 0 /* Interrupt pending */
-#define T_TLB_MOD 1 /* TLB modified fault */
-#define T_TLB_LD_MISS 2 /* TLB miss on load or ifetch */
-#define T_TLB_ST_MISS 3 /* TLB miss on a store */
-#define T_ADDR_ERR_LD 4 /* Address error on a load or ifetch */
-#define T_ADDR_ERR_ST 5 /* Address error on a store */
-#define T_BUS_ERR_IFETCH 6 /* Bus error on an ifetch */
-#define T_BUS_ERR_LD_ST 7 /* Bus error on a load or store */
-#define T_SYSCALL 8 /* System call */
-#define T_BREAK 9 /* Breakpoint */
-#define T_RES_INST 10 /* Reserved instruction exception */
-#define T_COP_UNUSABLE 11 /* Coprocessor unusable */
-#define T_OVFLOW 12 /* Arithmetic overflow */
+#define T_INT 0 /* Interrupt pending */
+#define T_TLB_MOD 1 /* TLB modified fault */
+#define T_TLB_LD_MISS 2 /* TLB miss on load or ifetch */
+#define T_TLB_ST_MISS 3 /* TLB miss on a store */
+#define T_ADDR_ERR_LD 4 /* Address error on a load or ifetch */
+#define T_ADDR_ERR_ST 5 /* Address error on a store */
+#define T_BUS_ERR_IFETCH 6 /* Bus error on an ifetch */
+#define T_BUS_ERR_LD_ST 7 /* Bus error on a load or store */
+#define T_SYSCALL 8 /* System call */
+#define T_BREAK 9 /* Breakpoint */
+#define T_RES_INST 10 /* Reserved instruction exception */
+#define T_COP_UNUSABLE 11 /* Coprocessor unusable */
+#define T_OVFLOW 12 /* Arithmetic overflow */
/*
* Trap definitions added for r4000 port.
*/
-#define T_TRAP 13 /* Trap instruction */
-#define T_VCEI 14 /* Virtual coherency exception */
-#define T_FPE 15 /* Floating point exception */
-#define T_WATCH 23 /* Watch address reference */
-#define T_VCED 31 /* Virtual coherency data */
+#define T_TRAP 13 /* Trap instruction */
+#define T_VCEI 14 /* Virtual coherency exception */
+#define T_FPE 15 /* Floating point exception */
+#define T_WATCH 23 /* Watch address reference */
+#define T_VCED 31 /* Virtual coherency data */
/* Resume Flags */
-#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */
-#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
+#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */
+#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
-#define RESUME_GUEST 0
-#define RESUME_GUEST_DR RESUME_FLAG_DR
-#define RESUME_HOST RESUME_FLAG_HOST
+#define RESUME_GUEST 0
+#define RESUME_GUEST_DR RESUME_FLAG_DR
+#define RESUME_HOST RESUME_FLAG_HOST
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -313,24 +313,27 @@ enum emulation_result {
EMULATE_PRIV_FAIL,
};
-#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V 0x00000002 /* Valid */
-#define MIPS3_PG_NV 0x00000000
-#define MIPS3_PG_D 0x00000004 /* Dirty */
+#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
+#define MIPS3_PG_V 0x00000002 /* Valid */
+#define MIPS3_PG_NV 0x00000000
+#define MIPS3_PG_D 0x00000004 /* Dirty */
#define mips3_paddr_to_tlbpfn(x) \
- (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
+ (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
#define mips3_tlbpfn_to_paddr(x) \
- ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
+ ((unsigned long)((x) & MIPS3_PG_FRAME) << MIPS3_PG_SHIFT)
-#define MIPS3_PG_SHIFT 6
-#define MIPS3_PG_FRAME 0x3fffffc0
+#define MIPS3_PG_SHIFT 6
+#define MIPS3_PG_FRAME 0x3fffffc0
-#define VPN2_MASK 0xffffe000
-#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && ((x).tlb_lo1 & MIPS3_PG_G))
-#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
-#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK)
-#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) ? ((x).tlb_lo1 & MIPS3_PG_V) : ((x).tlb_lo0 & MIPS3_PG_V))
+#define VPN2_MASK 0xffffe000
+#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \
+ ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
+#define TLB_ASID(x) ((x).tlb_hi & ASID_MASK)
+#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \
+ ? ((x).tlb_lo1 & MIPS3_PG_V) \
+ : ((x).tlb_lo0 & MIPS3_PG_V))
struct kvm_mips_tlb {
long tlb_mask;
@@ -339,7 +342,7 @@ struct kvm_mips_tlb {
long tlb_lo1;
};
-#define KVM_MIPS_GUEST_TLB_SIZE 64
+#define KVM_MIPS_GUEST_TLB_SIZE 64
struct kvm_vcpu_arch {
void *host_ebase, *guest_ebase;
unsigned long host_stack;
@@ -400,65 +403,67 @@ struct kvm_vcpu_arch {
};
-#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0])
-#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
-#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0])
-#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0])
-#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
-#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
-#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
-#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
-#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
-#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0])
-#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
-#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0])
-#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
-#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0])
-#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val))
-#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0])
-#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
-#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0])
-#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
-#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0])
-#define kvm_write_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] = (val))
-#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1])
-#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val))
-#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0])
-#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
-#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0])
-#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
-#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0])
-#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val))
-#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1])
-#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val))
-#define kvm_read_c0_guest_config(cop0) (cop0->reg[MIPS_CP0_CONFIG][0])
-#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1])
-#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2])
-#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3])
-#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7])
-#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
-#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
-#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
-#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
-#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
-#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0])
-#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
-
-#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
-#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
-#define kvm_set_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
-#define kvm_clear_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
-#define kvm_change_c0_guest_cause(cop0, change, val) \
-{ \
- kvm_clear_c0_guest_cause(cop0, change); \
- kvm_set_c0_guest_cause(cop0, ((val) & (change))); \
+#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0])
+#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
+#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0])
+#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0])
+#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
+#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
+#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
+#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
+#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
+#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0])
+#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
+#define kvm_read_c0_guest_hwrena(cop0) (cop0->reg[MIPS_CP0_HWRENA][0])
+#define kvm_write_c0_guest_hwrena(cop0, val) (cop0->reg[MIPS_CP0_HWRENA][0] = (val))
+#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0])
+#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
+#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0])
+#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val))
+#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0])
+#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
+#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0])
+#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
+#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0])
+#define kvm_write_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] = (val))
+#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1])
+#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val))
+#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0])
+#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
+#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0])
+#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
+#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0])
+#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val))
+#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1])
+#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val))
+#define kvm_read_c0_guest_config(cop0) (cop0->reg[MIPS_CP0_CONFIG][0])
+#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1])
+#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2])
+#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3])
+#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7])
+#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
+#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
+#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
+#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
+#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
+#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0])
+#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+
+#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
+#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
+#define kvm_set_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] |= (val))
+#define kvm_clear_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] &= ~(val))
+#define kvm_change_c0_guest_cause(cop0, change, val) \
+{ \
+ kvm_clear_c0_guest_cause(cop0, change); \
+ kvm_set_c0_guest_cause(cop0, ((val) & (change))); \
}
-#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val))
-#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
-#define kvm_change_c0_guest_ebase(cop0, change, val) \
-{ \
- kvm_clear_c0_guest_ebase(cop0, change); \
- kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \
+#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val))
+#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
+#define kvm_change_c0_guest_ebase(cop0, change, val) \
+{ \
+ kvm_clear_c0_guest_ebase(cop0, change); \
+ kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \
}
diff --git a/arch/mips/kvm/kvm_mips_emul.c b/arch/mips/kvm/kvm_mips_emul.c
index 4b6274b47f33..e3fec99941a7 100644
--- a/arch/mips/kvm/kvm_mips_emul.c
+++ b/arch/mips/kvm/kvm_mips_emul.c
@@ -436,13 +436,6 @@ kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, uint32_t cause,
sel = inst & 0x7;
co_bit = (inst >> 25) & 1;
- /* Verify that the register is valid */
- if (rd > MIPS_CP0_DESAVE) {
- printk("Invalid rd: %d\n", rd);
- er = EMULATE_FAIL;
- goto done;
- }
-
if (co_bit) {
op = (inst) & 0xff;
@@ -1542,8 +1535,15 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
}
if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+ int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
int rd = (inst & RD) >> 11;
int rt = (inst & RT) >> 16;
+ /* If usermode, check RDHWR rd is allowed by guest HWREna */
+ if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
+ kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
+ rd, opc);
+ goto emulate_ri;
+ }
switch (rd) {
case 0: /* CPU number */
arch->gprs[rt] = 0;
@@ -1567,31 +1567,27 @@ kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
}
break;
case 29:
-#if 1
arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
-#else
- /* UserLocal not implemented */
- er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
-#endif
break;
default:
- printk("RDHWR not supported\n");
- er = EMULATE_FAIL;
- break;
+ kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
+ goto emulate_ri;
}
} else {
- printk("Emulate RI not supported @ %p: %#x\n", opc, inst);
- er = EMULATE_FAIL;
+ kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+ goto emulate_ri;
}
+ return EMULATE_DONE;
+
+emulate_ri:
/*
- * Rollback PC only if emulation was unsuccessful
+ * Rollback PC (if in branch delay slot then the PC already points to
+ * branch target), and pass the RI exception to the guest OS.
*/
- if (er == EMULATE_FAIL) {
- vcpu->arch.pc = curr_pc;
- }
- return er;
+ vcpu->arch.pc = curr_pc;
+ return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
}
enum emulation_result
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 83851aabfdc8..bb1e38a23ac7 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -304,6 +304,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
return vcpu->arch.fault_dar;
}
+static inline bool is_kvmppc_resume_guest(int r)
+{
+ return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
+}
+
/* Magic register values loaded into r3 and r4 before the 'sc' assembly
* instruction for the OSI hypercalls */
#define OSI_SC_MAGIC_R3 0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index bf0fa8b0a883..51388befeddb 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -289,6 +289,18 @@ static inline void note_hpte_modification(struct kvm *kvm,
if (atomic_read(&kvm->arch.hpte_mod_interest))
rev->guest_rpte |= HPTE_GR_MODIFIED;
}
+
+/*
+ * Like kvm_memslots(), but for use in real mode when we can't do
+ * any RCU stuff (since the secondary threads are offline from the
+ * kernel's point of view), and we can't print anything.
+ * Thus we use rcu_dereference_raw() rather than rcu_dereference_check().
+ */
+static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
+{
+ return rcu_dereference_raw_notrace(kvm->memslots);
+}
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index f3a91dc02c98..821725c1bf46 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -94,7 +94,7 @@ struct kvmppc_host_state {
unsigned long xics_phys;
u32 saved_xirr;
u64 dabr;
- u64 host_mmcr[3];
+ u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
u32 host_pmc[8];
u64 host_purr;
u64 host_spurr;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index fcd53f0d34ba..4096f16502a9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -129,6 +129,8 @@ extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+ unsigned long ioba);
extern struct kvm_rma_info *kvm_alloc_rma(void);
extern void kvm_release_rma(struct kvm_rma_info *ri);
extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 1a36b8ede417..0dcc48af25a3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -213,6 +213,7 @@
#define SPRN_ACOP 0x1F /* Available Coprocessor Register */
#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */
#define SPRN_TEXASR 0x82 /* Transaction EXception & Summary */
+#define TEXASR_FS __MASK(63-36) /* Transaction Failure Summary */
#define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */
#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */
#define SPRN_CTRLF 0x088
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index 0c9f8b74dd97..c22d704b6d41 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -7,6 +7,8 @@
#include <uapi/asm/tm.h>
+#ifndef __ASSEMBLY__
+
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
extern void do_load_up_transact_fpu(struct thread_struct *thread);
extern void do_load_up_transact_altivec(struct thread_struct *thread);
@@ -21,3 +23,5 @@ extern void tm_recheckpoint(struct thread_struct *thread,
extern void tm_abort(uint8_t cause);
extern void tm_save_sprs(struct thread_struct *thread);
extern void tm_restore_sprs(struct thread_struct *thread);
+
+#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 303ece75b8e4..fb25ebc0af0c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -262,7 +262,14 @@ int kvmppc_mmu_hv_init(void)
static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
- kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+ unsigned long msr = vcpu->arch.intr_msr;
+
+ /* If transactional, change to suspend mode on IRQ delivery */
+ if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+ msr |= MSR_TS_S;
+ else
+ msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
+ kvmppc_set_msr(vcpu, msr);
}
/*
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 2c25f5412bdb..89e96b3e0039 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -75,3 +75,31 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
return H_TOO_HARD;
}
EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+ unsigned long ioba)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvmppc_spapr_tce_table *stt;
+
+ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+ if (stt->liobn == liobn) {
+ unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+ struct page *page;
+ u64 *tbl;
+
+ if (ioba >= stt->window_size)
+ return H_PARAMETER;
+
+ page = stt->pages[idx / TCES_PER_PAGE];
+ tbl = (u64 *)page_address(page);
+
+ vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+ return H_SUCCESS;
+ }
+ }
+
+ /* Didn't find the liobn, punt it to userspace */
+ return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 17fc9496b6ac..8227dba5af0f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -86,7 +86,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
/* CPU points to the first thread of the core */
if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_KVM_XICS
+#ifdef CONFIG_PPC_ICP_NATIVE
int real_cpu = cpu + vcpu->arch.ptid;
if (paca[real_cpu].kvm_hstate.xics_phys)
xics_wake_cpu(real_cpu);
@@ -879,17 +879,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_IAMR:
*val = get_reg_val(id, vcpu->arch.iamr);
break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- case KVM_REG_PPC_TFHAR:
- *val = get_reg_val(id, vcpu->arch.tfhar);
- break;
- case KVM_REG_PPC_TFIAR:
- *val = get_reg_val(id, vcpu->arch.tfiar);
- break;
- case KVM_REG_PPC_TEXASR:
- *val = get_reg_val(id, vcpu->arch.texasr);
- break;
-#endif
case KVM_REG_PPC_FSCR:
*val = get_reg_val(id, vcpu->arch.fscr);
break;
@@ -970,6 +959,69 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_PPR:
*val = get_reg_val(id, vcpu->arch.ppr);
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case KVM_REG_PPC_TFHAR:
+ *val = get_reg_val(id, vcpu->arch.tfhar);
+ break;
+ case KVM_REG_PPC_TFIAR:
+ *val = get_reg_val(id, vcpu->arch.tfiar);
+ break;
+ case KVM_REG_PPC_TEXASR:
+ *val = get_reg_val(id, vcpu->arch.texasr);
+ break;
+ case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+ i = id - KVM_REG_PPC_TM_GPR0;
+ *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+ break;
+ case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+ {
+ int j;
+ i = id - KVM_REG_PPC_TM_VSR0;
+ if (i < 32)
+ for (j = 0; j < TS_FPRWIDTH; j++)
+ val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+ else {
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ val->vval = vcpu->arch.vr_tm.vr[i-32];
+ else
+ r = -ENXIO;
+ }
+ break;
+ }
+ case KVM_REG_PPC_TM_CR:
+ *val = get_reg_val(id, vcpu->arch.cr_tm);
+ break;
+ case KVM_REG_PPC_TM_LR:
+ *val = get_reg_val(id, vcpu->arch.lr_tm);
+ break;
+ case KVM_REG_PPC_TM_CTR:
+ *val = get_reg_val(id, vcpu->arch.ctr_tm);
+ break;
+ case KVM_REG_PPC_TM_FPSCR:
+ *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+ break;
+ case KVM_REG_PPC_TM_AMR:
+ *val = get_reg_val(id, vcpu->arch.amr_tm);
+ break;
+ case KVM_REG_PPC_TM_PPR:
+ *val = get_reg_val(id, vcpu->arch.ppr_tm);
+ break;
+ case KVM_REG_PPC_TM_VRSAVE:
+ *val = get_reg_val(id, vcpu->arch.vrsave_tm);
+ break;
+ case KVM_REG_PPC_TM_VSCR:
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+ else
+ r = -ENXIO;
+ break;
+ case KVM_REG_PPC_TM_DSCR:
+ *val = get_reg_val(id, vcpu->arch.dscr_tm);
+ break;
+ case KVM_REG_PPC_TM_TAR:
+ *val = get_reg_val(id, vcpu->arch.tar_tm);
+ break;
+#endif
case KVM_REG_PPC_ARCH_COMPAT:
*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
break;
@@ -1039,17 +1091,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_IAMR:
vcpu->arch.iamr = set_reg_val(id, *val);
break;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- case KVM_REG_PPC_TFHAR:
- vcpu->arch.tfhar = set_reg_val(id, *val);
- break;
- case KVM_REG_PPC_TFIAR:
- vcpu->arch.tfiar = set_reg_val(id, *val);
- break;
- case KVM_REG_PPC_TEXASR:
- vcpu->arch.texasr = set_reg_val(id, *val);
- break;
-#endif
case KVM_REG_PPC_FSCR:
vcpu->arch.fscr = set_reg_val(id, *val);
break;
@@ -1144,6 +1185,68 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_PPR:
vcpu->arch.ppr = set_reg_val(id, *val);
break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case KVM_REG_PPC_TFHAR:
+ vcpu->arch.tfhar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TFIAR:
+ vcpu->arch.tfiar = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TEXASR:
+ vcpu->arch.texasr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+ i = id - KVM_REG_PPC_TM_GPR0;
+ vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+ {
+ int j;
+ i = id - KVM_REG_PPC_TM_VSR0;
+ if (i < 32)
+ for (j = 0; j < TS_FPRWIDTH; j++)
+ vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+ else
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ vcpu->arch.vr_tm.vr[i-32] = val->vval;
+ else
+ r = -ENXIO;
+ break;
+ }
+ case KVM_REG_PPC_TM_CR:
+ vcpu->arch.cr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_LR:
+ vcpu->arch.lr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_CTR:
+ vcpu->arch.ctr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_FPSCR:
+ vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_AMR:
+ vcpu->arch.amr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_PPR:
+ vcpu->arch.ppr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VRSAVE:
+ vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_VSCR:
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+ else
+ r = - ENXIO;
+ break;
+ case KVM_REG_PPC_TM_DSCR:
+ vcpu->arch.dscr_tm = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_TM_TAR:
+ vcpu->arch.tar_tm = set_reg_val(id, *val);
+ break;
+#endif
case KVM_REG_PPC_ARCH_COMPAT:
r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
break;
@@ -1360,9 +1463,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
smp_wmb();
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
if (cpu != smp_processor_id()) {
-#ifdef CONFIG_KVM_XICS
xics_wake_cpu(cpu);
-#endif
if (vcpu->arch.ptid)
++vc->n_woken;
}
@@ -1530,7 +1631,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
vcpu->arch.trap = 0;
if (vcpu->arch.ceded) {
- if (ret != RESUME_GUEST)
+ if (!is_kvmppc_resume_guest(ret))
kvmppc_end_cede(vcpu);
else
kvmppc_set_timer(vcpu);
@@ -1541,7 +1642,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_INACTIVE;
list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
arch.run_list) {
- if (vcpu->arch.ret != RESUME_GUEST) {
+ if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
kvmppc_remove_runnable(vc, vcpu);
wake_up(&vcpu->arch.cpu_run);
}
@@ -1731,7 +1832,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
}
- } while (r == RESUME_GUEST);
+ } while (is_kvmppc_resume_guest(r));
out:
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -2366,7 +2467,7 @@ static int kvmppc_book3s_init_hv(void)
*/
r = kvmppc_core_check_processor_compat_hv();
if (r < 0)
- return r;
+ return -ENODEV;
kvm_ops_hv.owner = THIS_MODULE;
kvmppc_hv_ops = &kvm_ops_hv;
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index e873796b1a29..e18e3cfc32de 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -71,6 +71,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtmsrd r10,1
/* Save host PMU registers */
+BEGIN_FTR_SECTION
+ /* Work around P8 PMAE bug */
+ li r3, -1
+ clrrdi r3, r3, 10
+ mfspr r8, SPRN_MMCR2
+ mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
+ isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r7, SPRN_MMCR0 /* save MMCR0 */
@@ -87,9 +95,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
cmpwi r5, 0
beq 31f /* skip if not */
mfspr r5, SPRN_MMCR1
+ mfspr r9, SPRN_SIAR
+ mfspr r10, SPRN_SDAR
std r7, HSTATE_MMCR(r13)
std r5, HSTATE_MMCR + 8(r13)
std r6, HSTATE_MMCR + 16(r13)
+ std r9, HSTATE_MMCR + 24(r13)
+ std r10, HSTATE_MMCR + 32(r13)
+BEGIN_FTR_SECTION
+ mfspr r9, SPRN_SIER
+ std r8, HSTATE_MMCR + 40(r13)
+ std r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
@@ -110,6 +127,11 @@ BEGIN_FTR_SECTION
stw r10, HSTATE_PMC + 24(r13)
stw r11, HSTATE_PMC + 28(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+BEGIN_FTR_SECTION
+ mfspr r9, SPRN_SIER
+ std r8, HSTATE_MMCR + 40(r13)
+ std r9, HSTATE_MMCR + 48(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
31:
/*
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 37fb3caa4c80..1d6c56ad5b60 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -111,7 +111,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
ptel = rev->guest_rpte |= rcbits;
gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
- memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
if (!memslot)
return;
@@ -192,7 +192,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Find the memslot (if any) for this address */
gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
gfn = gpa >> PAGE_SHIFT;
- memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
pa = 0;
is_io = ~0ul;
rmap = NULL;
@@ -670,7 +670,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
psize = hpte_page_size(v, r);
gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
- memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
if (memslot) {
hva = __gfn_to_hva_memslot(memslot, gfn);
pte = lookup_linux_pte_and_update(pgdir, hva,
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 53d647f8e741..ffbb871c2bd8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -28,6 +28,9 @@
#include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/mmu-hash64.h>
+#include <asm/tm.h>
+
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
#ifdef __LITTLE_ENDIAN__
#error Need to fix lppaca and SLB shadow accesses in little endian mode
@@ -106,8 +109,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
ld r3, HSTATE_MMCR(r13)
ld r4, HSTATE_MMCR + 8(r13)
ld r5, HSTATE_MMCR + 16(r13)
+ ld r6, HSTATE_MMCR + 24(r13)
+ ld r7, HSTATE_MMCR + 32(r13)
mtspr SPRN_MMCR1, r4
mtspr SPRN_MMCRA, r5
+ mtspr SPRN_SIAR, r6
+ mtspr SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+ ld r8, HSTATE_MMCR + 40(r13)
+ ld r9, HSTATE_MMCR + 48(r13)
+ mtspr SPRN_MMCR2, r8
+ mtspr SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
23:
@@ -597,6 +610,116 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ b skip_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+
+ /* Turn on TM/FP/VSX/VMX so we can restore them. */
+ mfmsr r5
+ li r6, MSR_TM >> 32
+ sldi r6, r6, 32
+ or r5, r5, r6
+ ori r5, r5, MSR_FP
+ oris r5, r5, (MSR_VEC | MSR_VSX)@h
+ mtmsrd r5
+
+ /*
+ * The user may change these outside of a transaction, so they must
+ * always be context switched.
+ */
+ ld r5, VCPU_TFHAR(r4)
+ ld r6, VCPU_TFIAR(r4)
+ ld r7, VCPU_TEXASR(r4)
+ mtspr SPRN_TFHAR, r5
+ mtspr SPRN_TFIAR, r6
+ mtspr SPRN_TEXASR, r7
+
+ ld r5, VCPU_MSR(r4)
+ rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+ beq skip_tm /* TM not active in guest */
+
+ /* Make sure the failure summary is set, otherwise we'll program check
+ * when we trechkpt. It's possible that this might have been not set
+ * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+ * host.
+ */
+ oris r7, r7, (TEXASR_FS)@h
+ mtspr SPRN_TEXASR, r7
+
+ /*
+ * We need to load up the checkpointed state for the guest.
+ * We need to do this early as it will blow away any GPRs, VSRs and
+ * some SPRs.
+ */
+
+ mr r31, r4
+ addi r3, r31, VCPU_FPRS_TM
+ bl .load_fp_state
+ addi r3, r31, VCPU_VRS_TM
+ bl .load_vr_state
+ mr r4, r31
+ lwz r7, VCPU_VRSAVE_TM(r4)
+ mtspr SPRN_VRSAVE, r7
+
+ ld r5, VCPU_LR_TM(r4)
+ lwz r6, VCPU_CR_TM(r4)
+ ld r7, VCPU_CTR_TM(r4)
+ ld r8, VCPU_AMR_TM(r4)
+ ld r9, VCPU_TAR_TM(r4)
+ mtlr r5
+ mtcr r6
+ mtctr r7
+ mtspr SPRN_AMR, r8
+ mtspr SPRN_TAR, r9
+
+ /*
+ * Load up PPR and DSCR values but don't put them in the actual SPRs
+ * till the last moment to avoid running with userspace PPR and DSCR for
+ * too long.
+ */
+ ld r29, VCPU_DSCR_TM(r4)
+ ld r30, VCPU_PPR_TM(r4)
+
+ std r2, PACATMSCRATCH(r13) /* Save TOC */
+
+ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+ li r5, 0
+ mtmsrd r5, 1
+
+ /* Load GPRs r0-r28 */
+ reg = 0
+ .rept 29
+ ld reg, VCPU_GPRS_TM(reg)(r31)
+ reg = reg + 1
+ .endr
+
+ mtspr SPRN_DSCR, r29
+ mtspr SPRN_PPR, r30
+
+ /* Load final GPRs */
+ ld 29, VCPU_GPRS_TM(29)(r31)
+ ld 30, VCPU_GPRS_TM(30)(r31)
+ ld 31, VCPU_GPRS_TM(31)(r31)
+
+ /* TM checkpointed state is now setup. All GPRs are now volatile. */
+ TRECHKPT
+
+ /* Now let's get back the state we need. */
+ HMT_MEDIUM
+ GET_PACA(r13)
+ ld r29, HSTATE_DSCR(r13)
+ mtspr SPRN_DSCR, r29
+ ld r4, HSTATE_KVM_VCPU(r13)
+ ld r1, HSTATE_HOST_R1(r13)
+ ld r2, PACATMSCRATCH(r13)
+
+ /* Set the MSR RI since we have our registers back. */
+ li r5, MSR_RI
+ mtmsrd r5, 1
+skip_tm:
+#endif
+
/* Load guest PMU registers */
/* R4 is live here (vcpu pointer) */
li r3, 1
@@ -704,14 +827,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
ld r6, VCPU_VTB(r4)
mtspr SPRN_IC, r5
mtspr SPRN_VTB, r6
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- ld r5, VCPU_TFHAR(r4)
- ld r6, VCPU_TFIAR(r4)
- ld r7, VCPU_TEXASR(r4)
- mtspr SPRN_TFHAR, r5
- mtspr SPRN_TFIAR, r6
- mtspr SPRN_TEXASR, r7
-#endif
ld r8, VCPU_EBBHR(r4)
mtspr SPRN_EBBHR, r8
ld r5, VCPU_EBBRR(r4)
@@ -736,6 +851,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
* Set the decrementer to the guest decrementer.
*/
ld r8,VCPU_DEC_EXPIRES(r4)
+ /* r8 is a host timebase value here, convert to guest TB */
+ ld r5,HSTATE_KVM_VCORE(r13)
+ ld r6,VCORE_TB_OFFSET(r5)
+ add r8,r8,r6
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
@@ -817,7 +936,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
12: mtspr SPRN_SRR0, r10
mr r10,r0
mtspr SPRN_SRR1, r11
- ld r11, VCPU_INTR_MSR(r4)
+ mr r9, r4
+ bl kvmppc_msr_interrupt
5:
/*
@@ -1098,17 +1218,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
mftb r6
extsw r5,r5
add r5,r5,r6
+ /* r5 is a guest timebase value here, convert to host TB */
+ ld r3,HSTATE_KVM_VCORE(r13)
+ ld r4,VCORE_TB_OFFSET(r3)
+ subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
BEGIN_FTR_SECTION
b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
- /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
- mfmsr r8
- li r0, 1
- rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r8
-
/* Save POWER8-specific registers */
mfspr r5, SPRN_IAMR
mfspr r6, SPRN_PSPB
@@ -1122,14 +1240,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
std r5, VCPU_IC(r9)
std r6, VCPU_VTB(r9)
std r7, VCPU_TAR(r9)
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
- mfspr r5, SPRN_TFHAR
- mfspr r6, SPRN_TFIAR
- mfspr r7, SPRN_TEXASR
- std r5, VCPU_TFHAR(r9)
- std r6, VCPU_TFIAR(r9)
- std r7, VCPU_TEXASR(r9)
-#endif
mfspr r8, SPRN_EBBHR
std r8, VCPU_EBBHR(r9)
mfspr r5, SPRN_EBBRR
@@ -1387,7 +1497,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0
beq 17f
- mftb r6 /* current host timebase */
+ mftb r6 /* current guest timebase */
subf r8,r8,r6
mtspr SPRN_TBU40,r8 /* update upper 40 bits */
mftb r7 /* check if lower 24 bits overflowed */
@@ -1557,7 +1667,7 @@ kvmppc_hdsi:
mtspr SPRN_SRR0, r10
mtspr SPRN_SRR1, r11
li r10, BOOK3S_INTERRUPT_DATA_STORAGE
- ld r11, VCPU_INTR_MSR(r9)
+ bl kvmppc_msr_interrupt
fast_interrupt_c_return:
6: ld r7, VCPU_CTR(r9)
lwz r8, VCPU_XER(r9)
@@ -1626,7 +1736,7 @@ kvmppc_hisi:
1: mtspr SPRN_SRR0, r10
mtspr SPRN_SRR1, r11
li r10, BOOK3S_INTERRUPT_INST_STORAGE
- ld r11, VCPU_INTR_MSR(r9)
+ bl kvmppc_msr_interrupt
b fast_interrupt_c_return
3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */
@@ -1669,7 +1779,7 @@ sc_1_fast_return:
mtspr SPRN_SRR0,r10
mtspr SPRN_SRR1,r11
li r10, BOOK3S_INTERRUPT_SYSCALL
- ld r11, VCPU_INTR_MSR(r9)
+ bl kvmppc_msr_interrupt
mr r4,r9
b fast_guest_return
@@ -1691,7 +1801,7 @@ hcall_real_table:
.long 0 /* 0x10 - H_CLEAR_MOD */
.long 0 /* 0x14 - H_CLEAR_REF */
.long .kvmppc_h_protect - hcall_real_table
- .long 0 /* 0x1c - H_GET_TCE */
+ .long .kvmppc_h_get_tce - hcall_real_table
.long .kvmppc_h_put_tce - hcall_real_table
.long 0 /* 0x24 - H_SET_SPRG0 */
.long .kvmppc_h_set_dabr - hcall_real_table
@@ -1997,7 +2107,7 @@ machine_check_realmode:
beq mc_cont
/* If not, deliver a machine check. SRR0/1 are already set */
li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
- ld r11, VCPU_INTR_MSR(r9)
+ bl kvmppc_msr_interrupt
b fast_interrupt_c_return
/*
@@ -2138,8 +2248,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mfspr r6,SPRN_VRSAVE
stw r6,VCPU_VRSAVE(r31)
mtlr r30
- mtmsrd r5
- isync
blr
/*
@@ -2186,3 +2294,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
*/
kvmppc_bad_host_intr:
b .
+
+/*
+ * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
+ * from VCPU_INTR_MSR and is modified based on the required TM state changes.
+ * r11 has the guest MSR value (in/out)
+ * r9 has a vcpu pointer (in)
+ * r0 is used as a scratch register
+ */
+kvmppc_msr_interrupt:
+ rldicl r0, r11, 64 - MSR_TS_S_LG, 62
+ cmpwi r0, 2 /* Check if we are in transactional state.. */
+ ld r11, VCPU_INTR_MSR(r9)
+ bne 1f
+ /* ... if transactional, change to suspended */
+ li r0, 1
+1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+ blr
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index cf95cdef73c9..7a053157483b 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -213,8 +213,11 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
gpa_t args_phys;
int rc;
- /* r4 contains the guest physical address of the RTAS args */
- args_phys = kvmppc_get_gpr(vcpu, 4);
+ /*
+ * r4 contains the guest physical address of the RTAS args
+ * Mask off the top 4 bits since this is a guest real address
+ */
+ args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
if (rc)
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index 5f8bcc5fe423..35f0faab5361 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -53,6 +53,7 @@ enum interruption_class {
IRQIO_PCI,
IRQIO_MSI,
IRQIO_VIR,
+ IRQIO_VAI,
NMI_NMI,
CPU_RST,
NR_ARCH_IRQS
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 9bf95bb30f1a..154b60089be9 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -16,12 +16,22 @@
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
+#include <linux/kvm.h>
#include <asm/debug.h>
#include <asm/cpu.h>
+#include <asm/isc.h>
#define KVM_MAX_VCPUS 64
#define KVM_USER_MEM_SLOTS 32
+/*
+ * These seem to be used for allocating ->chip in the routing table,
+ * which we don't use. 4096 is an out-of-thin-air value. If we need
+ * to look at ->chip later on, we'll need to revisit this.
+ */
+#define KVM_NR_IRQCHIPS 1
+#define KVM_IRQCHIP_NUM_PINS 4096
+
struct sca_entry {
atomic_t scn;
__u32 reserved;
@@ -108,7 +118,9 @@ struct kvm_s390_sie_block {
__u32 fac; /* 0x01a0 */
__u8 reserved1a4[20]; /* 0x01a4 */
__u64 cbrlo; /* 0x01b8 */
- __u8 reserved1c0[40]; /* 0x01c0 */
+ __u8 reserved1c0[30]; /* 0x01c0 */
+ __u64 pp; /* 0x01de */
+ __u8 reserved1e6[2]; /* 0x01e6 */
__u64 itdba; /* 0x01e8 */
__u8 reserved1f0[16]; /* 0x01f0 */
} __attribute__((packed));
@@ -171,18 +183,6 @@ struct kvm_vcpu_stat {
u32 diagnose_9c;
};
-struct kvm_s390_io_info {
- __u16 subchannel_id; /* 0x0b8 */
- __u16 subchannel_nr; /* 0x0ba */
- __u32 io_int_parm; /* 0x0bc */
- __u32 io_int_word; /* 0x0c0 */
-};
-
-struct kvm_s390_ext_info {
- __u32 ext_params;
- __u64 ext_params2;
-};
-
#define PGM_OPERATION 0x01
#define PGM_PRIVILEGED_OP 0x02
#define PGM_EXECUTE 0x03
@@ -191,27 +191,6 @@ struct kvm_s390_ext_info {
#define PGM_SPECIFICATION 0x06
#define PGM_DATA 0x07
-struct kvm_s390_pgm_info {
- __u16 code;
-};
-
-struct kvm_s390_prefix_info {
- __u32 address;
-};
-
-struct kvm_s390_extcall_info {
- __u16 code;
-};
-
-struct kvm_s390_emerg_info {
- __u16 code;
-};
-
-struct kvm_s390_mchk_info {
- __u64 cr14;
- __u64 mcic;
-};
-
struct kvm_s390_interrupt_info {
struct list_head list;
u64 type;
@@ -246,9 +225,8 @@ struct kvm_s390_float_interrupt {
struct list_head list;
atomic_t active;
int next_rr_cpu;
- unsigned long idle_mask[(KVM_MAX_VCPUS + sizeof(long) - 1)
- / sizeof(long)];
- struct kvm_s390_local_interrupt *local_int[KVM_MAX_VCPUS];
+ unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int irq_count;
};
@@ -265,6 +243,10 @@ struct kvm_vcpu_arch {
u64 stidp_data;
};
struct gmap *gmap;
+#define KVM_S390_PFAULT_TOKEN_INVALID (-1UL)
+ unsigned long pfault_token;
+ unsigned long pfault_select;
+ unsigned long pfault_compare;
};
struct kvm_vm_stat {
@@ -274,12 +256,36 @@ struct kvm_vm_stat {
struct kvm_arch_memory_slot {
};
+struct s390_map_info {
+ struct list_head list;
+ __u64 guest_addr;
+ __u64 addr;
+ struct page *page;
+};
+
+struct s390_io_adapter {
+ unsigned int id;
+ int isc;
+ bool maskable;
+ bool masked;
+ bool swap;
+ struct rw_semaphore maps_lock;
+ struct list_head maps;
+ atomic_t nr_maps;
+};
+
+#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
+#define MAX_S390_ADAPTER_MAPS 256
+
struct kvm_arch{
struct sca_block *sca;
debug_info_t *dbf;
struct kvm_s390_float_interrupt float_int;
+ struct kvm_device *flic;
struct gmap *gmap;
int css_support;
+ int use_irqchip;
+ struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
};
#define KVM_HVA_ERR_BAD (-1UL)
@@ -290,6 +296,24 @@ static inline bool kvm_is_error_hva(unsigned long addr)
return IS_ERR_VALUE(addr);
}
+#define ASYNC_PF_PER_VCPU 64
+struct kvm_vcpu;
+struct kvm_async_pf;
+struct kvm_arch_async_pf {
+ unsigned long pfault_token;
+};
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+
extern int sie64a(struct kvm_s390_sie_block *, u64 *);
extern char sie_exit;
#endif
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1ab75eaacbd4..50a75d96f939 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -782,6 +782,7 @@ static inline void pgste_set_pte(pte_t *ptep, pte_t entry)
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @crst_list: list of all crst tables used in the guest address space
+ * @pfault_enabled: defines if pfaults are applicable for the guest
*/
struct gmap {
struct list_head list;
@@ -790,6 +791,7 @@ struct gmap {
unsigned long asce;
void *private;
struct list_head crst_list;
+ bool pfault_enabled;
};
/**
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 0a876bc543d3..dc5fc4f90e52 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -79,6 +79,7 @@ struct thread_struct {
unsigned long ksp; /* kernel stack pointer */
mm_segment_t mm_segment;
unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_pfault; /* signal of a pending guest pfault */
struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */
unsigned long per_flags; /* Flags to control debug behavior */
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index d25da598ec62..c003c6a73b1e 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -16,6 +16,44 @@
#define __KVM_S390
+/* Device control API: s390-specific devices */
+#define KVM_DEV_FLIC_GET_ALL_IRQS 1
+#define KVM_DEV_FLIC_ENQUEUE 2
+#define KVM_DEV_FLIC_CLEAR_IRQS 3
+#define KVM_DEV_FLIC_APF_ENABLE 4
+#define KVM_DEV_FLIC_APF_DISABLE_WAIT 5
+#define KVM_DEV_FLIC_ADAPTER_REGISTER 6
+#define KVM_DEV_FLIC_ADAPTER_MODIFY 7
+/*
+ * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
+ * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
+ * There are also sclp and machine checks. This gives us
+ * sizeof(kvm_s390_irq)*(4*65536+8+64*64+1+1) = 72 * 266250 = 19170000
+ * Lets round up to 8192 pages.
+ */
+#define KVM_S390_MAX_FLOAT_IRQS 266250
+#define KVM_S390_FLIC_MAX_BUFFER 0x2000000
+
+struct kvm_s390_io_adapter {
+ __u32 id;
+ __u8 isc;
+ __u8 maskable;
+ __u8 swap;
+ __u8 pad;
+};
+
+#define KVM_S390_IO_ADAPTER_MASK 1
+#define KVM_S390_IO_ADAPTER_MAP 2
+#define KVM_S390_IO_ADAPTER_UNMAP 3
+
+struct kvm_s390_io_adapter_req {
+ __u32 id;
+ __u8 type;
+ __u8 mask;
+ __u16 pad0;
+ __u64 addr;
+};
+
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
/* general purpose regs for s390 */
@@ -57,4 +95,9 @@ struct kvm_sync_regs {
#define KVM_REG_S390_EPOCHDIFF (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x2)
#define KVM_REG_S390_CPU_TIMER (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x3)
#define KVM_REG_S390_CLOCK_COMP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_S390_PFTOKEN (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_S390_PFCOMPARE (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_S390_PFSELECT (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_S390_PP (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_S390_GBEA (KVM_REG_S390 | KVM_REG_SIZE_U64 | 0x9)
#endif
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index a770be97db4d..d42b14cc72a4 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -85,6 +85,7 @@ static const struct irq_class irqclass_sub_desc[NR_ARCH_IRQS] = {
[IRQIO_PCI] = {.name = "PCI", .desc = "[I/O] PCI Interrupt" },
[IRQIO_MSI] = {.name = "MSI", .desc = "[I/O] MSI Interrupt" },
[IRQIO_VIR] = {.name = "VIR", .desc = "[I/O] Virtual I/O Devices"},
+ [IRQIO_VAI] = {.name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"},
[NMI_NMI] = {.name = "NMI", .desc = "[NMI] Machine Check"},
[CPU_RST] = {.name = "RST", .desc = "[CPU] CPU Restart"},
};
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 70b46eacf8e1..10d529ac9821 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -23,6 +23,10 @@ config KVM
select ANON_INODES
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_EVENTFD
+ select KVM_ASYNC_PF
+ select KVM_ASYNC_PF_SYNC
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQ_ROUTING
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 40b4c6470f88..d3adb37e93a4 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -7,7 +7,7 @@
# as published by the Free Software Foundation.
KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 6f9cfa500372..03a05ffb662f 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -18,6 +18,7 @@
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
+#include "gaccess.h"
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
@@ -47,6 +48,87 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
return 0;
}
+static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
+{
+ struct prs_parm {
+ u16 code;
+ u16 subcode;
+ u16 parm_len;
+ u16 parm_version;
+ u64 token_addr;
+ u64 select_mask;
+ u64 compare_mask;
+ u64 zarch;
+ };
+ struct prs_parm parm;
+ int rc;
+ u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4;
+ u16 ry = (vcpu->arch.sie_block->ipa & 0x0f);
+ unsigned long hva_token = KVM_HVA_ERR_BAD;
+
+ if (vcpu->run->s.regs.gprs[rx] & 7)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ if (copy_from_guest(vcpu, &parm, vcpu->run->s.regs.gprs[rx], sizeof(parm)))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ switch (parm.subcode) {
+ case 0: /* TOKEN */
+ if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) {
+ /*
+ * If the pagefault handshake is already activated,
+ * the token must not be changed. We have to return
+ * decimal 8 instead, as mandated in SC24-6084.
+ */
+ vcpu->run->s.regs.gprs[ry] = 8;
+ return 0;
+ }
+
+ if ((parm.compare_mask & parm.select_mask) != parm.compare_mask ||
+ parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ hva_token = gfn_to_hva(vcpu->kvm, gpa_to_gfn(parm.token_addr));
+ if (kvm_is_error_hva(hva_token))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ vcpu->arch.pfault_token = parm.token_addr;
+ vcpu->arch.pfault_select = parm.select_mask;
+ vcpu->arch.pfault_compare = parm.compare_mask;
+ vcpu->run->s.regs.gprs[ry] = 0;
+ rc = 0;
+ break;
+ case 1: /*
+ * CANCEL
+ * Specification allows to let already pending tokens survive
+ * the cancel, therefore to reduce code complexity, we assume
+ * all outstanding tokens are already pending.
+ */
+ if (parm.token_addr || parm.select_mask ||
+ parm.compare_mask || parm.zarch)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ vcpu->run->s.regs.gprs[ry] = 0;
+ /*
+ * If the pfault handling was not established or is already
+ * canceled SC24-6084 requests to return decimal 4.
+ */
+ if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+ vcpu->run->s.regs.gprs[ry] = 4;
+ else
+ vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+
+ rc = 0;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+
+ return rc;
+}
+
static int __diag_time_slice_end(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "%s", "diag time slice end");
@@ -153,6 +235,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
return __diag_time_slice_end(vcpu);
case 0x9c:
return __diag_time_slice_end_directed(vcpu);
+ case 0x258:
+ return __diag_page_ref_service(vcpu);
case 0x308:
return __diag_ipl_functions(vcpu);
case 0x500:
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5f79d2d79ca7..200a8f9390b6 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1,7 +1,7 @@
/*
* handling kvm guest interrupts
*
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2014
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -13,6 +13,7 @@
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
#include <linux/hrtimer.h>
+#include <linux/mmu_context.h>
#include <linux/signal.h>
#include <linux/slab.h>
#include <asm/asm-offsets.h>
@@ -31,7 +32,7 @@ static int is_ioint(u64 type)
return ((type & 0xfffe0000u) != 0xfffe0000u);
}
-static int psw_extint_disabled(struct kvm_vcpu *vcpu)
+int psw_extint_disabled(struct kvm_vcpu *vcpu)
{
return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
}
@@ -78,11 +79,8 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
return 1;
return 0;
case KVM_S390_INT_SERVICE:
- if (psw_extint_disabled(vcpu))
- return 0;
- if (vcpu->arch.sie_block->gcr[0] & 0x200ul)
- return 1;
- return 0;
+ case KVM_S390_INT_PFAULT_INIT:
+ case KVM_S390_INT_PFAULT_DONE:
case KVM_S390_INT_VIRTIO:
if (psw_extint_disabled(vcpu))
return 0;
@@ -117,14 +115,12 @@ static int __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
static void __set_cpu_idle(struct kvm_vcpu *vcpu)
{
- BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
{
- BUG_ON(vcpu->vcpu_id > KVM_MAX_VCPUS - 1);
atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
@@ -150,6 +146,8 @@ static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
case KVM_S390_INT_EXTERNAL_CALL:
case KVM_S390_INT_EMERGENCY:
case KVM_S390_INT_SERVICE:
+ case KVM_S390_INT_PFAULT_INIT:
+ case KVM_S390_INT_PFAULT_DONE:
case KVM_S390_INT_VIRTIO:
if (psw_extint_disabled(vcpu))
__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -223,6 +221,30 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
rc |= put_guest(vcpu, inti->ext.ext_params,
(u32 __user *)__LC_EXT_PARAMS);
break;
+ case KVM_S390_INT_PFAULT_INIT:
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+ inti->ext.ext_params2);
+ rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+ rc |= put_guest(vcpu, 0x0600, (u16 __user *) __LC_EXT_CPU_ADDR);
+ rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+ __LC_EXT_NEW_PSW, sizeof(psw_t));
+ rc |= put_guest(vcpu, inti->ext.ext_params2,
+ (u64 __user *) __LC_EXT_PARAMS2);
+ break;
+ case KVM_S390_INT_PFAULT_DONE:
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
+ inti->ext.ext_params2);
+ rc = put_guest(vcpu, 0x2603, (u16 __user *) __LC_EXT_INT_CODE);
+ rc |= put_guest(vcpu, 0x0680, (u16 __user *) __LC_EXT_CPU_ADDR);
+ rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+ &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+ rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+ __LC_EXT_NEW_PSW, sizeof(psw_t));
+ rc |= put_guest(vcpu, inti->ext.ext_params2,
+ (u64 __user *) __LC_EXT_PARAMS2);
+ break;
case KVM_S390_INT_VIRTIO:
VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
inti->ext.ext_params, inti->ext.ext_params2);
@@ -357,7 +379,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
return 1;
}
-static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -482,11 +504,26 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
struct kvm_vcpu *vcpu;
vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+ vcpu->preempted = true;
tasklet_schedule(&vcpu->arch.tasklet);
return HRTIMER_NORESTART;
}
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+ struct kvm_s390_interrupt_info *n, *inti = NULL;
+
+ spin_lock_bh(&li->lock);
+ list_for_each_entry_safe(inti, n, &li->list, list) {
+ list_del(&inti->list);
+ kfree(inti);
+ }
+ atomic_set(&li->active, 0);
+ spin_unlock_bh(&li->lock);
+}
+
void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -528,6 +565,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
list_for_each_entry_safe(inti, n, &fi->list, list) {
if (__interrupt_is_deliverable(vcpu, inti)) {
list_del(&inti->list);
+ fi->irq_count--;
deliver = 1;
break;
}
@@ -583,6 +621,7 @@ void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu)
if ((inti->type == KVM_S390_MCHK) &&
__interrupt_is_deliverable(vcpu, inti)) {
list_del(&inti->list);
+ fi->irq_count--;
deliver = 1;
break;
}
@@ -650,8 +689,10 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
inti = iter;
break;
}
- if (inti)
+ if (inti) {
list_del_init(&inti->list);
+ fi->irq_count--;
+ }
if (list_empty(&fi->list))
atomic_set(&fi->active, 0);
spin_unlock(&fi->lock);
@@ -659,53 +700,101 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
return inti;
}
-int kvm_s390_inject_vm(struct kvm *kvm,
- struct kvm_s390_interrupt *s390int)
+static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
{
struct kvm_s390_local_interrupt *li;
struct kvm_s390_float_interrupt *fi;
- struct kvm_s390_interrupt_info *inti, *iter;
+ struct kvm_s390_interrupt_info *iter;
+ struct kvm_vcpu *dst_vcpu = NULL;
int sigcpu;
+ int rc = 0;
+
+ mutex_lock(&kvm->lock);
+ fi = &kvm->arch.float_int;
+ spin_lock(&fi->lock);
+ if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
+ rc = -EINVAL;
+ goto unlock_fi;
+ }
+ fi->irq_count++;
+ if (!is_ioint(inti->type)) {
+ list_add_tail(&inti->list, &fi->list);
+ } else {
+ u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
+
+ /* Keep I/O interrupts sorted in isc order. */
+ list_for_each_entry(iter, &fi->list, list) {
+ if (!is_ioint(iter->type))
+ continue;
+ if (int_word_to_isc_bits(iter->io.io_int_word)
+ <= isc_bits)
+ continue;
+ break;
+ }
+ list_add_tail(&inti->list, &iter->list);
+ }
+ atomic_set(&fi->active, 1);
+ sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
+ if (sigcpu == KVM_MAX_VCPUS) {
+ do {
+ sigcpu = fi->next_rr_cpu++;
+ if (sigcpu == KVM_MAX_VCPUS)
+ sigcpu = fi->next_rr_cpu = 0;
+ } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
+ }
+ dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+ li = &dst_vcpu->arch.local_int;
+ spin_lock_bh(&li->lock);
+ atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+ if (waitqueue_active(li->wq))
+ wake_up_interruptible(li->wq);
+ kvm_get_vcpu(kvm, sigcpu)->preempted = true;
+ spin_unlock_bh(&li->lock);
+unlock_fi:
+ spin_unlock(&fi->lock);
+ mutex_unlock(&kvm->lock);
+ return rc;
+}
+
+int kvm_s390_inject_vm(struct kvm *kvm,
+ struct kvm_s390_interrupt *s390int)
+{
+ struct kvm_s390_interrupt_info *inti;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
if (!inti)
return -ENOMEM;
- switch (s390int->type) {
+ inti->type = s390int->type;
+ switch (inti->type) {
case KVM_S390_INT_VIRTIO:
VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
s390int->parm, s390int->parm64);
- inti->type = s390int->type;
inti->ext.ext_params = s390int->parm;
inti->ext.ext_params2 = s390int->parm64;
break;
case KVM_S390_INT_SERVICE:
VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
- inti->type = s390int->type;
inti->ext.ext_params = s390int->parm;
break;
- case KVM_S390_PROGRAM_INT:
- case KVM_S390_SIGP_STOP:
- case KVM_S390_INT_EXTERNAL_CALL:
- case KVM_S390_INT_EMERGENCY:
- kfree(inti);
- return -EINVAL;
+ case KVM_S390_INT_PFAULT_DONE:
+ inti->type = s390int->type;
+ inti->ext.ext_params2 = s390int->parm64;
+ break;
case KVM_S390_MCHK:
VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
s390int->parm64);
- inti->type = s390int->type;
inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
inti->mchk.mcic = s390int->parm64;
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- if (s390int->type & IOINT_AI_MASK)
+ if (inti->type & IOINT_AI_MASK)
VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
else
VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
s390int->type & IOINT_CSSID_MASK,
s390int->type & IOINT_SSID_MASK,
s390int->type & IOINT_SCHID_MASK);
- inti->type = s390int->type;
inti->io.subchannel_id = s390int->parm >> 16;
inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -718,43 +807,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
2);
- mutex_lock(&kvm->lock);
- fi = &kvm->arch.float_int;
- spin_lock(&fi->lock);
- if (!is_ioint(inti->type))
- list_add_tail(&inti->list, &fi->list);
- else {
- u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
-
- /* Keep I/O interrupts sorted in isc order. */
- list_for_each_entry(iter, &fi->list, list) {
- if (!is_ioint(iter->type))
- continue;
- if (int_word_to_isc_bits(iter->io.io_int_word)
- <= isc_bits)
- continue;
- break;
- }
- list_add_tail(&inti->list, &iter->list);
- }
- atomic_set(&fi->active, 1);
- sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
- if (sigcpu == KVM_MAX_VCPUS) {
- do {
- sigcpu = fi->next_rr_cpu++;
- if (sigcpu == KVM_MAX_VCPUS)
- sigcpu = fi->next_rr_cpu = 0;
- } while (fi->local_int[sigcpu] == NULL);
- }
- li = fi->local_int[sigcpu];
- spin_lock_bh(&li->lock);
- atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
- if (waitqueue_active(li->wq))
- wake_up_interruptible(li->wq);
- spin_unlock_bh(&li->lock);
- spin_unlock(&fi->lock);
- mutex_unlock(&kvm->lock);
- return 0;
+ return __inject_vm(kvm, inti);
}
int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -814,6 +867,10 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
inti->type = s390int->type;
inti->mchk.mcic = s390int->parm64;
break;
+ case KVM_S390_INT_PFAULT_INIT:
+ inti->type = s390int->type;
+ inti->ext.ext_params2 = s390int->parm64;
+ break;
case KVM_S390_INT_VIRTIO:
case KVM_S390_INT_SERVICE:
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -837,7 +894,528 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
+ vcpu->preempted = true;
spin_unlock_bh(&li->lock);
mutex_unlock(&vcpu->kvm->lock);
return 0;
}
+
+static void clear_floating_interrupts(struct kvm *kvm)
+{
+ struct kvm_s390_float_interrupt *fi;
+ struct kvm_s390_interrupt_info *n, *inti = NULL;
+
+ mutex_lock(&kvm->lock);
+ fi = &kvm->arch.float_int;
+ spin_lock(&fi->lock);
+ list_for_each_entry_safe(inti, n, &fi->list, list) {
+ list_del(&inti->list);
+ kfree(inti);
+ }
+ fi->irq_count = 0;
+ atomic_set(&fi->active, 0);
+ spin_unlock(&fi->lock);
+ mutex_unlock(&kvm->lock);
+}
+
+static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
+ u8 *addr)
+{
+ struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+ struct kvm_s390_irq irq = {0};
+
+ irq.type = inti->type;
+ switch (inti->type) {
+ case KVM_S390_INT_PFAULT_INIT:
+ case KVM_S390_INT_PFAULT_DONE:
+ case KVM_S390_INT_VIRTIO:
+ case KVM_S390_INT_SERVICE:
+ irq.u.ext = inti->ext;
+ break;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ irq.u.io = inti->io;
+ break;
+ case KVM_S390_MCHK:
+ irq.u.mchk = inti->mchk;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(uptr, &irq, sizeof(irq)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_all_floating_irqs(struct kvm *kvm, __u8 *buf, __u64 len)
+{
+ struct kvm_s390_interrupt_info *inti;
+ struct kvm_s390_float_interrupt *fi;
+ int ret = 0;
+ int n = 0;
+
+ mutex_lock(&kvm->lock);
+ fi = &kvm->arch.float_int;
+ spin_lock(&fi->lock);
+
+ list_for_each_entry(inti, &fi->list, list) {
+ if (len < sizeof(struct kvm_s390_irq)) {
+ /* signal userspace to try again */
+ ret = -ENOMEM;
+ break;
+ }
+ ret = copy_irq_to_user(inti, buf);
+ if (ret)
+ break;
+ buf += sizeof(struct kvm_s390_irq);
+ len -= sizeof(struct kvm_s390_irq);
+ n++;
+ }
+
+ spin_unlock(&fi->lock);
+ mutex_unlock(&kvm->lock);
+
+ return ret < 0 ? ret : n;
+}
+
+static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->group) {
+ case KVM_DEV_FLIC_GET_ALL_IRQS:
+ r = get_all_floating_irqs(dev->kvm, (u8 *) attr->addr,
+ attr->attr);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
+static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
+ u64 addr)
+{
+ struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
+ void *target = NULL;
+ void __user *source;
+ u64 size;
+
+ if (get_user(inti->type, (u64 __user *)addr))
+ return -EFAULT;
+
+ switch (inti->type) {
+ case KVM_S390_INT_PFAULT_INIT:
+ case KVM_S390_INT_PFAULT_DONE:
+ case KVM_S390_INT_VIRTIO:
+ case KVM_S390_INT_SERVICE:
+ target = (void *) &inti->ext;
+ source = &uptr->u.ext;
+ size = sizeof(inti->ext);
+ break;
+ case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+ target = (void *) &inti->io;
+ source = &uptr->u.io;
+ size = sizeof(inti->io);
+ break;
+ case KVM_S390_MCHK:
+ target = (void *) &inti->mchk;
+ source = &uptr->u.mchk;
+ size = sizeof(inti->mchk);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_from_user(target, source, size))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int enqueue_floating_irq(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_interrupt_info *inti = NULL;
+ int r = 0;
+ int len = attr->attr;
+
+ if (len % sizeof(struct kvm_s390_irq) != 0)
+ return -EINVAL;
+ else if (len > KVM_S390_FLIC_MAX_BUFFER)
+ return -EINVAL;
+
+ while (len >= sizeof(struct kvm_s390_irq)) {
+ inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+ if (!inti)
+ return -ENOMEM;
+
+ r = copy_irq_from_user(inti, attr->addr);
+ if (r) {
+ kfree(inti);
+ return r;
+ }
+ r = __inject_vm(dev->kvm, inti);
+ if (r) {
+ kfree(inti);
+ return r;
+ }
+ len -= sizeof(struct kvm_s390_irq);
+ attr->addr += sizeof(struct kvm_s390_irq);
+ }
+
+ return r;
+}
+
+static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id)
+{
+ if (id >= MAX_S390_IO_ADAPTERS)
+ return NULL;
+ return kvm->arch.adapters[id];
+}
+
+static int register_io_adapter(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct s390_io_adapter *adapter;
+ struct kvm_s390_io_adapter adapter_info;
+
+ if (copy_from_user(&adapter_info,
+ (void __user *)attr->addr, sizeof(adapter_info)))
+ return -EFAULT;
+
+ if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) ||
+ (dev->kvm->arch.adapters[adapter_info.id] != NULL))
+ return -EINVAL;
+
+ adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+ if (!adapter)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&adapter->maps);
+ init_rwsem(&adapter->maps_lock);
+ atomic_set(&adapter->nr_maps, 0);
+ adapter->id = adapter_info.id;
+ adapter->isc = adapter_info.isc;
+ adapter->maskable = adapter_info.maskable;
+ adapter->masked = false;
+ adapter->swap = adapter_info.swap;
+ dev->kvm->arch.adapters[adapter->id] = adapter;
+
+ return 0;
+}
+
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
+{
+ int ret;
+ struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+
+ if (!adapter || !adapter->maskable)
+ return -EINVAL;
+ ret = adapter->masked;
+ adapter->masked = masked;
+ return ret;
+}
+
+static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+ struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+ struct s390_map_info *map;
+ int ret;
+
+ if (!adapter || !addr)
+ return -EINVAL;
+
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ INIT_LIST_HEAD(&map->list);
+ map->guest_addr = addr;
+ map->addr = gmap_translate(addr, kvm->arch.gmap);
+ if (map->addr == -EFAULT) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret != 1);
+ down_write(&adapter->maps_lock);
+ if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
+ list_add_tail(&map->list, &adapter->maps);
+ ret = 0;
+ } else {
+ put_page(map->page);
+ ret = -EINVAL;
+ }
+ up_write(&adapter->maps_lock);
+out:
+ if (ret)
+ kfree(map);
+ return ret;
+}
+
+static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+ struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+ struct s390_map_info *map, *tmp;
+ int found = 0;
+
+ if (!adapter || !addr)
+ return -EINVAL;
+
+ down_write(&adapter->maps_lock);
+ list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
+ if (map->guest_addr == addr) {
+ found = 1;
+ atomic_dec(&adapter->nr_maps);
+ list_del(&map->list);
+ put_page(map->page);
+ kfree(map);
+ break;
+ }
+ }
+ up_write(&adapter->maps_lock);
+
+ return found ? 0 : -EINVAL;
+}
+
+void kvm_s390_destroy_adapters(struct kvm *kvm)
+{
+ int i;
+ struct s390_map_info *map, *tmp;
+
+ for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+ if (!kvm->arch.adapters[i])
+ continue;
+ list_for_each_entry_safe(map, tmp,
+ &kvm->arch.adapters[i]->maps, list) {
+ list_del(&map->list);
+ put_page(map->page);
+ kfree(map);
+ }
+ kfree(kvm->arch.adapters[i]);
+ }
+}
+
+static int modify_io_adapter(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_io_adapter_req req;
+ struct s390_io_adapter *adapter;
+ int ret;
+
+ if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
+ return -EFAULT;
+
+ adapter = get_io_adapter(dev->kvm, req.id);
+ if (!adapter)
+ return -EINVAL;
+ switch (req.type) {
+ case KVM_S390_IO_ADAPTER_MASK:
+ ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask);
+ if (ret > 0)
+ ret = 0;
+ break;
+ case KVM_S390_IO_ADAPTER_MAP:
+ ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
+ break;
+ case KVM_S390_IO_ADAPTER_UNMAP:
+ ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int r = 0;
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ switch (attr->group) {
+ case KVM_DEV_FLIC_ENQUEUE:
+ r = enqueue_floating_irq(dev, attr);
+ break;
+ case KVM_DEV_FLIC_CLEAR_IRQS:
+ r = 0;
+ clear_floating_interrupts(dev->kvm);
+ break;
+ case KVM_DEV_FLIC_APF_ENABLE:
+ dev->kvm->arch.gmap->pfault_enabled = 1;
+ break;
+ case KVM_DEV_FLIC_APF_DISABLE_WAIT:
+ dev->kvm->arch.gmap->pfault_enabled = 0;
+ /*
+ * Make sure no async faults are in transition when
+ * clearing the queues. So we don't need to worry
+ * about late coming workers.
+ */
+ synchronize_srcu(&dev->kvm->srcu);
+ kvm_for_each_vcpu(i, vcpu, dev->kvm)
+ kvm_clear_async_pf_completion_queue(vcpu);
+ break;
+ case KVM_DEV_FLIC_ADAPTER_REGISTER:
+ r = register_io_adapter(dev, attr);
+ break;
+ case KVM_DEV_FLIC_ADAPTER_MODIFY:
+ r = modify_io_adapter(dev, attr);
+ break;
+ default:
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
+static int flic_create(struct kvm_device *dev, u32 type)
+{
+ if (!dev)
+ return -EINVAL;
+ if (dev->kvm->arch.flic)
+ return -EINVAL;
+ dev->kvm->arch.flic = dev;
+ return 0;
+}
+
+static void flic_destroy(struct kvm_device *dev)
+{
+ dev->kvm->arch.flic = NULL;
+ kfree(dev);
+}
+
+/* s390 floating irq controller (flic) */
+struct kvm_device_ops kvm_flic_ops = {
+ .name = "kvm-flic",
+ .get_attr = flic_get_attr,
+ .set_attr = flic_set_attr,
+ .create = flic_create,
+ .destroy = flic_destroy,
+};
+
+static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
+{
+ unsigned long bit;
+
+ bit = bit_nr + (addr % PAGE_SIZE) * 8;
+
+ return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
+}
+
+static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
+ u64 addr)
+{
+ struct s390_map_info *map;
+
+ if (!adapter)
+ return NULL;
+
+ list_for_each_entry(map, &adapter->maps, list) {
+ if (map->guest_addr == addr)
+ return map;
+ }
+ return NULL;
+}
+
+static int adapter_indicators_set(struct kvm *kvm,
+ struct s390_io_adapter *adapter,
+ struct kvm_s390_adapter_int *adapter_int)
+{
+ unsigned long bit;
+ int summary_set, idx;
+ struct s390_map_info *info;
+ void *map;
+
+ info = get_map_info(adapter, adapter_int->ind_addr);
+ if (!info)
+ return -1;
+ map = page_address(info->page);
+ bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
+ set_bit(bit, map);
+ idx = srcu_read_lock(&kvm->srcu);
+ mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(info->page);
+ info = get_map_info(adapter, adapter_int->summary_addr);
+ if (!info) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -1;
+ }
+ map = page_address(info->page);
+ bit = get_ind_bit(info->addr, adapter_int->summary_offset,
+ adapter->swap);
+ summary_set = test_and_set_bit(bit, map);
+ mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
+ set_page_dirty_lock(info->page);
+ srcu_read_unlock(&kvm->srcu, idx);
+ return summary_set ? 0 : 1;
+}
+
+/*
+ * < 0 - not injected due to error
+ * = 0 - coalesced, summary indicator already active
+ * > 0 - injected interrupt
+ */
+static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ int ret;
+ struct s390_io_adapter *adapter;
+
+ /* We're only interested in the 0->1 transition. */
+ if (!level)
+ return 0;
+ adapter = get_io_adapter(kvm, e->adapter.adapter_id);
+ if (!adapter)
+ return -1;
+ down_read(&adapter->maps_lock);
+ ret = adapter_indicators_set(kvm, adapter, &e->adapter);
+ up_read(&adapter->maps_lock);
+ if ((ret > 0) && !adapter->masked) {
+ struct kvm_s390_interrupt s390int = {
+ .type = KVM_S390_INT_IO(1, 0, 0, 0),
+ .parm = 0,
+ .parm64 = (adapter->isc << 27) | 0x80000000,
+ };
+ ret = kvm_s390_inject_vm(kvm, &s390int);
+ if (ret == 0)
+ ret = 1;
+ }
+ return ret;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int ret;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_S390_ADAPTER:
+ e->set = set_adapter_int;
+ e->adapter.summary_addr = ue->u.adapter.summary_addr;
+ e->adapter.ind_addr = ue->u.adapter.ind_addr;
+ e->adapter.summary_offset = ue->u.adapter.summary_offset;
+ e->adapter.ind_offset = ue->u.adapter.ind_offset;
+ e->adapter.adapter_id = ue->u.adapter.adapter_id;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ return -EINVAL;
+}
diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h
new file mode 100644
index 000000000000..d98e4159643d
--- /dev/null
+++ b/arch/s390/kvm/irq.h
@@ -0,0 +1,22 @@
+/*
+ * s390 irqchip routines
+ *
+ * Copyright IBM Corp. 2014
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+ */
+#ifndef __KVM_IRQ_H
+#define __KVM_IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ return 1;
+}
+
+#endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 10b5db3c9bc4..b3ecb8f5b6ce 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -153,11 +153,14 @@ int kvm_dev_ioctl_check_extension(long ext)
#ifdef CONFIG_KVM_S390_UCONTROL
case KVM_CAP_S390_UCONTROL:
#endif
+ case KVM_CAP_ASYNC_PF:
case KVM_CAP_SYNC_REGS:
case KVM_CAP_ONE_REG:
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_S390_CSS_SUPPORT:
case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_ENABLE_CAP_VM:
r = 1;
break;
case KVM_CAP_NR_VCPUS:
@@ -186,6 +189,25 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_S390_IRQCHIP:
+ kvm->arch.use_irqchip = 1;
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -203,6 +225,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_s390_inject_vm(kvm, &s390int);
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ break;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
+ case KVM_CREATE_IRQCHIP: {
+ struct kvm_irq_routing_entry routing;
+
+ r = -EINVAL;
+ if (kvm->arch.use_irqchip) {
+ /* Set up dummy routing. */
+ memset(&routing, 0, sizeof(routing));
+ kvm_set_irq_routing(kvm, &routing, 0, 0);
+ r = 0;
+ }
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -214,6 +256,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
int rc;
char debug_name[16];
+ static unsigned long sca_offset;
rc = -EINVAL;
#ifdef CONFIG_KVM_S390_UCONTROL
@@ -235,6 +278,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
if (!kvm->arch.sca)
goto out_err;
+ spin_lock(&kvm_lock);
+ sca_offset = (sca_offset + 16) & 0x7f0;
+ kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
+ spin_unlock(&kvm_lock);
sprintf(debug_name, "kvm-%u", current->pid);
@@ -255,9 +302,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!kvm->arch.gmap)
goto out_nogmap;
kvm->arch.gmap->private = kvm;
+ kvm->arch.gmap->pfault_enabled = 0;
}
kvm->arch.css_support = 0;
+ kvm->arch.use_irqchip = 0;
return 0;
out_nogmap:
@@ -272,6 +321,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
+ kvm_clear_async_pf_completion_queue(vcpu);
if (!kvm_is_ucontrol(vcpu->kvm)) {
clear_bit(63 - vcpu->vcpu_id,
(unsigned long *) &vcpu->kvm->arch.sca->mcn);
@@ -320,11 +370,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
if (!kvm_is_ucontrol(kvm))
gmap_free(kvm->arch.gmap);
+ kvm_s390_destroy_adapters(kvm);
}
/* Section: vcpu related */
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+ kvm_clear_async_pf_completion_queue(vcpu);
if (kvm_is_ucontrol(vcpu->kvm)) {
vcpu->arch.gmap = gmap_alloc(current->mm);
if (!vcpu->arch.gmap)
@@ -385,7 +438,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
vcpu->arch.guest_fpregs.fpc = 0;
asm volatile("lfpc %0" : : "Q" (vcpu->arch.guest_fpregs.fpc));
vcpu->arch.sie_block->gbea = 1;
+ vcpu->arch.sie_block->pp = 0;
+ vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+ kvm_clear_async_pf_completion_queue(vcpu);
atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+ kvm_s390_clear_local_irqs(vcpu);
}
int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -466,11 +523,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
spin_lock_init(&vcpu->arch.local_int.lock);
INIT_LIST_HEAD(&vcpu->arch.local_int.list);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
- spin_lock(&kvm->arch.float_int.lock);
- kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
vcpu->arch.local_int.wq = &vcpu->wq;
vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
- spin_unlock(&kvm->arch.float_int.lock);
rc = kvm_vcpu_init(vcpu, kvm, id);
if (rc)
@@ -490,9 +544,7 @@ out:
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- /* kvm common code refers to this, but never calls it */
- BUG();
- return 0;
+ return kvm_cpu_has_interrupt(vcpu);
}
void s390_vcpu_block(struct kvm_vcpu *vcpu)
@@ -568,6 +620,26 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
r = put_user(vcpu->arch.sie_block->ckc,
(u64 __user *)reg->addr);
break;
+ case KVM_REG_S390_PFTOKEN:
+ r = put_user(vcpu->arch.pfault_token,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PFCOMPARE:
+ r = put_user(vcpu->arch.pfault_compare,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PFSELECT:
+ r = put_user(vcpu->arch.pfault_select,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PP:
+ r = put_user(vcpu->arch.sie_block->pp,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_GBEA:
+ r = put_user(vcpu->arch.sie_block->gbea,
+ (u64 __user *)reg->addr);
+ break;
default:
break;
}
@@ -597,6 +669,26 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
r = get_user(vcpu->arch.sie_block->ckc,
(u64 __user *)reg->addr);
break;
+ case KVM_REG_S390_PFTOKEN:
+ r = get_user(vcpu->arch.pfault_token,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PFCOMPARE:
+ r = get_user(vcpu->arch.pfault_compare,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PFSELECT:
+ r = get_user(vcpu->arch.pfault_select,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_PP:
+ r = get_user(vcpu->arch.sie_block->pp,
+ (u64 __user *)reg->addr);
+ break;
+ case KVM_REG_S390_GBEA:
+ r = get_user(vcpu->arch.sie_block->gbea,
+ (u64 __user *)reg->addr);
+ break;
default:
break;
}
@@ -715,10 +807,100 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
return 0;
}
+static long kvm_arch_fault_in_sync(struct kvm_vcpu *vcpu)
+{
+ long rc;
+ hva_t fault = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+ struct mm_struct *mm = current->mm;
+ down_read(&mm->mmap_sem);
+ rc = get_user_pages(current, mm, fault, 1, 1, 0, NULL, NULL);
+ up_read(&mm->mmap_sem);
+ return rc;
+}
+
+static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
+ unsigned long token)
+{
+ struct kvm_s390_interrupt inti;
+ inti.parm64 = token;
+
+ if (start_token) {
+ inti.type = KVM_S390_INT_PFAULT_INIT;
+ WARN_ON_ONCE(kvm_s390_inject_vcpu(vcpu, &inti));
+ } else {
+ inti.type = KVM_S390_INT_PFAULT_DONE;
+ WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti));
+ }
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token);
+ __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ trace_kvm_s390_pfault_done(vcpu, work->arch.pfault_token);
+ __kvm_inject_pfault_token(vcpu, false, work->arch.pfault_token);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ /* s390 will always inject the page directly */
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ /*
+ * s390 will always inject the page directly,
+ * but we still want check_async_completion to cleanup
+ */
+ return true;
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+{
+ hva_t hva;
+ struct kvm_arch_async_pf arch;
+ int rc;
+
+ if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID)
+ return 0;
+ if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) !=
+ vcpu->arch.pfault_compare)
+ return 0;
+ if (psw_extint_disabled(vcpu))
+ return 0;
+ if (kvm_cpu_has_interrupt(vcpu))
+ return 0;
+ if (!(vcpu->arch.sie_block->gcr[0] & 0x200ul))
+ return 0;
+ if (!vcpu->arch.gmap->pfault_enabled)
+ return 0;
+
+ hva = gmap_fault(current->thread.gmap_addr, vcpu->arch.gmap);
+ if (copy_from_guest(vcpu, &arch.pfault_token, vcpu->arch.pfault_token, 8))
+ return 0;
+
+ rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
+ return rc;
+}
+
static int vcpu_pre_run(struct kvm_vcpu *vcpu)
{
int rc, cpuflags;
+ /*
+ * On s390 notifications for arriving pages will be delivered directly
+ * to the guest but the house keeping for completed pfaults is
+ * handled outside the worker.
+ */
+ kvm_check_async_pf_completion(vcpu);
+
memcpy(&vcpu->arch.sie_block->gg14, &vcpu->run->s.regs.gprs[14], 16);
if (need_resched())
@@ -744,7 +926,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
{
- int rc;
+ int rc = -1;
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
@@ -758,7 +940,16 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
current->thread.gmap_addr;
vcpu->run->s390_ucontrol.pgm_code = 0x10;
rc = -EREMOTE;
- } else {
+
+ } else if (current->thread.gmap_pfault) {
+ trace_kvm_s390_major_guest_pfault(vcpu);
+ current->thread.gmap_pfault = 0;
+ if (kvm_arch_setup_async_pf(vcpu) ||
+ (kvm_arch_fault_in_sync(vcpu) >= 0))
+ rc = 0;
+ }
+
+ if (rc == -1) {
VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
trace_kvm_s390_sie_fault(vcpu);
rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -768,7 +959,8 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
if (rc == 0) {
if (kvm_is_ucontrol(vcpu->kvm))
- rc = -EOPNOTSUPP;
+ /* Don't exit for host interrupts. */
+ rc = vcpu->arch.sie_block->icptcode ? -EOPNOTSUPP : 0;
else
rc = kvm_handle_sie_intercept(vcpu);
}
@@ -831,8 +1023,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
- BUG_ON(vcpu->kvm->arch.float_int.local_int[vcpu->vcpu_id] == NULL);
-
switch (kvm_run->exit_reason) {
case KVM_EXIT_S390_SIEIC:
case KVM_EXIT_UNKNOWN:
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 564514f410f4..3c1e2274d9ea 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -129,6 +129,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
void kvm_s390_tasklet(unsigned long parm);
void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
+void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu);
int __must_check kvm_s390_inject_vm(struct kvm *kvm,
struct kvm_s390_interrupt *s390int);
int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
@@ -136,6 +137,7 @@ int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
u64 cr6, u64 schid);
+int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
/* implemented in priv.c */
int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
@@ -161,4 +163,9 @@ bool kvm_enabled_cmma(void);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
+/* implemented in interrupt.c */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int psw_extint_disabled(struct kvm_vcpu *vcpu);
+void kvm_s390_destroy_adapters(struct kvm *kvm);
+
#endif
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index aacb6b129914..476e9e218f43 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -396,15 +396,10 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
int cpus = 0;
int n;
- spin_lock(&fi->lock);
- for (n = 0; n < KVM_MAX_VCPUS; n++)
- if (fi->local_int[n])
- cpus++;
- spin_unlock(&fi->lock);
+ cpus = atomic_read(&vcpu->kvm->online_vcpus);
/* deal with other level 3 hypervisors */
if (stsi(mem, 3, 2, 2))
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 87c2b3a3bd3e..26caeb530a78 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -23,29 +23,30 @@
static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
u64 *reg)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+ struct kvm_s390_local_interrupt *li;
+ struct kvm_vcpu *dst_vcpu = NULL;
+ int cpuflags;
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
return SIGP_CC_NOT_OPERATIONAL;
- spin_lock(&fi->lock);
- if (fi->local_int[cpu_addr] == NULL)
- rc = SIGP_CC_NOT_OPERATIONAL;
- else if (!(atomic_read(fi->local_int[cpu_addr]->cpuflags)
- & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
+ return SIGP_CC_NOT_OPERATIONAL;
+ li = &dst_vcpu->arch.local_int;
+
+ cpuflags = atomic_read(li->cpuflags);
+ if (!(cpuflags & (CPUSTAT_ECALL_PEND | CPUSTAT_STOPPED)))
rc = SIGP_CC_ORDER_CODE_ACCEPTED;
else {
*reg &= 0xffffffff00000000UL;
- if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
- & CPUSTAT_ECALL_PEND)
+ if (cpuflags & CPUSTAT_ECALL_PEND)
*reg |= SIGP_STATUS_EXT_CALL_PENDING;
- if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
- & CPUSTAT_STOPPED)
+ if (cpuflags & CPUSTAT_STOPPED)
*reg |= SIGP_STATUS_STOPPED;
rc = SIGP_CC_STATUS_STORED;
}
- spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
return rc;
@@ -53,12 +54,13 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
struct kvm_s390_interrupt_info *inti;
- int rc;
+ struct kvm_vcpu *dst_vcpu = NULL;
- if (cpu_addr >= KVM_MAX_VCPUS)
+ if (cpu_addr < KVM_MAX_VCPUS)
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
return SIGP_CC_NOT_OPERATIONAL;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -68,13 +70,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
inti->type = KVM_S390_INT_EMERGENCY;
inti->emerg.code = vcpu->vcpu_id;
- spin_lock(&fi->lock);
- li = fi->local_int[cpu_addr];
- if (li == NULL) {
- rc = SIGP_CC_NOT_OPERATIONAL;
- kfree(inti);
- goto unlock;
- }
+ li = &dst_vcpu->arch.local_int;
spin_lock_bh(&li->lock);
list_add_tail(&inti->list, &li->list);
atomic_set(&li->active, 1);
@@ -82,11 +78,9 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
if (waitqueue_active(li->wq))
wake_up_interruptible(li->wq);
spin_unlock_bh(&li->lock);
- rc = SIGP_CC_ORDER_CODE_ACCEPTED;
VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
-unlock:
- spin_unlock(&fi->lock);
- return rc;
+
+ return SIGP_CC_ORDER_CODE_ACCEPTED;
}
static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
@@ -122,12 +116,13 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr,
static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
struct kvm_s390_interrupt_info *inti;
- int rc;
+ struct kvm_vcpu *dst_vcpu = NULL;
- if (cpu_addr >= KVM_MAX_VCPUS)
+ if (cpu_addr < KVM_MAX_VCPUS)
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
return SIGP_CC_NOT_OPERATIONAL;
inti = kzalloc(sizeof(*inti), GFP_KERNEL);
@@ -137,13 +132,7 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
inti->type = KVM_S390_INT_EXTERNAL_CALL;
inti->extcall.code = vcpu->vcpu_id;
- spin_lock(&fi->lock);
- li = fi->local_int[cpu_addr];
- if (li == NULL) {
- rc = SIGP_CC_NOT_OPERATIONAL;
- kfree(inti);
- goto unlock;
- }
+ li = &dst_vcpu->arch.local_int;
spin_lock_bh(&li->lock);
list_add_tail(&inti->list, &li->list);
atomic_set(&li->active, 1);
@@ -151,11 +140,9 @@ static int __sigp_external_call(struct kvm_vcpu *vcpu, u16 cpu_addr)
if (waitqueue_active(li->wq))
wake_up_interruptible(li->wq);
spin_unlock_bh(&li->lock);
- rc = SIGP_CC_ORDER_CODE_ACCEPTED;
VCPU_EVENT(vcpu, 4, "sent sigp ext call to cpu %x", cpu_addr);
-unlock:
- spin_unlock(&fi->lock);
- return rc;
+
+ return SIGP_CC_ORDER_CODE_ACCEPTED;
}
static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
@@ -189,31 +176,26 @@ out:
static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
+ struct kvm_vcpu *dst_vcpu = NULL;
int rc;
if (cpu_addr >= KVM_MAX_VCPUS)
return SIGP_CC_NOT_OPERATIONAL;
- spin_lock(&fi->lock);
- li = fi->local_int[cpu_addr];
- if (li == NULL) {
- rc = SIGP_CC_NOT_OPERATIONAL;
- goto unlock;
- }
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
+ return SIGP_CC_NOT_OPERATIONAL;
+ li = &dst_vcpu->arch.local_int;
rc = __inject_sigp_stop(li, action);
-unlock:
- spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
if ((action & ACTION_STORE_ON_STOP) != 0 && rc == -ESHUTDOWN) {
/* If the CPU has already been stopped, we still have
* to save the status when doing stop-and-store. This
* has to be done after unlocking all spinlocks. */
- struct kvm_vcpu *dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
rc = kvm_s390_store_status_unloaded(dst_vcpu,
KVM_S390_STORE_STATUS_NOADDR);
}
@@ -224,6 +206,8 @@ unlock:
static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
{
int rc;
+ unsigned int i;
+ struct kvm_vcpu *v;
switch (parameter & 0xff) {
case 0:
@@ -231,6 +215,11 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
break;
case 1:
case 2:
+ kvm_for_each_vcpu(i, v, vcpu->kvm) {
+ v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID;
+ kvm_clear_async_pf_completion_queue(v);
+ }
+
rc = SIGP_CC_ORDER_CODE_ACCEPTED;
break;
default:
@@ -242,12 +231,18 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
u64 *reg)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
- struct kvm_s390_local_interrupt *li = NULL;
+ struct kvm_s390_local_interrupt *li;
+ struct kvm_vcpu *dst_vcpu = NULL;
struct kvm_s390_interrupt_info *inti;
int rc;
u8 tmp;
+ if (cpu_addr < KVM_MAX_VCPUS)
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
+ return SIGP_CC_NOT_OPERATIONAL;
+ li = &dst_vcpu->arch.local_int;
+
/* make sure that the new value is valid memory */
address = address & 0x7fffe000u;
if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
@@ -261,18 +256,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
if (!inti)
return SIGP_CC_BUSY;
- spin_lock(&fi->lock);
- if (cpu_addr < KVM_MAX_VCPUS)
- li = fi->local_int[cpu_addr];
-
- if (li == NULL) {
- *reg &= 0xffffffff00000000UL;
- *reg |= SIGP_STATUS_INCORRECT_STATE;
- rc = SIGP_CC_STATUS_STORED;
- kfree(inti);
- goto out_fi;
- }
-
spin_lock_bh(&li->lock);
/* cpu must be in stopped state */
if (!(atomic_read(li->cpuflags) & CPUSTAT_STOPPED)) {
@@ -295,8 +278,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", cpu_addr, address);
out_li:
spin_unlock_bh(&li->lock);
-out_fi:
- spin_unlock(&fi->lock);
return rc;
}
@@ -334,28 +315,26 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu, u16 cpu_id,
static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
u64 *reg)
{
+ struct kvm_s390_local_interrupt *li;
+ struct kvm_vcpu *dst_vcpu = NULL;
int rc;
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
if (cpu_addr >= KVM_MAX_VCPUS)
return SIGP_CC_NOT_OPERATIONAL;
- spin_lock(&fi->lock);
- if (fi->local_int[cpu_addr] == NULL)
- rc = SIGP_CC_NOT_OPERATIONAL;
- else {
- if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
- & CPUSTAT_RUNNING) {
- /* running */
- rc = SIGP_CC_ORDER_CODE_ACCEPTED;
- } else {
- /* not running */
- *reg &= 0xffffffff00000000UL;
- *reg |= SIGP_STATUS_NOT_RUNNING;
- rc = SIGP_CC_STATUS_STORED;
- }
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
+ return SIGP_CC_NOT_OPERATIONAL;
+ li = &dst_vcpu->arch.local_int;
+ if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+ /* running */
+ rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+ } else {
+ /* not running */
+ *reg &= 0xffffffff00000000UL;
+ *reg |= SIGP_STATUS_NOT_RUNNING;
+ rc = SIGP_CC_STATUS_STORED;
}
- spin_unlock(&fi->lock);
VCPU_EVENT(vcpu, 4, "sensed running status of cpu %x rc %x", cpu_addr,
rc);
@@ -366,26 +345,22 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu, u16 cpu_addr,
/* Test whether the destination CPU is available and not busy */
static int sigp_check_callable(struct kvm_vcpu *vcpu, u16 cpu_addr)
{
- struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
struct kvm_s390_local_interrupt *li;
int rc = SIGP_CC_ORDER_CODE_ACCEPTED;
+ struct kvm_vcpu *dst_vcpu = NULL;
if (cpu_addr >= KVM_MAX_VCPUS)
return SIGP_CC_NOT_OPERATIONAL;
- spin_lock(&fi->lock);
- li = fi->local_int[cpu_addr];
- if (li == NULL) {
- rc = SIGP_CC_NOT_OPERATIONAL;
- goto out;
- }
-
+ dst_vcpu = kvm_get_vcpu(vcpu->kvm, cpu_addr);
+ if (!dst_vcpu)
+ return SIGP_CC_NOT_OPERATIONAL;
+ li = &dst_vcpu->arch.local_int;
spin_lock_bh(&li->lock);
if (li->action_bits & ACTION_STOP_ON_STOP)
rc = SIGP_CC_BUSY;
spin_unlock_bh(&li->lock);
-out:
- spin_unlock(&fi->lock);
+
return rc;
}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 3db76b2daed7..e8e7213d4cc5 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -30,6 +30,52 @@
TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \
__entry->pswmask, __entry->pswaddr, p_args)
+TRACE_EVENT(kvm_s390_major_guest_pfault,
+ TP_PROTO(VCPU_PROTO_COMMON),
+ TP_ARGS(VCPU_ARGS_COMMON),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ ),
+ VCPU_TP_PRINTK("%s", "major fault, maybe applicable for pfault")
+ );
+
+TRACE_EVENT(kvm_s390_pfault_init,
+ TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+ TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(long, pfault_token)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->pfault_token = pfault_token;
+ ),
+ VCPU_TP_PRINTK("init pfault token %ld", __entry->pfault_token)
+ );
+
+TRACE_EVENT(kvm_s390_pfault_done,
+ TP_PROTO(VCPU_PROTO_COMMON, long pfault_token),
+ TP_ARGS(VCPU_ARGS_COMMON, pfault_token),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(long, pfault_token)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->pfault_token = pfault_token;
+ ),
+ VCPU_TP_PRINTK("done pfault token %ld", __entry->pfault_token)
+ );
+
/*
* Tracepoints for SIE entry and exit.
*/
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index d95265b2719f..88cef505453b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -50,6 +50,7 @@
#define VM_FAULT_BADMAP 0x020000
#define VM_FAULT_BADACCESS 0x040000
#define VM_FAULT_SIGNAL 0x080000
+#define VM_FAULT_PFAULT 0x100000
static unsigned long store_indication __read_mostly;
@@ -227,6 +228,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
return;
}
case VM_FAULT_BADCONTEXT:
+ case VM_FAULT_PFAULT:
do_no_context(regs);
break;
case VM_FAULT_SIGNAL:
@@ -264,6 +266,9 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault)
*/
static inline int do_exception(struct pt_regs *regs, int access)
{
+#ifdef CONFIG_PGSTE
+ struct gmap *gmap;
+#endif
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
@@ -304,9 +309,10 @@ static inline int do_exception(struct pt_regs *regs, int access)
down_read(&mm->mmap_sem);
#ifdef CONFIG_PGSTE
- if ((current->flags & PF_VCPU) && S390_lowcore.gmap) {
- address = __gmap_fault(address,
- (struct gmap *) S390_lowcore.gmap);
+ gmap = (struct gmap *)
+ ((current->flags & PF_VCPU) ? S390_lowcore.gmap : 0);
+ if (gmap) {
+ address = __gmap_fault(address, gmap);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
goto out_up;
@@ -315,6 +321,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
fault = VM_FAULT_OOM;
goto out_up;
}
+ if (gmap->pfault_enabled)
+ flags |= FAULT_FLAG_RETRY_NOWAIT;
}
#endif
@@ -371,9 +379,19 @@ retry:
regs, address);
}
if (fault & VM_FAULT_RETRY) {
+#ifdef CONFIG_PGSTE
+ if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* FAULT_FLAG_RETRY_NOWAIT has been set,
+ * mmap_sem has not been released */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
+ }
+#endif
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_RETRY_NOWAIT);
flags |= FAULT_FLAG_TRIED;
down_read(&mm->mmap_sem);
goto retry;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fdf83afbb7d9..fcaf9c961265 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -337,6 +337,11 @@ struct kvm_pmu {
u64 reprogram_pmi;
};
+enum {
+ KVM_DEBUGREG_BP_ENABLED = 1,
+ KVM_DEBUGREG_WONT_EXIT = 2,
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -444,7 +449,6 @@ struct kvm_vcpu_arch {
} st;
u64 last_guest_tsc;
- u64 last_kernel_ns;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
u64 this_tsc_nsec;
@@ -464,7 +468,7 @@ struct kvm_vcpu_arch {
struct mtrr_state_type mtrr_state;
u32 pat;
- int switch_db_regs;
+ unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
unsigned long dr6;
unsigned long dr7;
@@ -599,6 +603,8 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
cycle_t master_cycle_now;
+ struct delayed_work kvmclock_update_work;
+ struct delayed_work kvmclock_sync_work;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -702,6 +708,7 @@ struct kvm_x86_ops {
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
u64 (*get_dr6)(struct kvm_vcpu *vcpu);
void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
@@ -728,8 +735,8 @@ struct kvm_x86_ops {
int (*nmi_allowed)(struct kvm_vcpu *vcpu);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
- int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
- int (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
int (*vm_has_apicv)(struct kvm *kvm);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
@@ -765,6 +772,9 @@ struct kvm_x86_ops {
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+ bool (*mpx_supported)(void);
+
+ int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
};
struct kvm_arch_async_pf {
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2067264fb7f5..7004d21e6219 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -85,6 +85,7 @@
#define VM_EXIT_SAVE_IA32_EFER 0x00100000
#define VM_EXIT_LOAD_IA32_EFER 0x00200000
#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_EXIT_CLEAR_BNDCFGS 0x00800000
#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
@@ -95,6 +96,7 @@
#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+#define VM_ENTRY_LOAD_BNDCFGS 0x00010000
#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
@@ -174,6 +176,8 @@ enum vmcs_field {
GUEST_PDPTR2_HIGH = 0x0000280f,
GUEST_PDPTR3 = 0x00002810,
GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_BNDCFGS = 0x00002812,
+ GUEST_BNDCFGS_HIGH = 0x00002813,
HOST_IA32_PAT = 0x00002c00,
HOST_IA32_PAT_HIGH = 0x00002c01,
HOST_IA32_EFER = 0x00002c02,
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 6c1d7411eb00..d949ef28c48b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,8 @@
#define XSTATE_Hi16_ZMM 0x80
#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
#define FXSAVE_SIZE 512
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 4924f4be2b99..c827ace3121b 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -295,6 +295,7 @@
#define MSR_SMI_COUNT 0x00000034
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
#define MSR_IA32_TSC_ADJUST 0x0000003b
+#define MSR_IA32_BNDCFGS 0x00000d90
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 713f1b3bad52..0331cb389d68 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -417,7 +417,6 @@ void kvm_disable_steal_time(void)
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
- WARN_ON(kvm_register_clock("primary cpu clock"));
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e6041094ff26..d9156ceecdff 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -242,7 +242,7 @@ void __init kvmclock_init(void)
hv_clock = __va(mem);
memset(hv_clock, 0, size);
- if (kvm_register_clock("boot clock")) {
+ if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
memblock_free(mem, size);
return;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e5503d8aec1d..bea60671ef8a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -28,7 +28,7 @@ static u32 xstate_required_size(u64 xstate_bv)
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
- xstate_bv &= ~XSTATE_FPSSE;
+ xstate_bv &= XSTATE_EXTEND_MASK;
while (xstate_bv) {
if (xstate_bv & 0x1) {
u32 eax, ebx, ecx, edx;
@@ -43,6 +43,16 @@ static u32 xstate_required_size(u64 xstate_bv)
return ret;
}
+u64 kvm_supported_xcr0(void)
+{
+ u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+ if (!kvm_x86_ops->mpx_supported())
+ xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR);
+
+ return xcr0;
+}
+
void kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -73,9 +83,9 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
} else {
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) &
- host_xcr0 & KVM_SUPPORTED_XCR0;
- vcpu->arch.guest_xstate_size =
- xstate_required_size(vcpu->arch.guest_supported_xcr0);
+ kvm_supported_xcr0();
+ vcpu->arch.guest_xstate_size = best->ebx =
+ xstate_required_size(vcpu->arch.xcr0);
}
kvm_pmu_cpuid_update(vcpu);
@@ -210,13 +220,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
-static bool supported_xcr0_bit(unsigned bit)
-{
- u64 mask = ((u64)1 << bit);
-
- return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
-}
-
#define F(x) bit(X86_FEATURE_##x)
static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
@@ -256,6 +259,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
#endif
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+ unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -303,7 +307,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ebx */
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
+ F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+ F(ADX);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -436,16 +441,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
case 0xd: {
int idx, i;
+ u64 supported = kvm_supported_xcr0();
- entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
- entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
+ entry->eax &= supported;
+ entry->edx &= supported >> 32;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
for (idx = 1, i = 1; idx < 64; ++idx) {
+ u64 mask = ((u64)1 << idx);
if (*nent >= maxnent)
goto out;
do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+ if (entry[i].eax == 0 || !(supported & mask))
continue;
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 07ffca0a89e9..205b17eed93c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3668,6 +3668,10 @@ static const struct gprefix pfx_vmovntpx = {
I(0, em_mov), N, N, N,
};
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
static const struct escape escape_d9 = { {
N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
}, {
@@ -3870,7 +3874,9 @@ static const struct opcode twobyte_table[256] = {
IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
N, N, N, N,
- N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
N, N, N, N,
/* 0x30 - 0x3F */
II(ImplicitOps | Priv, em_wrmsr, wrmsr),
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b531351a587..f5704d9e5ddc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3329,7 +3329,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+ return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cba218a2f08d..b1e6c1bf68d3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -913,7 +913,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
* and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
* used by guest then tlbs are not flushed, so guest is allowed to access the
* freed pages.
- * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ * We set tlbs_dirty to let the notifier know this change and delay the flush
+ * until such a case actually happens.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -942,7 +943,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- vcpu->kvm->tlbs_dirty++;
+ vcpu->kvm->tlbs_dirty = true;
continue;
}
@@ -957,7 +958,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
- vcpu->kvm->tlbs_dirty++;
+ vcpu->kvm->tlbs_dirty = true;
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2de1bc09a8d4..7f4f9c2badae 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,7 @@
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/virtext.h>
@@ -303,20 +304,35 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
return vmcb->control.intercept_cr & (1U << bit);
}
-static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr |= (1U << bit);
+ vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+ | (1 << INTERCEPT_DR1_READ)
+ | (1 << INTERCEPT_DR2_READ)
+ | (1 << INTERCEPT_DR3_READ)
+ | (1 << INTERCEPT_DR4_READ)
+ | (1 << INTERCEPT_DR5_READ)
+ | (1 << INTERCEPT_DR6_READ)
+ | (1 << INTERCEPT_DR7_READ)
+ | (1 << INTERCEPT_DR0_WRITE)
+ | (1 << INTERCEPT_DR1_WRITE)
+ | (1 << INTERCEPT_DR2_WRITE)
+ | (1 << INTERCEPT_DR3_WRITE)
+ | (1 << INTERCEPT_DR4_WRITE)
+ | (1 << INTERCEPT_DR5_WRITE)
+ | (1 << INTERCEPT_DR6_WRITE)
+ | (1 << INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
}
-static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
- vmcb->control.intercept_dr &= ~(1U << bit);
+ vmcb->control.intercept_dr = 0;
recalc_intercepts(svm);
}
@@ -1080,23 +1096,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR0_READ);
- set_dr_intercept(svm, INTERCEPT_DR1_READ);
- set_dr_intercept(svm, INTERCEPT_DR2_READ);
- set_dr_intercept(svm, INTERCEPT_DR3_READ);
- set_dr_intercept(svm, INTERCEPT_DR4_READ);
- set_dr_intercept(svm, INTERCEPT_DR5_READ);
- set_dr_intercept(svm, INTERCEPT_DR6_READ);
- set_dr_intercept(svm, INTERCEPT_DR7_READ);
-
- set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
- set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+ set_dr_intercepts(svm);
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
@@ -1684,6 +1684,21 @@ static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
mark_dirty(svm->vmcb, VMCB_DR);
}
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ vcpu->arch.dr6 = svm_get_dr6(vcpu);
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2842,6 +2857,7 @@ static int iret_interception(struct vcpu_svm *svm)
clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
return 1;
}
@@ -2974,6 +2990,17 @@ static int dr_interception(struct vcpu_svm *svm)
unsigned long val;
int err;
+ if (svm->vcpu.guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
return emulate_on_interception(svm);
@@ -3649,7 +3676,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
return ret;
}
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3663,16 +3690,15 @@ static int enable_irq_window(struct kvm_vcpu *vcpu)
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
- return 0;
}
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
== HF_NMI_MASK)
- return 0; /* IRET will cause a vm exit */
+ return; /* IRET will cause a vm exit */
/*
* Something prevents NMI from been injected. Single step over possible
@@ -3681,7 +3707,6 @@ static int enable_nmi_window(struct kvm_vcpu *vcpu)
svm->nmi_singlestep = true;
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
update_db_bp_intercept(vcpu);
- return 0;
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4064,6 +4089,11 @@ static bool svm_invpcid_supported(void)
return false;
}
+static bool svm_mpx_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -4302,6 +4332,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_dr6 = svm_get_dr6,
.set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
@@ -4345,6 +4376,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.invpcid_supported = svm_invpcid_supported,
+ .mpx_supported = svm_mpx_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 392752834751..1320e0f8e611 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
+#include <linux/hrtimer.h>
#include "kvm_cache_regs.h"
#include "x86.h"
@@ -42,6 +43,7 @@
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
+#include <asm/debugreg.h>
#include <asm/kexec.h>
#include "trace.h"
@@ -110,6 +112,8 @@ module_param(nested, bool, S_IRUGO);
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -202,6 +206,7 @@ struct __packed vmcs12 {
u64 guest_pdptr1;
u64 guest_pdptr2;
u64 guest_pdptr3;
+ u64 guest_bndcfgs;
u64 host_ia32_pat;
u64 host_ia32_efer;
u64 host_ia32_perf_global_ctrl;
@@ -374,6 +379,9 @@ struct nested_vmx {
*/
struct page *apic_access_page;
u64 msr_ia32_feature_control;
+
+ struct hrtimer preemption_timer;
+ bool preemption_timer_expired;
};
#define POSTED_INTR_ON 0
@@ -441,6 +449,7 @@ struct vcpu_vmx {
#endif
int gs_ldt_reload_needed;
int fs_reload_needed;
+ u64 msr_host_bndcfgs;
} host_state;
struct {
int vm86_active;
@@ -533,6 +542,7 @@ static const unsigned long shadow_read_write_fields[] = {
GUEST_CS_LIMIT,
GUEST_CS_BASE,
GUEST_ES_BASE,
+ GUEST_BNDCFGS,
CR0_GUEST_HOST_MASK,
CR0_READ_SHADOW,
CR4_READ_SHADOW,
@@ -588,6 +598,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(GUEST_PDPTR1, guest_pdptr1),
FIELD64(GUEST_PDPTR2, guest_pdptr2),
FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
FIELD64(HOST_IA32_PAT, host_ia32_pat),
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
@@ -718,6 +729,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
+static bool vmx_mpx_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -728,6 +740,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_mpx_supported(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1047,6 +1060,12 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -1710,6 +1729,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
+ if (boot_cpu_has(X86_FEATURE_MPX))
+ rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
@@ -1747,6 +1768,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
+ if (vmx->host_state.msr_host_bndcfgs)
+ wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
/*
* If the FPU is not active (through the host task or
* the guest vcpu), then restore the cr0.TS bit.
@@ -2248,9 +2271,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
*/
nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
- PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+ PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
+ nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
/*
* Exit controls
@@ -2265,15 +2288,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
- VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
- if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
- !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
- nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
- nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
- }
- nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
- VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
+ if (vmx_mpx_supported())
+ nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2287,6 +2307,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
VM_ENTRY_LOAD_IA32_PAT;
nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
VM_ENTRY_LOAD_IA32_EFER);
+ if (vmx_mpx_supported())
+ nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2342,9 +2364,9 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
- nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
- VMX_MISC_SAVE_EFER_LMA;
- nested_vmx_misc_low |= VMX_MISC_ACTIVITY_HLT;
+ nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ VMX_MISC_ACTIVITY_HLT;
nested_vmx_misc_high = 0;
}
@@ -2479,6 +2501,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_IA32_SYSENTER_ESP:
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
+ case MSR_IA32_BNDCFGS:
+ if (!vmx_mpx_supported())
+ return 1;
+ data = vmcs_read64(GUEST_BNDCFGS);
+ break;
case MSR_IA32_FEATURE_CONTROL:
if (!nested_vmx_allowed(vcpu))
return 1;
@@ -2547,6 +2574,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
+ case MSR_IA32_BNDCFGS:
+ if (!vmx_mpx_supported())
+ return 1;
+ vmcs_write64(GUEST_BNDCFGS, data);
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
@@ -2832,12 +2864,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmx_capability.ept, vmx_capability.vpid);
}
- min = 0;
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_ACK_INTR_ON_EXIT;
+ VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
@@ -2853,8 +2885,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
- min = 0;
- opt = VM_ENTRY_LOAD_IA32_PAT;
+ min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
@@ -4223,6 +4255,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
@@ -4496,39 +4532,28 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
PIN_BASED_NMI_EXITING;
}
-static int enable_irq_window(struct kvm_vcpu *vcpu)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
- /*
- * We get here if vmx_interrupt_allowed() said we can't
- * inject to L1 now because L2 must run. The caller will have
- * to make L2 exit right after entry, so we can inject to L1
- * more promptly.
- */
- return -EBUSY;
-
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- return 0;
}
-static int enable_nmi_window(struct kvm_vcpu *vcpu)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
- if (!cpu_has_virtual_nmis())
- return enable_irq_window(vcpu);
-
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
- return enable_irq_window(vcpu);
+ if (!cpu_has_virtual_nmis() ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
- return 0;
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4620,22 +4645,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu)) {
- if (to_vmx(vcpu)->nested.nested_run_pending)
- return 0;
- if (nested_exit_on_nmi(vcpu)) {
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- NMI_VECTOR | INTR_TYPE_NMI_INTR |
- INTR_INFO_VALID_MASK, 0);
- /*
- * The NMI-triggered VM exit counts as injection:
- * clear this one and block further NMIs.
- */
- vcpu->arch.nmi_pending = 0;
- vmx_set_nmi_mask(vcpu, true);
- return 0;
- }
- }
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
return 0;
@@ -4647,19 +4658,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu)) {
- if (to_vmx(vcpu)->nested.nested_run_pending)
- return 0;
- if (nested_exit_on_intr(vcpu)) {
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
- 0, 0);
- /*
- * fall through to normal code, but now in L1, not L2
- */
- }
- }
-
- return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ return (!to_vmx(vcpu)->nested.nested_run_pending &&
+ vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
@@ -5102,6 +5102,22 @@ static int handle_dr(struct kvm_vcpu *vcpu)
}
}
+ if (vcpu->guest_debug == 0) {
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
@@ -5128,6 +5144,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
{
}
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ get_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -5727,6 +5761,18 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
*/
}
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+ struct vcpu_vmx *vmx =
+ container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+ vmx->nested.preemption_timer_expired = true;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+ kvm_vcpu_kick(&vmx->vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
/*
* Emulate the VMXON instruction.
* Currently, we just remember that VMX is active, and do not save or even
@@ -5791,6 +5837,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
vmx->nested.vmcs02_num = 0;
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
vmx->nested.vmxon = true;
skip_emulated_instruction(vcpu);
@@ -6767,9 +6817,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* table is L0's fault.
*/
return 0;
- case EXIT_REASON_PREEMPTION_TIMER:
- return vmcs12->pin_based_vm_exec_control &
- PIN_BASED_VMX_PREEMPTION_TIMER;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
case EXIT_REASON_XSETBV:
@@ -6785,27 +6832,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
-static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
-{
- u64 delta_tsc_l1;
- u32 preempt_val_l1, preempt_val_l2, preempt_scale;
-
- if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
- PIN_BASED_VMX_PREEMPTION_TIMER))
- return;
- preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
- MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
- preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
- delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
- - vcpu->arch.last_guest_tsc;
- preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
- if (preempt_val_l2 <= preempt_val_l1)
- preempt_val_l2 = 0;
- else
- preempt_val_l2 -= preempt_val_l1;
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
-}
-
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7052,6 +7078,12 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
local_irq_enable();
}
+static bool vmx_mpx_supported(void)
+{
+ return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+ (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -7218,8 +7250,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
- if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
- nested_adjust_preemption_timer(vcpu);
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -7616,6 +7646,28 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
kvm_inject_page_fault(vcpu, fault);
}
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
+ /* Make sure short timeouts reliably trigger an immediate vmexit.
+ * hrtimer_start does not guarantee this. */
+ if (preemption_timeout <= 1) {
+ vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+ return;
+ }
+
+ preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+ preemption_timeout *= 1000000;
+ do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+ hrtimer_start(&vmx->nested.preemption_timer,
+ ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7629,7 +7681,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;
- u32 exit_control;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7687,13 +7738,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
- (vmcs_config.pin_based_exec_ctrl |
- vmcs12->pin_based_vm_exec_control));
+ exec_control = vmcs12->pin_based_vm_exec_control;
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
- if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
- vmcs12->vmx_preemption_timer_value);
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
/*
* Whether page-faults are trapped is determined by a combination of
@@ -7721,7 +7773,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
enable_ept ? vmcs12->page_fault_error_code_match : 0);
if (cpu_has_secondary_exec_ctrls()) {
- u32 exec_control = vmx_secondary_exec_control(vmx);
+ exec_control = vmx_secondary_exec_control(vmx);
if (!vmx->rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
@@ -7808,10 +7860,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits are further modified by vmx_set_efer() below.
*/
- exit_control = vmcs_config.vmexit_ctrl;
- if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
- exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
- vm_exit_controls_init(vmx, exit_control);
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
* emulated by vmx_set_efer(), below.
@@ -7830,6 +7879,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
set_cr4_guest_host_mask(vmx);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vmcs_write64(TSC_OFFSET,
vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
@@ -8155,6 +8207,58 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
}
}
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ vmx->nested.preemption_timer_expired) {
+ if (vmx->nested.nested_run_pending)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+ if (vmx->nested.nested_run_pending ||
+ vcpu->arch.interrupt.pending)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ NMI_VECTOR | INTR_TYPE_NMI_INTR |
+ INTR_INFO_VALID_MASK, 0);
+ /*
+ * The NMI-triggered VM exit counts as injection:
+ * clear this one and block further NMIs.
+ */
+ vcpu->arch.nmi_pending = 0;
+ vmx_set_nmi_mask(vcpu, true);
+ return 0;
+ }
+
+ if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
+ nested_exit_on_intr(vcpu)) {
+ if (vmx->nested.nested_run_pending)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ }
+
+ return 0;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ ktime_t remaining =
+ hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+ u64 value;
+
+ if (ktime_to_ns(remaining) <= 0)
+ return 0;
+
+ value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+ do_div(value, 1000000);
+ return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
/*
* prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
* and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -8225,10 +8329,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
- if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
- (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
- vmcs12->vmx_preemption_timer_value =
- vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ if (vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+ }
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -8260,6 +8367,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+ if (vmx_mpx_supported())
+ vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
/* update exit information fields: */
@@ -8369,6 +8478,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
vcpu->arch.pat = vmcs12->host_ia32_pat;
@@ -8495,6 +8608,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
nested_vmx_succeed(vcpu);
if (enable_shadow_vmcs)
vmx->nested.sync_shadow_vmcs = true;
+
+ /* in case we halted in L2 */
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
/*
@@ -8573,6 +8689,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_dr6 = vmx_get_dr6,
.set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
@@ -8634,6 +8751,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_intercept = vmx_check_intercept,
.handle_external_intr = vmx_handle_external_intr,
+ .mpx_supported = vmx_mpx_supported,
+
+ .check_nested_events = vmx_check_nested_events,
};
static int __init vmx_init(void)
@@ -8721,6 +8841,8 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b8578432d5b..d1c55f8722c6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -595,13 +595,13 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- u64 xcr0;
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
u64 valid_bits;
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
- xcr0 = xcr;
if (!(xcr0 & XSTATE_FP))
return 1;
if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
@@ -616,8 +616,14 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if (xcr0 & ~valid_bits)
return 1;
+ if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+ return 1;
+
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+ kvm_update_cpuid(vcpu);
return 0;
}
@@ -753,7 +759,9 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
else
dr7 = vcpu->arch.dr7;
kvm_x86_ops->set_dr7(vcpu, dr7);
- vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
@@ -879,7 +887,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
};
static unsigned num_msrs_to_save;
@@ -1581,7 +1589,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
- vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
/*
@@ -1623,14 +1630,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* the others.
*
* So in those cases, request a kvmclock update for all vcpus.
- * The worst case for a remote vcpu to update its kvmclock
- * is then bounded by maximum nohz sleep latency.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
*/
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
{
int i;
- struct kvm *kvm = v->kvm;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1639,6 +1653,29 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
}
}
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
@@ -2323,9 +2360,12 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_VP_INDEX: {
int r;
struct kvm_vcpu *v;
- kvm_for_each_vcpu(r, v, vcpu->kvm)
- if (v == vcpu)
+ kvm_for_each_vcpu(r, v, vcpu->kvm) {
+ if (v == vcpu) {
data = r;
+ break;
+ }
+ }
break;
}
case HV_X64_MSR_EOI:
@@ -2617,6 +2657,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -3043,9 +3084,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~KVM_SUPPORTED_XCR0)
- return -EINVAL;
- if (xstate_bv & ~host_xcr0)
+ if (xstate_bv & ~kvm_supported_xcr0())
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state->xsave,
guest_xsave->region, vcpu->arch.guest_xstate_size);
@@ -3898,6 +3937,23 @@ static void kvm_init_msr_list(void)
for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed
+ * to the guests in some cases. We could work around this
+ * in VMX with the generic MSR save/load machinery, but it
+ * is not really worthwhile since it will really only
+ * happen with nested virtualization.
+ */
+ switch (msrs_to_save[i]) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_x86_ops->mpx_supported())
+ continue;
+ break;
+ default:
+ break;
+ }
+
if (j < i)
msrs_to_save[j] = msrs_to_save[i];
j++;
@@ -4394,6 +4450,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
+ mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -5537,9 +5594,10 @@ int kvm_arch_init(void *opaque)
goto out_free_percpu;
kvm_set_mmio_spte_mask();
- kvm_init_msr_list();
kvm_x86_ops = ops;
+ kvm_init_msr_list();
+
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5782,8 +5840,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
{
+ int r;
+
/* try to reinject previous events if any */
if (vcpu->arch.exception.pending) {
trace_kvm_inj_exception(vcpu->arch.exception.nr,
@@ -5793,17 +5853,23 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.reinject);
- return;
+ return 0;
}
if (vcpu->arch.nmi_injected) {
kvm_x86_ops->set_nmi(vcpu);
- return;
+ return 0;
}
if (vcpu->arch.interrupt.pending) {
kvm_x86_ops->set_irq(vcpu);
- return;
+ return 0;
+ }
+
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
}
/* try to inject new event if pending */
@@ -5820,6 +5886,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
kvm_x86_ops->set_irq(vcpu);
}
}
+ return 0;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -5924,15 +5991,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
- inject_pending_event(vcpu);
-
+ if (inject_pending_event(vcpu, req_int_win) != 0)
+ req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
- if (vcpu->arch.nmi_pending)
- req_immediate_exit =
- kvm_x86_ops->enable_nmi_window(vcpu) != 0;
+ else if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- req_immediate_exit =
- kvm_x86_ops->enable_irq_window(vcpu) != 0;
+ kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
/*
@@ -5992,12 +6057,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
}
trace_kvm_entry(vcpu->vcpu_id);
kvm_x86_ops->run(vcpu);
/*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ int i;
+
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+
+ /*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
@@ -6711,6 +6792,7 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
int r;
struct msr_data msr;
+ struct kvm *kvm = vcpu->kvm;
r = vcpu_load(vcpu);
if (r)
@@ -6721,6 +6803,9 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+
return r;
}
@@ -7013,6 +7098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
return 0;
}
@@ -7050,6 +7138,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
}
@@ -7248,6 +7338,9 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu, false);
+
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8da5823bcde6..8c97bac9a895 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -122,9 +122,12 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
-#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
+#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
+ | XSTATE_BNDREGS | XSTATE_BNDCSR)
extern u64 host_xcr0;
+extern u64 kvm_supported_xcr0(void);
+
extern unsigned int min_timer_period_us;
extern struct static_key kvm_no_apic_vcpu;
diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c
index 0fc584832001..1e1fc671f89a 100644
--- a/drivers/s390/kvm/virtio_ccw.c
+++ b/drivers/s390/kvm/virtio_ccw.c
@@ -1,7 +1,7 @@
/*
* ccw based virtio transport
*
- * Copyright IBM Corp. 2012
+ * Copyright IBM Corp. 2012, 2014
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2 only)
@@ -32,6 +32,8 @@
#include <asm/cio.h>
#include <asm/ccwdev.h>
#include <asm/virtio-ccw.h>
+#include <asm/isc.h>
+#include <asm/airq.h>
/*
* virtio related functions
@@ -58,6 +60,9 @@ struct virtio_ccw_device {
unsigned long indicators;
unsigned long indicators2;
struct vq_config_block *config_block;
+ bool is_thinint;
+ bool going_away;
+ void *airq_info;
};
struct vq_info_block {
@@ -72,15 +77,38 @@ struct virtio_feature_desc {
__u8 index;
} __packed;
+struct virtio_thinint_area {
+ unsigned long summary_indicator;
+ unsigned long indicator;
+ u64 bit_nr;
+ u8 isc;
+} __packed;
+
struct virtio_ccw_vq_info {
struct virtqueue *vq;
int num;
void *queue;
struct vq_info_block *info_block;
+ int bit_nr;
struct list_head node;
long cookie;
};
+#define VIRTIO_AIRQ_ISC IO_SCH_ISC /* inherit from subchannel */
+
+#define VIRTIO_IV_BITS (L1_CACHE_BYTES * 8)
+#define MAX_AIRQ_AREAS 20
+
+static int virtio_ccw_use_airq = 1;
+
+struct airq_info {
+ rwlock_t lock;
+ u8 summary_indicator;
+ struct airq_struct airq;
+ struct airq_iv *aiv;
+};
+static struct airq_info *airq_areas[MAX_AIRQ_AREAS];
+
#define CCW_CMD_SET_VQ 0x13
#define CCW_CMD_VDEV_RESET 0x33
#define CCW_CMD_SET_IND 0x43
@@ -91,6 +119,7 @@ struct virtio_ccw_vq_info {
#define CCW_CMD_WRITE_CONF 0x21
#define CCW_CMD_WRITE_STATUS 0x31
#define CCW_CMD_READ_VQ_CONF 0x32
+#define CCW_CMD_SET_IND_ADAPTER 0x73
#define VIRTIO_CCW_DOING_SET_VQ 0x00010000
#define VIRTIO_CCW_DOING_RESET 0x00040000
@@ -102,6 +131,7 @@ struct virtio_ccw_vq_info {
#define VIRTIO_CCW_DOING_SET_IND 0x01000000
#define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
#define VIRTIO_CCW_DOING_SET_CONF_IND 0x04000000
+#define VIRTIO_CCW_DOING_SET_IND_ADAPTER 0x08000000
#define VIRTIO_CCW_INTPARM_MASK 0xffff0000
static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
@@ -109,6 +139,125 @@ static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
return container_of(vdev, struct virtio_ccw_device, vdev);
}
+static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info)
+{
+ unsigned long i, flags;
+
+ write_lock_irqsave(&info->lock, flags);
+ for (i = 0; i < airq_iv_end(info->aiv); i++) {
+ if (vq == (void *)airq_iv_get_ptr(info->aiv, i)) {
+ airq_iv_free_bit(info->aiv, i);
+ airq_iv_set_ptr(info->aiv, i, 0);
+ break;
+ }
+ }
+ write_unlock_irqrestore(&info->lock, flags);
+}
+
+static void virtio_airq_handler(struct airq_struct *airq)
+{
+ struct airq_info *info = container_of(airq, struct airq_info, airq);
+ unsigned long ai;
+
+ inc_irq_stat(IRQIO_VAI);
+ read_lock(&info->lock);
+ /* Walk through indicators field, summary indicator active. */
+ for (ai = 0;;) {
+ ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+ if (ai == -1UL)
+ break;
+ vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+ }
+ info->summary_indicator = 0;
+ smp_wmb();
+ /* Walk through indicators field, summary indicator not active. */
+ for (ai = 0;;) {
+ ai = airq_iv_scan(info->aiv, ai, airq_iv_end(info->aiv));
+ if (ai == -1UL)
+ break;
+ vring_interrupt(0, (void *)airq_iv_get_ptr(info->aiv, ai));
+ }
+ read_unlock(&info->lock);
+}
+
+static struct airq_info *new_airq_info(void)
+{
+ struct airq_info *info;
+ int rc;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return NULL;
+ rwlock_init(&info->lock);
+ info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR);
+ if (!info->aiv) {
+ kfree(info);
+ return NULL;
+ }
+ info->airq.handler = virtio_airq_handler;
+ info->airq.lsi_ptr = &info->summary_indicator;
+ info->airq.lsi_mask = 0xff;
+ info->airq.isc = VIRTIO_AIRQ_ISC;
+ rc = register_adapter_interrupt(&info->airq);
+ if (rc) {
+ airq_iv_release(info->aiv);
+ kfree(info);
+ return NULL;
+ }
+ return info;
+}
+
+static void destroy_airq_info(struct airq_info *info)
+{
+ if (!info)
+ return;
+
+ unregister_adapter_interrupt(&info->airq);
+ airq_iv_release(info->aiv);
+ kfree(info);
+}
+
+static unsigned long get_airq_indicator(struct virtqueue *vqs[], int nvqs,
+ u64 *first, void **airq_info)
+{
+ int i, j;
+ struct airq_info *info;
+ unsigned long indicator_addr = 0;
+ unsigned long bit, flags;
+
+ for (i = 0; i < MAX_AIRQ_AREAS && !indicator_addr; i++) {
+ if (!airq_areas[i])
+ airq_areas[i] = new_airq_info();
+ info = airq_areas[i];
+ if (!info)
+ return 0;
+ write_lock_irqsave(&info->lock, flags);
+ bit = airq_iv_alloc(info->aiv, nvqs);
+ if (bit == -1UL) {
+ /* Not enough vacancies. */
+ write_unlock_irqrestore(&info->lock, flags);
+ continue;
+ }
+ *first = bit;
+ *airq_info = info;
+ indicator_addr = (unsigned long)info->aiv->vector;
+ for (j = 0; j < nvqs; j++) {
+ airq_iv_set_ptr(info->aiv, bit + j,
+ (unsigned long)vqs[j]);
+ }
+ write_unlock_irqrestore(&info->lock, flags);
+ }
+ return indicator_addr;
+}
+
+static void virtio_ccw_drop_indicators(struct virtio_ccw_device *vcdev)
+{
+ struct virtio_ccw_vq_info *info;
+
+ list_for_each_entry(info, &vcdev->virtqueues, node)
+ drop_airq_indicator(info->vq, vcdev->airq_info);
+}
+
static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
{
unsigned long flags;
@@ -145,6 +294,51 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
return ret ? ret : vcdev->err;
}
+static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev,
+ struct ccw1 *ccw)
+{
+ int ret;
+ unsigned long *indicatorp = NULL;
+ struct virtio_thinint_area *thinint_area = NULL;
+ struct airq_info *airq_info = vcdev->airq_info;
+
+ if (vcdev->is_thinint) {
+ thinint_area = kzalloc(sizeof(*thinint_area),
+ GFP_DMA | GFP_KERNEL);
+ if (!thinint_area)
+ return;
+ thinint_area->summary_indicator =
+ (unsigned long) &airq_info->summary_indicator;
+ thinint_area->isc = VIRTIO_AIRQ_ISC;
+ ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+ ccw->count = sizeof(*thinint_area);
+ ccw->cda = (__u32)(unsigned long) thinint_area;
+ } else {
+ indicatorp = kmalloc(sizeof(&vcdev->indicators),
+ GFP_DMA | GFP_KERNEL);
+ if (!indicatorp)
+ return;
+ *indicatorp = 0;
+ ccw->cmd_code = CCW_CMD_SET_IND;
+ ccw->count = sizeof(vcdev->indicators);
+ ccw->cda = (__u32)(unsigned long) indicatorp;
+ }
+ /* Deregister indicators from host. */
+ vcdev->indicators = 0;
+ ccw->flags = 0;
+ ret = ccw_io_helper(vcdev, ccw,
+ vcdev->is_thinint ?
+ VIRTIO_CCW_DOING_SET_IND_ADAPTER :
+ VIRTIO_CCW_DOING_SET_IND);
+ if (ret && (ret != -ENODEV))
+ dev_info(&vcdev->cdev->dev,
+ "Failed to deregister indicators (%d)\n", ret);
+ else if (vcdev->is_thinint)
+ virtio_ccw_drop_indicators(vcdev);
+ kfree(indicatorp);
+ kfree(thinint_area);
+}
+
static inline long do_kvm_notify(struct subchannel_id schid,
unsigned long queue_index,
long cookie)
@@ -232,11 +426,13 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev)
{
struct virtqueue *vq, *n;
struct ccw1 *ccw;
+ struct virtio_ccw_device *vcdev = to_vc_device(vdev);
ccw = kzalloc(sizeof(*ccw), GFP_DMA | GFP_KERNEL);
if (!ccw)
return;
+ virtio_ccw_drop_indicator(vcdev, ccw);
list_for_each_entry_safe(vq, n, &vdev->vqs, list)
virtio_ccw_del_vq(vq, ccw);
@@ -326,6 +522,54 @@ out_err:
return ERR_PTR(err);
}
+static int virtio_ccw_register_adapter_ind(struct virtio_ccw_device *vcdev,
+ struct virtqueue *vqs[], int nvqs,
+ struct ccw1 *ccw)
+{
+ int ret;
+ struct virtio_thinint_area *thinint_area = NULL;
+ struct airq_info *info;
+
+ thinint_area = kzalloc(sizeof(*thinint_area), GFP_DMA | GFP_KERNEL);
+ if (!thinint_area) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* Try to get an indicator. */
+ thinint_area->indicator = get_airq_indicator(vqs, nvqs,
+ &thinint_area->bit_nr,
+ &vcdev->airq_info);
+ if (!thinint_area->indicator) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ info = vcdev->airq_info;
+ thinint_area->summary_indicator =
+ (unsigned long) &info->summary_indicator;
+ thinint_area->isc = VIRTIO_AIRQ_ISC;
+ ccw->cmd_code = CCW_CMD_SET_IND_ADAPTER;
+ ccw->flags = CCW_FLAG_SLI;
+ ccw->count = sizeof(*thinint_area);
+ ccw->cda = (__u32)(unsigned long)thinint_area;
+ ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND_ADAPTER);
+ if (ret) {
+ if (ret == -EOPNOTSUPP) {
+ /*
+ * The host does not support adapter interrupts
+ * for virtio-ccw, stop trying.
+ */
+ virtio_ccw_use_airq = 0;
+ pr_info("Adapter interrupts unsupported on host\n");
+ } else
+ dev_warn(&vcdev->cdev->dev,
+ "enabling adapter interrupts = %d\n", ret);
+ virtio_ccw_drop_indicators(vcdev);
+ }
+out:
+ kfree(thinint_area);
+ return ret;
+}
+
static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
struct virtqueue *vqs[],
vq_callback_t *callbacks[],
@@ -355,15 +599,23 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
if (!indicatorp)
goto out;
*indicatorp = (unsigned long) &vcdev->indicators;
- /* Register queue indicators with host. */
- vcdev->indicators = 0;
- ccw->cmd_code = CCW_CMD_SET_IND;
- ccw->flags = 0;
- ccw->count = sizeof(vcdev->indicators);
- ccw->cda = (__u32)(unsigned long) indicatorp;
- ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
- if (ret)
- goto out;
+ if (vcdev->is_thinint) {
+ ret = virtio_ccw_register_adapter_ind(vcdev, vqs, nvqs, ccw);
+ if (ret)
+ /* no error, just fall back to legacy interrupts */
+ vcdev->is_thinint = 0;
+ }
+ if (!vcdev->is_thinint) {
+ /* Register queue indicators with host. */
+ vcdev->indicators = 0;
+ ccw->cmd_code = CCW_CMD_SET_IND;
+ ccw->flags = 0;
+ ccw->count = sizeof(vcdev->indicators);
+ ccw->cda = (__u32)(unsigned long) indicatorp;
+ ret = ccw_io_helper(vcdev, ccw, VIRTIO_CCW_DOING_SET_IND);
+ if (ret)
+ goto out;
+ }
/* Register indicators2 with host for config changes */
*indicatorp = (unsigned long) &vcdev->indicators2;
vcdev->indicators2 = 0;
@@ -636,6 +888,8 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
struct virtqueue *vq;
struct virtio_driver *drv;
+ if (!vcdev)
+ return;
/* Check if it's a notification from the host. */
if ((intparm == 0) &&
(scsw_stctl(&irb->scsw) ==
@@ -663,6 +917,7 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev,
case VIRTIO_CCW_DOING_SET_CONF_IND:
case VIRTIO_CCW_DOING_RESET:
case VIRTIO_CCW_DOING_READ_VQ_CONF:
+ case VIRTIO_CCW_DOING_SET_IND_ADAPTER:
vcdev->curr_io &= ~activity;
wake_up(&vcdev->wait_q);
break;
@@ -734,23 +989,46 @@ static int virtio_ccw_probe(struct ccw_device *cdev)
return 0;
}
+static struct virtio_ccw_device *virtio_grab_drvdata(struct ccw_device *cdev)
+{
+ unsigned long flags;
+ struct virtio_ccw_device *vcdev;
+
+ spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+ vcdev = dev_get_drvdata(&cdev->dev);
+ if (!vcdev || vcdev->going_away) {
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+ return NULL;
+ }
+ vcdev->going_away = true;
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+ return vcdev;
+}
+
static void virtio_ccw_remove(struct ccw_device *cdev)
{
- struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+ unsigned long flags;
+ struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
- if (cdev->online) {
+ if (vcdev && cdev->online)
unregister_virtio_device(&vcdev->vdev);
- dev_set_drvdata(&cdev->dev, NULL);
- }
+ spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+ dev_set_drvdata(&cdev->dev, NULL);
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
cdev->handler = NULL;
}
static int virtio_ccw_offline(struct ccw_device *cdev)
{
- struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+ unsigned long flags;
+ struct virtio_ccw_device *vcdev = virtio_grab_drvdata(cdev);
- unregister_virtio_device(&vcdev->vdev);
- dev_set_drvdata(&cdev->dev, NULL);
+ if (vcdev) {
+ unregister_virtio_device(&vcdev->vdev);
+ spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
+ dev_set_drvdata(&cdev->dev, NULL);
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
+ }
return 0;
}
@@ -759,6 +1037,7 @@ static int virtio_ccw_online(struct ccw_device *cdev)
{
int ret;
struct virtio_ccw_device *vcdev;
+ unsigned long flags;
vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
if (!vcdev) {
@@ -778,6 +1057,8 @@ static int virtio_ccw_online(struct ccw_device *cdev)
goto out_free;
}
+ vcdev->is_thinint = virtio_ccw_use_airq; /* at least try */
+
vcdev->vdev.dev.parent = &cdev->dev;
vcdev->vdev.dev.release = virtio_ccw_release_dev;
vcdev->vdev.config = &virtio_ccw_config_ops;
@@ -786,7 +1067,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
INIT_LIST_HEAD(&vcdev->virtqueues);
spin_lock_init(&vcdev->lock);
+ spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
dev_set_drvdata(&cdev->dev, vcdev);
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
vcdev->vdev.id.vendor = cdev->id.cu_type;
vcdev->vdev.id.device = cdev->id.cu_model;
ret = register_virtio_device(&vcdev->vdev);
@@ -797,7 +1080,9 @@ static int virtio_ccw_online(struct ccw_device *cdev)
}
return 0;
out_put:
+ spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
dev_set_drvdata(&cdev->dev, NULL);
+ spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
put_device(&vcdev->vdev.dev);
return ret;
out_free:
@@ -935,6 +1220,10 @@ module_init(virtio_ccw_init);
static void __exit virtio_ccw_exit(void)
{
+ int i;
+
ccw_driver_unregister(&virtio_ccw_driver);
+ for (i = 0; i < MAX_AIRQ_AREAS; i++)
+ destroy_airq_info(airq_areas[i]);
}
module_exit(virtio_ccw_exit);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b8e9a43e501a..7d21cf9f4380 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -192,7 +192,7 @@ struct kvm_async_pf {
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
struct kvm_arch_async_pf *arch);
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
@@ -297,6 +297,14 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
}
+struct kvm_s390_adapter_int {
+ u64 ind_addr;
+ u64 summary_addr;
+ u64 ind_offset;
+ u32 summary_offset;
+ u32 adapter_id;
+};
+
struct kvm_kernel_irq_routing_entry {
u32 gsi;
u32 type;
@@ -309,6 +317,7 @@ struct kvm_kernel_irq_routing_entry {
unsigned pin;
} irqchip;
struct msi_msg msi;
+ struct kvm_s390_adapter_int adapter;
};
struct hlist_node link;
};
@@ -401,7 +410,9 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
- long tlbs_dirty;
+ /* Protected by mmu_lock */
+ bool tlbs_dirty;
+
struct list_head devices;
};
@@ -911,7 +922,11 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+#ifdef CONFIG_S390
+#define KVM_MAX_IRQ_ROUTES 4096 //FIXME: we can have more than that...
+#else
#define KVM_MAX_IRQ_ROUTES 1024
+#endif
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_set_irq_routing(struct kvm *kvm,
@@ -1064,6 +1079,7 @@ extern struct kvm_device_ops kvm_mpic_ops;
extern struct kvm_device_ops kvm_xics_ops;
extern struct kvm_device_ops kvm_vfio_ops;
extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
+extern struct kvm_device_ops kvm_flic_ops;
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 932d7f2637d6..a8f4ee5d2e82 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -413,6 +413,8 @@ struct kvm_s390_psw {
#define KVM_S390_PROGRAM_INT 0xfffe0001u
#define KVM_S390_SIGP_SET_PREFIX 0xfffe0002u
#define KVM_S390_RESTART 0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT 0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE 0xfffe0005u
#define KVM_S390_MCHK 0xfffe1000u
#define KVM_S390_INT_VIRTIO 0xffff2603u
#define KVM_S390_INT_SERVICE 0xffff2401u
@@ -434,6 +436,69 @@ struct kvm_s390_interrupt {
__u64 parm64;
};
+struct kvm_s390_io_info {
+ __u16 subchannel_id;
+ __u16 subchannel_nr;
+ __u32 io_int_parm;
+ __u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+ __u32 ext_params;
+ __u32 pad;
+ __u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+ __u64 trans_exc_code;
+ __u64 mon_code;
+ __u64 per_address;
+ __u32 data_exc_code;
+ __u16 code;
+ __u16 mon_class_nr;
+ __u8 per_code;
+ __u8 per_atmid;
+ __u8 exc_access_id;
+ __u8 per_access_id;
+ __u8 op_access_id;
+ __u8 pad[3];
+};
+
+struct kvm_s390_prefix_info {
+ __u32 address;
+};
+
+struct kvm_s390_extcall_info {
+ __u16 code;
+};
+
+struct kvm_s390_emerg_info {
+ __u16 code;
+};
+
+struct kvm_s390_mchk_info {
+ __u64 cr14;
+ __u64 mcic;
+ __u64 failing_storage_address;
+ __u32 ext_damage_code;
+ __u32 pad;
+ __u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+ __u64 type;
+ union {
+ struct kvm_s390_io_info io;
+ struct kvm_s390_ext_info ext;
+ struct kvm_s390_pgm_info pgm;
+ struct kvm_s390_emerg_info emerg;
+ struct kvm_s390_extcall_info extcall;
+ struct kvm_s390_prefix_info prefix;
+ struct kvm_s390_mchk_info mchk;
+ char reserved[64];
+ } u;
+};
+
/* for KVM_SET_GUEST_DEBUG */
#define KVM_GUESTDBG_ENABLE 0x00000001
@@ -675,6 +740,9 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_SPAPR_MULTITCE 94
#define KVM_CAP_EXT_EMUL_CPUID 95
#define KVM_CAP_HYPERV_TIME 96
+#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
+#define KVM_CAP_ENABLE_CAP_VM 98
+#define KVM_CAP_S390_IRQCHIP 99
#ifdef KVM_CAP_IRQ_ROUTING
@@ -690,9 +758,18 @@ struct kvm_irq_routing_msi {
__u32 pad;
};
+struct kvm_irq_routing_s390_adapter {
+ __u64 ind_addr;
+ __u64 summary_addr;
+ __u64 ind_offset;
+ __u32 summary_offset;
+ __u32 adapter_id;
+};
+
/* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
struct kvm_irq_routing_entry {
__u32 gsi;
@@ -702,6 +779,7 @@ struct kvm_irq_routing_entry {
union {
struct kvm_irq_routing_irqchip irqchip;
struct kvm_irq_routing_msi msi;
+ struct kvm_irq_routing_s390_adapter adapter;
__u32 pad[8];
} u;
};
@@ -855,6 +933,7 @@ struct kvm_device_attr {
#define KVM_DEV_VFIO_GROUP_ADD 1
#define KVM_DEV_VFIO_GROUP_DEL 2
#define KVM_DEV_TYPE_ARM_VGIC_V2 5
+#define KVM_DEV_TYPE_FLIC 6
/*
* ioctls for VM fds
@@ -1009,6 +1088,10 @@ struct kvm_s390_ucas_mapping {
/* Available with KVM_CAP_DEBUGREGS */
#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
+/*
+ * vcpu version available with KVM_ENABLE_CAP
+ * vm version available with KVM_CAP_ENABLE_CAP_VM
+ */
#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
/* Available with KVM_CAP_XSAVE */
#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index fbe1a48bd629..13f2d19793e3 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -22,6 +22,10 @@ config KVM_MMIO
config KVM_ASYNC_PF
bool
+# Toggle to switch between direct notification and batch job
+config KVM_ASYNC_PF_SYNC
+ bool
+
config HAVE_KVM_MSI
bool
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 8631d9c14320..10df100c4514 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -28,6 +28,21 @@
#include "async_pf.h"
#include <trace/events/kvm.h>
+static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+#ifndef CONFIG_KVM_ASYNC_PF_SYNC
+ kvm_arch_async_page_present(vcpu, work);
+#endif
+}
+
static struct kmem_cache *async_pf_cache;
int kvm_async_pf_init(void)
@@ -69,6 +84,7 @@ static void async_pf_execute(struct work_struct *work)
down_read(&mm->mmap_sem);
get_user_pages(current, mm, addr, 1, 1, 0, NULL, NULL);
up_read(&mm->mmap_sem);
+ kvm_async_page_present_sync(vcpu, apf);
unuse_mm(mm);
spin_lock(&vcpu->async_pf.lock);
@@ -97,11 +113,16 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
list_entry(vcpu->async_pf.queue.next,
typeof(*work), queue);
list_del(&work->queue);
+
+#ifdef CONFIG_KVM_ASYNC_PF_SYNC
+ flush_work(&work->work);
+#else
if (cancel_work_sync(&work->work)) {
mmdrop(work->mm);
kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
kmem_cache_free(async_pf_cache, work);
}
+#endif
}
spin_lock(&vcpu->async_pf.lock);
@@ -130,7 +151,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->async_pf.lock);
kvm_arch_async_page_ready(vcpu, work);
- kvm_arch_async_page_present(vcpu, work);
+ kvm_async_page_present_async(vcpu, work);
list_del(&work->queue);
vcpu->async_pf.queued--;
@@ -138,7 +159,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
}
}
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
@@ -159,7 +180,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
work->wakeup_all = false;
work->vcpu = vcpu;
work->gva = gva;
- work->addr = gfn_to_hva(vcpu->kvm, gfn);
+ work->addr = hva;
work->arch = *arch;
work->mm = current->mm;
atomic_inc(&work->mm->mm_count);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index abe4d6043b36..29c2a04e036e 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -391,19 +391,19 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
- events = f.file->f_op->poll(f.file, &irqfd->pt);
-
list_add_tail(&irqfd->list, &kvm->irqfds.items);
+ spin_unlock_irq(&kvm->irqfds.lock);
+
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
+ events = f.file->f_op->poll(f.file, &irqfd->pt);
+
if (events & POLLIN)
schedule_work(&irqfd->inject);
- spin_unlock_irq(&kvm->irqfds.lock);
-
/*
* do not drop the file until the irqfd is fully initialized, otherwise
* we might race against the POLLHUP
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ce9ed99ad7dc..d4b601547f1f 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -50,7 +50,7 @@
#else
#define ioapic_debug(fmt, arg...)
#endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -163,23 +163,67 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
return false;
}
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
- bool line_status)
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
{
- union kvm_ioapic_redirect_entry *pent;
- int injected = -1;
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
- pent = &ioapic->redirtbl[idx];
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
- if (!pent->fields.mask) {
- injected = ioapic_deliver(ioapic, idx, line_status);
- if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
- pent->fields.remote_irr = 1;
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
}
- return injected;
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if ((edge && old_irr == ioapic->irr) ||
+ (!edge && entry.fields.remote_irr)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
}
+
static void update_handled_vectors(struct kvm_ioapic *ioapic)
{
DECLARE_BITMAP(handled_vectors, 256);
@@ -282,12 +326,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
}
}
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
struct kvm_lapic_irq irqe;
int ret;
+ if (entry->fields.mask)
+ return -1;
+
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x\n",
entry->fields.dest_id, entry->fields.dest_mode,
@@ -302,6 +349,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.level = 1;
irqe.shorthand = 0;
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr &= ~(1 << irq);
+
if (irq == RTC_GSI && line_status) {
BUG_ON(ioapic->rtc_status.pending_eoi != 0);
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
@@ -310,45 +360,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
} else
ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
return ret;
}
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status)
{
- u32 old_irr;
- u32 mask = 1 << irq;
- union kvm_ioapic_redirect_entry entry;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
spin_lock(&ioapic->lock);
- old_irr = ioapic->irr;
irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
irq_source_id, level);
- entry = ioapic->redirtbl[irq];
- irq_level ^= entry.fields.polarity;
- if (!irq_level) {
- ioapic->irr &= ~mask;
- ret = 1;
- } else {
- int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
- if (irq == RTC_GSI && line_status &&
- rtc_irq_check_coalesced(ioapic)) {
- ret = 0; /* coalesced */
- goto out;
- }
- ioapic->irr |= mask;
- if ((edge && old_irr != ioapic->irr) ||
- (!edge && !entry.fields.remote_irr))
- ret = ioapic_service(ioapic, irq, line_status);
- else
- ret = 0; /* report coalesced interrupt */
- }
-out:
- trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
spin_unlock(&ioapic->lock);
return ret;
@@ -394,7 +423,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << i)))
+ if (ioapic->irr & (1 << i))
ioapic_service(ioapic, i, false);
}
}
@@ -595,9 +624,10 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
spin_lock(&ioapic->lock);
memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
update_handled_vectors(ioapic);
kvm_vcpu_request_scan_ioapic(kvm);
- kvm_rtc_eoi_tracking_restore_all(ioapic);
+ kvm_ioapic_inject_all(ioapic, state->irr);
spin_unlock(&ioapic->lock);
return 0;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b5ec7fb986f6..56baae8c2f56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -186,12 +186,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
- long dirty_count = kvm->tlbs_dirty;
-
- smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
- cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+ kvm->tlbs_dirty = false;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
@@ -1804,7 +1801,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (vcpu == me)
continue;
- if (waitqueue_active(&vcpu->wq))
+ if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
continue;
if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
continue;
@@ -2284,6 +2281,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
ops = &kvm_arm_vgic_v2_ops;
break;
#endif
+#ifdef CONFIG_S390
+ case KVM_DEV_TYPE_FLIC:
+ ops = &kvm_flic_ops;
+ break;
+#endif
default:
return -ENODEV;
}