summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-16 13:00:24 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-16 13:00:24 -0800
commit974aa5630b318938273d7efe7a2cf031c7b927db (patch)
treeb79803c07b9c16d87058ce69f80ebe173cdfd838 /arch
parent441692aafc1731087bbaf657a8b6059d95c2a6df (diff)
parenta6014f1ab7088dc02b58991cfb6b32a34afdbf12 (diff)
downloadlinux-974aa5630b318938273d7efe7a2cf031c7b927db.tar.gz
linux-974aa5630b318938273d7efe7a2cf031c7b927db.tar.bz2
linux-974aa5630b318938273d7efe7a2cf031c7b927db.zip
Merge tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Radim Krčmář: "First batch of KVM changes for 4.15 Common: - Python 3 support in kvm_stat - Accounting of slabs to kmemcg ARM: - Optimized arch timer handling for KVM/ARM - Improvements to the VGIC ITS code and introduction of an ITS reset ioctl - Unification of the 32-bit fault injection logic - More exact external abort matching logic PPC: - Support for running hashed page table (HPT) MMU mode on a host that is using the radix MMU mode; single threaded mode on POWER 9 is added as a pre-requisite - Resolution of merge conflicts with the last second 4.14 HPT fixes - Fixes and cleanups s390: - Some initial preparation patches for exitless interrupts and crypto - New capability for AIS migration - Fixes x86: - Improved emulation of LAPIC timer mode changes, MCi_STATUS MSRs, and after-reset state - Refined dependencies for VMX features - Fixes for nested SMI injection - A lot of cleanups" * tag 'kvm-4.15-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (89 commits) KVM: s390: provide a capability for AIS state migration KVM: s390: clear_io_irq() requests are not expected for adapter interrupts KVM: s390: abstract conversion between isc and enum irq_types KVM: s390: vsie: use common code functions for pinning KVM: s390: SIE considerations for AP Queue virtualization KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup KVM: PPC: Book3S HV: Cosmetic post-merge cleanups KVM: arm/arm64: fix the incompatible matching for external abort KVM: arm/arm64: Unify 32bit fault injection KVM: arm/arm64: vgic-its: Implement KVM_DEV_ARM_ITS_CTRL_RESET KVM: arm/arm64: Document KVM_DEV_ARM_ITS_CTRL_RESET KVM: arm/arm64: vgic-its: Free caches when GITS_BASER Valid bit is cleared KVM: arm/arm64: vgic-its: New helper functions to free the caches KVM: arm/arm64: vgic-its: Remove kvm_its_unmap_device arm/arm64: KVM: Load the timer state when enabling the timer KVM: arm/arm64: Rework kvm_timer_should_fire KVM: arm/arm64: Get rid of kvm_timer_flush_hwstate KVM: arm/arm64: Avoid phys timer emulation in vcpu entry/exit KVM: arm/arm64: Move phys_timer_emulate function KVM: arm/arm64: Use kvm_arm_timer_set/get_reg for guest register traps ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/kvm_asm.h2
-rw-r--r--arch/arm/include/asm/kvm_emulate.h38
-rw-r--r--arch/arm/include/asm/kvm_hyp.h4
-rw-r--r--arch/arm/include/uapi/asm/kvm.h7
-rw-r--r--arch/arm/kvm/emulate.c137
-rw-r--r--arch/arm/kvm/hyp/switch.c7
-rw-r--r--arch/arm64/include/asm/arch_timer.h8
-rw-r--r--arch/arm64/include/asm/kvm_asm.h2
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h5
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h4
-rw-r--r--arch/arm64/include/asm/timex.h2
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h7
-rw-r--r--arch/arm64/kvm/hyp/switch.c6
-rw-r--r--arch/arm64/kvm/inject_fault.c88
-rw-r--r--arch/arm64/kvm/sys_regs.c41
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h3
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h140
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h13
-rw-r--r--arch/powerpc/include/asm/kvm_host.h6
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h3
-rw-r--r--arch/powerpc/kernel/asm-offsets.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c128
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c51
-rw-r--r--arch/powerpc/kvm/book3s_64_slb.S2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c347
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c117
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c65
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S197
-rw-r--r--arch/powerpc/kvm/book3s_pr.c16
-rw-r--r--arch/powerpc/kvm/book3s_pr_papr.c2
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c2
-rw-r--r--arch/powerpc/kvm/powerpc.c3
-rw-r--r--arch/s390/include/asm/kvm_host.h25
-rw-r--r--arch/s390/kvm/interrupt.c26
-rw-r--r--arch/s390/kvm/kvm-s390.c1
-rw-r--r--arch/s390/kvm/vsie.c50
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h8
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/kvm/emulate.c9
-rw-r--r--arch/x86/kvm/lapic.c91
-rw-r--r--arch/x86/kvm/mmu.c115
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/svm.c241
-rw-r--r--arch/x86/kvm/vmx.c208
-rw-r--r--arch/x86/kvm/x86.c94
47 files changed, 1474 insertions, 877 deletions
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 14d68a4d826f..36dd2962a42d 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -68,6 +68,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
+extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern void __init_stage2_translation(void);
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 98089ffd91bb..3d22eb87f919 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -25,7 +25,22 @@
#include <asm/kvm_arm.h>
#include <asm/cputype.h>
+/* arm64 compatibility macros */
+#define COMPAT_PSR_MODE_ABT ABT_MODE
+#define COMPAT_PSR_MODE_UND UND_MODE
+#define COMPAT_PSR_T_BIT PSR_T_BIT
+#define COMPAT_PSR_I_BIT PSR_I_BIT
+#define COMPAT_PSR_A_BIT PSR_A_BIT
+#define COMPAT_PSR_E_BIT PSR_E_BIT
+#define COMPAT_PSR_IT_MASK PSR_IT_MASK
+
unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num);
+
+static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num)
+{
+ return vcpu_reg(vcpu, reg_num);
+}
+
unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu);
static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu,
@@ -42,10 +57,25 @@ static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr);
-void kvm_inject_undefined(struct kvm_vcpu *vcpu);
+void kvm_inject_undef32(struct kvm_vcpu *vcpu);
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+
+static inline void kvm_inject_undefined(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_undef32(vcpu);
+}
+
+static inline void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+ kvm_inject_dabt32(vcpu, addr);
+}
+
+static inline void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+ kvm_inject_pabt32(vcpu, addr);
+}
static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
{
@@ -203,7 +233,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu)
{
- switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+ switch (kvm_vcpu_trap_get_fault(vcpu)) {
case FSC_SEA:
case FSC_SEA_TTW0:
case FSC_SEA_TTW1:
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 14b5903f0224..ab20ffa8b9e7 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -98,8 +98,8 @@
#define cntvoff_el2 CNTVOFF
#define cnthctl_el2 CNTHCTL
-void __timer_save_state(struct kvm_vcpu *vcpu);
-void __timer_restore_state(struct kvm_vcpu *vcpu);
+void __timer_enable_traps(struct kvm_vcpu *vcpu);
+void __timer_disable_traps(struct kvm_vcpu *vcpu);
void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 1f57bbe82b6f..6edd177bb1c7 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -152,6 +152,12 @@ struct kvm_arch_memory_slot {
(__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)
+/* PL1 Physical Timer Registers */
+#define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14)
+#define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14)
+
+/* Virtual Timer Registers */
#define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1)
#define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14)
#define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14)
@@ -216,6 +222,7 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_ITS_SAVE_TABLES 1
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
+#define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_TYPE_SHIFT 24
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index 30a13647c54c..cdff963f133a 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -165,143 +165,6 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
* Inject exceptions into the guest
*/
-static u32 exc_vector_base(struct kvm_vcpu *vcpu)
-{
- u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
- u32 vbar = vcpu_cp15(vcpu, c12_VBAR);
-
- if (sctlr & SCTLR_V)
- return 0xffff0000;
- else /* always have security exceptions */
- return vbar;
-}
-
-/*
- * Switch to an exception mode, updating both CPSR and SPSR. Follow
- * the logic described in AArch32.EnterMode() from the ARMv8 ARM.
- */
-static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode)
-{
- unsigned long cpsr = *vcpu_cpsr(vcpu);
- u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
- *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode;
-
- switch (mode) {
- case FIQ_MODE:
- *vcpu_cpsr(vcpu) |= PSR_F_BIT;
- /* Fall through */
- case ABT_MODE:
- case IRQ_MODE:
- *vcpu_cpsr(vcpu) |= PSR_A_BIT;
- /* Fall through */
- default:
- *vcpu_cpsr(vcpu) |= PSR_I_BIT;
- }
-
- *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
- if (sctlr & SCTLR_TE)
- *vcpu_cpsr(vcpu) |= PSR_T_BIT;
- if (sctlr & SCTLR_EE)
- *vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
- /* Note: These now point to the mode banked copies */
- *vcpu_spsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_inject_undefined - inject an undefined exception into the guest
- * @vcpu: The VCPU to receive the undefined exception
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- *
- * Modelled after TakeUndefInstrException() pseudocode.
- */
-void kvm_inject_undefined(struct kvm_vcpu *vcpu)
-{
- unsigned long cpsr = *vcpu_cpsr(vcpu);
- bool is_thumb = (cpsr & PSR_T_BIT);
- u32 vect_offset = 4;
- u32 return_offset = (is_thumb) ? 2 : 4;
-
- kvm_update_psr(vcpu, UND_MODE);
- *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
- /* Branch to exception vector */
- *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
-{
- u32 vect_offset;
- u32 return_offset = (is_pabt) ? 4 : 8;
- bool is_lpae;
-
- kvm_update_psr(vcpu, ABT_MODE);
- *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
- if (is_pabt)
- vect_offset = 12;
- else
- vect_offset = 16;
-
- /* Branch to exception vector */
- *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
-
- if (is_pabt) {
- /* Set IFAR and IFSR */
- vcpu_cp15(vcpu, c6_IFAR) = addr;
- is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
- /* Always give debug fault for now - should give guest a clue */
- if (is_lpae)
- vcpu_cp15(vcpu, c5_IFSR) = 1 << 9 | 0x22;
- else
- vcpu_cp15(vcpu, c5_IFSR) = 2;
- } else { /* !iabt */
- /* Set DFAR and DFSR */
- vcpu_cp15(vcpu, c6_DFAR) = addr;
- is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
- /* Always give debug fault for now - should give guest a clue */
- if (is_lpae)
- vcpu_cp15(vcpu, c5_DFSR) = 1 << 9 | 0x22;
- else
- vcpu_cp15(vcpu, c5_DFSR) = 2;
- }
-
-}
-
-/**
- * kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- inject_abt(vcpu, false, addr);
-}
-
-/**
- * kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
- * @addr: The address to report in the DFAR
- *
- * It is assumed that this code is called from the VCPU thread and that the
- * VCPU therefore is not currently executing guest code.
- */
-void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
-{
- inject_abt(vcpu, true, addr);
-}
-
/**
* kvm_inject_vabt - inject an async abort / SError into the guest
* @vcpu: The VCPU to receive the exception
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index ebd2dd46adf7..330c9ce34ba5 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -174,7 +174,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__activate_vm(vcpu);
__vgic_restore_state(vcpu);
- __timer_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
__sysreg_restore_state(guest_ctxt);
__banked_restore_state(guest_ctxt);
@@ -191,7 +191,8 @@ again:
__banked_save_state(guest_ctxt);
__sysreg_save_state(guest_ctxt);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
+
__vgic_save_state(vcpu);
__deactivate_traps(vcpu);
@@ -237,7 +238,7 @@ void __hyp_text __noreturn __hyp_panic(int cause)
vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR);
host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
__banked_restore_state(host_ctxt);
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index bdedd8f748d1..f2a234d6516c 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -52,6 +52,7 @@ struct arch_timer_erratum_workaround {
const char *desc;
u32 (*read_cntp_tval_el0)(void);
u32 (*read_cntv_tval_el0)(void);
+ u64 (*read_cntpct_el0)(void);
u64 (*read_cntvct_el0)(void);
int (*set_next_event_phys)(unsigned long, struct clock_event_device *);
int (*set_next_event_virt)(unsigned long, struct clock_event_device *);
@@ -149,11 +150,8 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl)
static inline u64 arch_counter_get_cntpct(void)
{
- /*
- * AArch64 kernel and user space mandate the use of CNTVCT.
- */
- BUG();
- return 0;
+ isb();
+ return arch_timer_reg_read_stable(cntpct_el0);
}
static inline u64 arch_counter_get_cntvct(void)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 26a64d0f9ab9..ab4d0a926043 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -55,6 +55,8 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
+extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern u64 __vgic_v3_get_ich_vtr_el2(void);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index e5df3fce0008..5f28dfa14cee 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -41,6 +41,9 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_undef32(struct kvm_vcpu *vcpu);
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr);
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr);
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
@@ -237,7 +240,7 @@ static inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu)
{
- switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+ switch (kvm_vcpu_trap_get_fault(vcpu)) {
case FSC_SEA:
case FSC_SEA_TTW0:
case FSC_SEA_TTW1:
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 4572a9b560fa..08d3bb66c8b7 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -129,8 +129,8 @@ void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
-void __timer_save_state(struct kvm_vcpu *vcpu);
-void __timer_restore_state(struct kvm_vcpu *vcpu);
+void __timer_enable_traps(struct kvm_vcpu *vcpu);
+void __timer_disable_traps(struct kvm_vcpu *vcpu);
void __sysreg_save_host_state(struct kvm_cpu_context *ctxt);
void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt);
diff --git a/arch/arm64/include/asm/timex.h b/arch/arm64/include/asm/timex.h
index 81a076eb37fa..9ad60bae5c8d 100644
--- a/arch/arm64/include/asm/timex.h
+++ b/arch/arm64/include/asm/timex.h
@@ -22,7 +22,7 @@
* Use the current timer as a cycle counter since this is what we use for
* the delay loop.
*/
-#define get_cycles() arch_counter_get_cntvct()
+#define get_cycles() arch_timer_read_counter()
#include <asm-generic/timex.h>
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 51149ec75fe4..9abbf3044654 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -196,6 +196,12 @@ struct kvm_arch_memory_slot {
#define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)
+/* Physical Timer EL0 Registers */
+#define KVM_REG_ARM_PTIMER_CTL ARM64_SYS_REG(3, 3, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2)
+#define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1)
+
+/* EL0 Virtual Timer Registers */
#define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1)
#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2)
#define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2)
@@ -228,6 +234,7 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_ITS_SAVE_TABLES 1
#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2
#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3
+#define KVM_DEV_ARM_ITS_CTRL_RESET 4
/* Device Control API on vcpu fd */
#define KVM_ARM_VCPU_PMU_V3_CTRL 0
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 951f3ebaff26..525c01f48867 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -304,7 +304,7 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__activate_vm(vcpu);
__vgic_restore_state(vcpu);
- __timer_restore_state(vcpu);
+ __timer_enable_traps(vcpu);
/*
* We must restore the 32-bit state before the sysregs, thanks
@@ -374,7 +374,7 @@ again:
__sysreg_save_guest_state(guest_ctxt);
__sysreg32_save_state(vcpu);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
__vgic_save_state(vcpu);
__deactivate_traps(vcpu);
@@ -442,7 +442,7 @@ void __hyp_text __noreturn __hyp_panic(void)
vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);
host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
- __timer_save_state(vcpu);
+ __timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
__deactivate_vm(vcpu);
__sysreg_restore_host_state(host_ctxt);
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 3556715a774e..8ecbcb40e317 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -33,88 +33,6 @@
#define LOWER_EL_AArch64_VECTOR 0x400
#define LOWER_EL_AArch32_VECTOR 0x600
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
- [0] = { 0, 0 }, /* Reset, unused */
- [1] = { 4, 2 }, /* Undefined */
- [2] = { 0, 0 }, /* SVC, unused */
- [3] = { 4, 4 }, /* Prefetch abort */
- [4] = { 8, 8 }, /* Data abort */
- [5] = { 0, 0 }, /* HVC, unused */
- [6] = { 4, 4 }, /* IRQ, unused */
- [7] = { 4, 4 }, /* FIQ, unused */
-};
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
- unsigned long cpsr;
- unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
- bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT);
- u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
- u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
- cpsr = mode | COMPAT_PSR_I_BIT;
-
- if (sctlr & (1 << 30))
- cpsr |= COMPAT_PSR_T_BIT;
- if (sctlr & (1 << 25))
- cpsr |= COMPAT_PSR_E_BIT;
-
- *vcpu_cpsr(vcpu) = cpsr;
-
- /* Note: These now point to the banked copies */
- *vcpu_spsr(vcpu) = new_spsr_value;
- *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
- /* Branch to exception vector */
- if (sctlr & (1 << 13))
- vect_offset += 0xffff0000;
- else /* always have security exceptions */
- vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
- *vcpu_pc(vcpu) = vect_offset;
-}
-
-static void inject_undef32(struct kvm_vcpu *vcpu)
-{
- prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
- unsigned long addr)
-{
- u32 vect_offset;
- u32 *far, *fsr;
- bool is_lpae;
-
- if (is_pabt) {
- vect_offset = 12;
- far = &vcpu_cp15(vcpu, c6_IFAR);
- fsr = &vcpu_cp15(vcpu, c5_IFSR);
- } else { /* !iabt */
- vect_offset = 16;
- far = &vcpu_cp15(vcpu, c6_DFAR);
- fsr = &vcpu_cp15(vcpu, c5_DFSR);
- }
-
- prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset);
-
- *far = addr;
-
- /* Give the guest an IMPLEMENTATION DEFINED exception */
- is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
- if (is_lpae)
- *fsr = 1 << 9 | 0x34;
- else
- *fsr = 0x14;
-}
-
enum exception_type {
except_type_sync = 0,
except_type_irq = 0x80,
@@ -211,7 +129,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
if (!(vcpu->arch.hcr_el2 & HCR_RW))
- inject_abt32(vcpu, false, addr);
+ kvm_inject_dabt32(vcpu, addr);
else
inject_abt64(vcpu, false, addr);
}
@@ -227,7 +145,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
if (!(vcpu->arch.hcr_el2 & HCR_RW))
- inject_abt32(vcpu, true, addr);
+ kvm_inject_pabt32(vcpu, addr);
else
inject_abt64(vcpu, true, addr);
}
@@ -241,7 +159,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.hcr_el2 & HCR_RW))
- inject_undef32(vcpu);
+ kvm_inject_undef32(vcpu);
else
inject_undef64(vcpu);
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a0ee9b05e3d4..1830ebc227d1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -842,13 +842,16 @@ static bool access_cntp_tval(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
u64 now = kvm_phys_timer_read();
+ u64 cval;
- if (p->is_write)
- ptimer->cnt_cval = p->regval + now;
- else
- p->regval = ptimer->cnt_cval - now;
+ if (p->is_write) {
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL,
+ p->regval + now);
+ } else {
+ cval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
+ p->regval = cval - now;
+ }
return true;
}
@@ -857,24 +860,10 @@ static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
- if (p->is_write) {
- /* ISTATUS bit is read-only */
- ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
- } else {
- u64 now = kvm_phys_timer_read();
-
- p->regval = ptimer->cnt_ctl;
- /*
- * Set ISTATUS bit if it's expired.
- * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
- * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
- * regardless of ENABLE bit for our implementation convenience.
- */
- if (ptimer->cnt_cval <= now)
- p->regval |= ARCH_TIMER_CTRL_IT_STAT;
- }
+ if (p->is_write)
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CTL, p->regval);
+ else
+ p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CTL);
return true;
}
@@ -883,12 +872,10 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
if (p->is_write)
- ptimer->cnt_cval = p->regval;
+ kvm_arm_timer_set_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL, p->regval);
else
- p->regval = ptimer->cnt_cval;
+ p->regval = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_PTIMER_CVAL);
return true;
}
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b8d5b8e35244..9a667007bff8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
bool writing, bool *writable);
extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
unsigned long *rmap, long pte_index, int realmode);
-extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
+extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+ unsigned long gfn, unsigned long psize);
extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index);
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d55c7f881ce7..735cfa35298a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_64_H__
#define __ASM_KVM_BOOK3S_64_H__
+#include <linux/string.h>
+#include <asm/bitops.h>
#include <asm/book3s/64/mmu-hash.h>
/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
@@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
hpte[0] = cpu_to_be64(hpte_v);
}
+/*
+ * These functions encode knowledge of the POWER7/8/9 hardware
+ * interpretations of the HPTE LP (large page size) field.
+ */
+static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
+{
+ unsigned int lphi;
+
+ if (!(h & HPTE_V_LARGE))
+ return 12; /* 4kB */
+ lphi = (l >> 16) & 0xf;
+ switch ((l >> 12) & 0xf) {
+ case 0:
+ return !lphi ? 24 : -1; /* 16MB */
+ break;
+ case 1:
+ return 16; /* 64kB */
+ break;
+ case 3:
+ return !lphi ? 34 : -1; /* 16GB */
+ break;
+ case 7:
+ return (16 << 8) + 12; /* 64kB in 4kB */
+ break;
+ case 8:
+ if (!lphi)
+ return (24 << 8) + 16; /* 16MB in 64kkB */
+ if (lphi == 3)
+ return (24 << 8) + 12; /* 16MB in 4kB */
+ break;
+ }
+ return -1;
+}
+
+static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
+{
+ return kvmppc_hpte_page_shifts(h, l) & 0xff;
+}
+
+static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l)
+{
+ int tmp = kvmppc_hpte_page_shifts(h, l);
+
+ if (tmp >= 0x100)
+ tmp >>= 8;
+ return tmp;
+}
+
+static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
+{
+ return 1ul << kvmppc_hpte_actual_page_shift(v, r);
+}
+
+static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
+{
+ switch (base_shift) {
+ case 12:
+ switch (actual_shift) {
+ case 12:
+ return 0;
+ case 16:
+ return 7;
+ case 24:
+ return 0x38;
+ }
+ break;
+ case 16:
+ switch (actual_shift) {
+ case 16:
+ return 1;
+ case 24:
+ return 8;
+ }
+ break;
+ case 24:
+ return 0;
+ }
+ return -1;
+}
+
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
unsigned long pte_index)
{
- int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
- unsigned int penc;
+ int a_pgshift, b_pgshift;
unsigned long rb = 0, va_low, sllp;
- unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
- if (v & HPTE_V_LARGE) {
- i = hpte_page_sizes[lp];
- b_psize = i & 0xf;
- a_psize = i >> 4;
+ b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r);
+ if (a_pgshift >= 0x100) {
+ b_pgshift &= 0xff;
+ a_pgshift >>= 8;
}
/*
@@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
va_low ^= v >> (SID_SHIFT_1T - 16);
va_low &= 0x7ff;
- switch (b_psize) {
- case MMU_PAGE_4K:
- sllp = get_sllp_encoding(a_psize);
- rb |= sllp << 5; /* AP field */
+ if (b_pgshift == 12) {
+ if (a_pgshift > 12) {
+ sllp = (a_pgshift == 16) ? 5 : 4;
+ rb |= sllp << 5; /* AP field */
+ }
rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */
- break;
- default:
- {
+ } else {
int aval_shift;
/*
* remaining bits of AVA/LP fields
* Also contain the rr bits of LP
*/
- rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000;
+ rb |= (va_low << b_pgshift) & 0x7ff000;
/*
* Now clear not needed LP bits based on actual psize
*/
- rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
+ rb &= ~((1ul << a_pgshift) - 1);
/*
* AVAL field 58..77 - base_page_shift bits of va
* we have space for 58..64 bits, Missing bits should
* be zero filled. +1 is to take care of L bit shift
*/
- aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
+ aval_shift = 64 - (77 - b_pgshift) + 1;
rb |= ((va_low << aval_shift) & 0xfe);
rb |= 1; /* L field */
- penc = mmu_psize_defs[b_psize].penc[a_psize];
- rb |= penc << 12; /* LP field */
- break;
- }
+ rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
}
rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */
return rb;
@@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
return (1UL << (hpt->order - 7)) - 1;
}
+/* Set bits in a dirty bitmap, which is in LE format */
+static inline void set_dirty_bits(unsigned long *map, unsigned long i,
+ unsigned long npages)
+{
+
+ if (npages >= 8)
+ memset((char *)map + i / 8, 0xff, npages / 8);
+ else
+ for (; npages; ++i, --npages)
+ __set_bit_le(i, map);
+}
+
+static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i,
+ unsigned long npages)
+{
+ if (npages >= 8)
+ memset((char *)map + i / 8, 0xff, npages / 8);
+ else
+ for (; npages; ++i, --npages)
+ set_bit_le(i, map);
+}
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 7cea76f11c26..ab386af2904f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,6 +82,16 @@ struct kvm_split_mode {
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
struct kvmppc_vcore *vc[MAX_SUBCORES];
+ /* Bits for changing lpcr on P9 */
+ unsigned long lpcr_req;
+ unsigned long lpidr_req;
+ unsigned long host_lpcr;
+ u32 do_set;
+ u32 do_restore;
+ union {
+ u32 allphases;
+ u8 phase[4];
+ } lpcr_sync;
};
/*
@@ -107,7 +117,8 @@ struct kvmppc_host_state {
u8 hwthread_req;
u8 hwthread_state;
u8 host_ipi;
- u8 ptid;
+ u8 ptid; /* thread number within subcore when split */
+ u8 tid; /* thread number within whole core */
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
void __iomem *xics_phys;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e372ed871c51..3aa5b577cd60 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -235,10 +235,7 @@ struct revmap_entry {
*/
#define KVMPPC_RMAP_LOCK_BIT 63
#define KVMPPC_RMAP_RC_SHIFT 32
-#define KVMPPC_RMAP_CHG_SHIFT 48
#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
#define KVMPPC_RMAP_PRESENT 0x100000000ul
#define KVMPPC_RMAP_INDEX 0xfffffffful
@@ -276,7 +273,7 @@ struct kvm_arch {
int tlbie_lock;
unsigned long lpcr;
unsigned long vrma_slb_v;
- int hpte_setup_done;
+ int mmu_ready;
atomic_t vcpus_running;
u32 online_vcores;
atomic_t hpte_mod_interest;
@@ -284,6 +281,7 @@ struct kvm_arch {
cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
+ bool threads_indep;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ba5fadd6f3c9..96753f3aac6d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
+extern void kvmppc_rmap_reset(struct kvm *kvm);
extern long kvmppc_prepare_vrma(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
struct iommu_group *grp);
extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
struct iommu_group *grp);
+extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);
+extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm);
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 9aace433491a..6b958414b4e0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -642,6 +642,7 @@ int main(void)
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
+ HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+ OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+ OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 59247af5fd45..235319c2574e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -73,8 +73,6 @@ struct kvm_resize_hpt {
struct kvm_hpt_info hpt;
};
-static void kvmppc_rmap_reset(struct kvm *kvm);
-
int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
@@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
/* Allocate reverse map array */
rev = vmalloc(sizeof(struct revmap_entry) * npte);
if (!rev) {
- pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
if (cma)
kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
else
@@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
long err = -EBUSY;
struct kvm_hpt_info info;
- if (kvm_is_radix(kvm))
- return -EINVAL;
-
mutex_lock(&kvm->lock);
- if (kvm->arch.hpte_setup_done) {
- kvm->arch.hpte_setup_done = 0;
- /* order hpte_setup_done vs. vcpus_running */
+ if (kvm->arch.mmu_ready) {
+ kvm->arch.mmu_ready = 0;
+ /* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
goto out;
}
}
+ if (kvm_is_radix(kvm)) {
+ err = kvmppc_switch_mmu_to_hpt(kvm);
+ if (err)
+ goto out;
+ }
+
if (kvm->arch.hpt.order == order) {
/* We already have a suitable HPT */
@@ -183,6 +183,7 @@ out:
void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
vfree(info->rev);
+ info->rev = NULL;
if (info->cma)
kvm_free_hpt_cma(virt_to_page(info->virt),
1 << (info->order - PAGE_SHIFT));
@@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
{
unsigned long ra_mask;
- ra_mask = hpte_page_size(v, r) - 1;
+ ra_mask = kvmppc_actual_pgsz(v, r) - 1;
return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}
@@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
int index;
int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
+ if (kvm_is_radix(vcpu->kvm))
+ return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
+
/* Get SLB entry */
if (virtmode) {
slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
mmio_update = atomic64_read(&kvm->arch.mmio_update);
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
r = vcpu->arch.pgfault_cache->rpte;
- psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
+ psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
+ r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
@@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return RESUME_GUEST;
/* Translate the logical address and get the page */
- psize = hpte_page_size(hpte[0], r);
+ psize = kvmppc_actual_pgsz(hpte[0], r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
@@ -650,10 +655,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/*
* If the HPT is being resized, don't update the HPTE,
* instead let the guest retry after the resize operation is complete.
- * The synchronization for hpte_setup_done test vs. set is provided
+ * The synchronization for mmu_ready test vs. set is provided
* by the HPTE lock.
*/
- if (!kvm->arch.hpte_setup_done)
+ if (!kvm->arch.mmu_ready)
goto out_unlock;
if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
@@ -720,7 +725,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
-static void kvmppc_rmap_reset(struct kvm *kvm)
+void kvmppc_rmap_reset(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -786,6 +791,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+ struct kvm_memory_slot *memslot,
unsigned long *rmapp, unsigned long gfn)
{
__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
@@ -808,7 +814,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Now check and modify the HPTE */
ptel = rev[i].guest_rpte;
- psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+ psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
hpte_rpn(ptel, psize) == gfn) {
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
@@ -817,8 +823,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Harvest R and C */
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
- if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmapp, psize);
+ if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot, gfn, psize);
if (rcbits & ~rev[i].guest_rpte) {
rev[i].guest_rpte = ptel | rcbits;
note_hpte_modification(kvm, &rev[i]);
@@ -856,7 +862,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
continue;
}
- kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
+ kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
@@ -1039,14 +1045,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
retry:
lock_rmap(rmapp);
- if (*rmapp & KVMPPC_RMAP_CHANGED) {
- long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
- >> KVMPPC_RMAP_CHG_SHIFT;
- *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
- npages_dirty = 1;
- if (change_order > PAGE_SHIFT)
- npages_dirty = 1ul << (change_order - PAGE_SHIFT);
- }
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
unlock_rmap(rmapp);
return npages_dirty;
@@ -1102,7 +1100,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
- n = hpte_page_size(v, r);
+ n = kvmppc_actual_pgsz(v, r);
n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (n > npages_dirty)
npages_dirty = n;
@@ -1138,7 +1136,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
- unsigned long i, j;
+ unsigned long i;
unsigned long *rmapp;
preempt_disable();
@@ -1150,9 +1148,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
* since we always put huge-page HPTEs in the rmap chain
* corresponding to their page base address.
*/
- if (npages && map)
- for (j = i; npages; ++j, --npages)
- __set_bit_le(j, map);
+ if (npages)
+ set_dirty_bits(map, i, npages);
++rmapp;
}
preempt_enable();
@@ -1196,7 +1193,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
struct page *page = virt_to_page(va);
struct kvm_memory_slot *memslot;
unsigned long gfn;
- unsigned long *rmap;
int srcu_idx;
put_page(page);
@@ -1204,20 +1200,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
if (!dirty)
return;
- /* We need to mark this page dirty in the rmap chain */
+ /* We need to mark this page dirty in the memslot dirty_bitmap, if any */
gfn = gpa >> PAGE_SHIFT;
srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, gfn);
- if (memslot) {
- if (!kvm_is_radix(kvm)) {
- rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
- lock_rmap(rmap);
- *rmap |= KVMPPC_RMAP_CHANGED;
- unlock_rmap(rmap);
- } else if (memslot->dirty_bitmap) {
- mark_page_dirty(kvm, gfn);
- }
- }
+ if (memslot && memslot->dirty_bitmap)
+ set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
@@ -1277,7 +1265,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
guest_rpte = rev->guest_rpte;
ret = -EIO;
- apsize = hpte_page_size(vpte, guest_rpte);
+ apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
if (!apsize)
goto out;
@@ -1292,7 +1280,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmapp);
- kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+ kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
unlock_rmap(rmapp);
}
@@ -1465,7 +1453,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
struct kvm_resize_hpt *resize;
int ret;
- if (flags != 0)
+ if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
@@ -1531,7 +1519,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
struct kvm_resize_hpt *resize;
long ret;
- if (flags != 0)
+ if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
@@ -1543,15 +1531,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
/* This shouldn't be possible */
ret = -EIO;
- if (WARN_ON(!kvm->arch.hpte_setup_done))
+ if (WARN_ON(!kvm->arch.mmu_ready))
goto out_no_hpt;
/* Stop VCPUs from running while we mess with the HPT */
- kvm->arch.hpte_setup_done = 0;
+ kvm->arch.mmu_ready = 0;
smp_mb();
/* Boot all CPUs out of the guest so they re-read
- * hpte_setup_done */
+ * mmu_ready */
on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
ret = -ENXIO;
@@ -1574,7 +1562,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
out:
/* Let VCPUs run again */
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
smp_mb();
out_no_hpt:
resize_hpt_release(kvm, resize);
@@ -1717,6 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
+ if (kvm_is_radix(kvm))
+ return 0;
first_pass = ctx->first_pass;
flags = ctx->flags;
@@ -1810,20 +1800,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
- int hpte_setup;
+ int mmu_ready;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
+ if (kvm_is_radix(kvm))
+ return -EINVAL;
/* lock out vcpus from running while we're doing this */
mutex_lock(&kvm->lock);
- hpte_setup = kvm->arch.hpte_setup_done;
- if (hpte_setup) {
- kvm->arch.hpte_setup_done = 0; /* temporarily */
- /* order hpte_setup_done vs. vcpus_running */
+ mmu_ready = kvm->arch.mmu_ready;
+ if (mmu_ready) {
+ kvm->arch.mmu_ready = 0; /* temporarily */
+ /* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.hpte_setup_done = 1;
+ kvm->arch.mmu_ready = 1;
mutex_unlock(&kvm->lock);
return -EBUSY;
}
@@ -1876,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
"r=%lx\n", ret, i, v, r);
goto out;
}
- if (!hpte_setup && is_vrma_hpte(v)) {
+ if (!mmu_ready && is_vrma_hpte(v)) {
unsigned long psize = hpte_base_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
@@ -1885,7 +1877,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
(VRMA_VSID << SLB_VSID_SHIFT_1T);
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- hpte_setup = 1;
+ mmu_ready = 1;
}
++i;
hptp += 2;
@@ -1901,9 +1893,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
}
out:
- /* Order HPTE updates vs. hpte_setup_done */
+ /* Order HPTE updates vs. mmu_ready */
smp_wmb();
- kvm->arch.hpte_setup_done = hpte_setup;
+ kvm->arch.mmu_ready = mmu_ready;
mutex_unlock(&kvm->lock);
if (err)
@@ -2012,6 +2004,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
struct kvm *kvm;
__be64 *hptp;
+ kvm = p->kvm;
+ if (kvm_is_radix(kvm))
+ return 0;
+
ret = mutex_lock_interruptible(&p->mutex);
if (ret)
return ret;
@@ -2034,7 +2030,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
}
}
- kvm = p->kvm;
i = p->hpt_index;
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
@@ -2109,10 +2104,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
- if (kvm_is_radix(vcpu->kvm))
- mmu->xlate = kvmppc_mmu_radix_xlate;
- else
- mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+ mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index c5d7435455f1..58618f644c56 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return ret;
}
-static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long gfn, unsigned int order)
-{
- unsigned long i, limit;
- unsigned long *dp;
-
- if (!memslot->dirty_bitmap)
- return;
- limit = 1ul << order;
- if (limit < BITS_PER_LONG) {
- for (i = 0; i < limit; ++i)
- mark_page_dirty(kvm, gfn + i);
- return;
- }
- dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
- limit /= BITS_PER_LONG;
- for (i = 0; i < limit; ++i)
- *dp++ = ~0ul;
-}
-
/* Called with kvm->lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
@@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
- if (old & _PAGE_DIRTY) {
- if (!shift)
- mark_page_dirty(kvm, gfn);
- else
- mark_pages_dirty(kvm, memslot,
- gfn, shift - PAGE_SHIFT);
+ if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
+ unsigned long npages = 1;
+ if (shift)
+ npages = 1ul << (shift - PAGE_SHIFT);
+ kvmppc_update_dirty_map(memslot, gfn, npages);
}
}
return 0;
@@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
unsigned long i, j;
- unsigned long n, *p;
int npages;
- /*
- * Radix accumulates dirty bits in the first half of the
- * memslot's dirty_bitmap area, for when pages are paged
- * out or modified by the host directly. Pick up these
- * bits and add them to the map.
- */
- n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
- p = memslot->dirty_bitmap;
- for (i = 0; i < n; ++i)
- map[i] |= xchg(&p[i], 0);
-
for (i = 0; i < memslot->npages; i = j) {
npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
@@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
* real address, if npages > 1 we can skip to i + npages.
*/
j = i + 1;
- if (npages)
- for (j = i; npages; ++j, --npages)
- __set_bit_le(j, map);
+ if (npages) {
+ set_dirty_bits(map, i, npages);
+ i = j + npages;
+ }
}
return 0;
}
@@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
pgd_clear(pgd);
}
pgd_free(kvm->mm, kvm->arch.pgtable);
+ kvm->arch.pgtable = NULL;
}
static void pte_ctor(void *addr)
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 3589c4e3d49b..688722acd692 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -113,7 +113,7 @@ slb_do_enter:
/* Remove all SLB entries that are in use. */
- li r0, r0
+ li r0, 0
slbmte r0, r0
slbia
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 40e5857c4b1c..79ea3d9269db 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -19,6 +19,7 @@
*/
#include <linux/kvm_host.h>
+#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
@@ -98,6 +99,10 @@ static int target_smt_mode;
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
+static bool indep_threads_mode = true;
+module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
+
#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
.set = param_set_int,
@@ -115,6 +120,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static void kvmppc_setup_partition_table(struct kvm *kvm);
static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
int *ip)
@@ -1734,9 +1740,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
* MMU mode (radix or HPT), unfortunately, but since we only support
* HPT guests on a HPT host so far, that isn't an impediment yet.
*/
-static int threads_per_vcore(void)
+static int threads_per_vcore(struct kvm *kvm)
{
- if (cpu_has_feature(CPU_FTR_ARCH_300))
+ if (kvm->arch.threads_indep)
return 1;
return threads_per_subcore;
}
@@ -1774,7 +1780,7 @@ static struct debugfs_timings_element {
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
};
-#define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
+#define N_TIMINGS (ARRAY_SIZE(timings))
struct debugfs_timings_state {
struct kvm_vcpu *vcpu;
@@ -2228,11 +2234,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
}
-static void kvmppc_wait_for_nap(void)
+static void kvmppc_wait_for_nap(int n_threads)
{
int cpu = smp_processor_id();
int i, loops;
- int n_threads = threads_per_vcore();
if (n_threads <= 1)
return;
@@ -2319,7 +2324,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
- if (vc->num_threads < threads_per_vcore()) {
+ if (vc->num_threads < threads_per_vcore(vc->kvm)) {
spin_lock(&lp->lock);
list_add_tail(&vc->preempt_list, &lp->list);
spin_unlock(&lp->lock);
@@ -2357,7 +2362,7 @@ struct core_info {
/*
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
- * respectively in 2-way micro-threading (split-core) mode.
+ * respectively in 2-way micro-threading (split-core) mode on POWER8.
*/
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
@@ -2373,7 +2378,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
static bool subcore_config_ok(int n_subcores, int n_threads)
{
- /* Can only dynamically split if unsplit to begin with */
+ /*
+ * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
+ * mode, with one thread per subcore.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return n_subcores <= 4 && n_threads == 1;
+
+ /* On POWER8, can only dynamically split if unsplit to begin with */
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
return false;
if (n_subcores > MAX_SUBCORES)
@@ -2404,6 +2416,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false;
+ /* POWER9 currently requires all threads to be in the same MMU mode */
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+ return false;
+
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2632,6 +2649,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int target_threads;
int controlled_threads;
int trap;
+ bool is_power8;
+ bool hpt_on_radix;
/*
* Remove from the list any threads that have a signal pending
@@ -2654,15 +2673,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the number of threads per subcore, except on POWER9,
* where it's 1 because the threads are (mostly) independent.
*/
- controlled_threads = threads_per_vcore();
+ controlled_threads = threads_per_vcore(vc->kvm);
/*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
+ * On POWER9, we need to be not in independent-threads mode if
+ * this is a HPT guest on a radix host.
*/
- if ((controlled_threads > 1) &&
- ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+ hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+ if (((controlled_threads > 1) &&
+ ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+ (hpt_on_radix && vc->kvm->arch.threads_indep)) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
@@ -2699,14 +2722,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* Hard-disable interrupts, and check resched flag and signals.
* If we need to reschedule or deliver a signal, clean up
* and return without going into the guest(s).
- * If the hpte_setup_done flag has been cleared, don't go into the
+ * If the mmu_ready flag has been cleared, don't go into the
* guest because that means a HPT resize operation is in progress.
*/
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
- recheck_signals(&core_info) ||
- (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) {
+ recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
@@ -2728,32 +2750,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cmd_bit = stat_bit = 0;
split = core_info.n_subcores;
sip = NULL;
- if (split > 1) {
- /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
- if (split == 2 && (dynamic_mt_modes & 2)) {
- cmd_bit = HID0_POWER8_1TO2LPAR;
- stat_bit = HID0_POWER8_2LPARMODE;
- } else {
- split = 4;
- cmd_bit = HID0_POWER8_1TO4LPAR;
- stat_bit = HID0_POWER8_4LPARMODE;
- }
- subcore_size = MAX_SMT_THREADS / split;
+ is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
+ && !cpu_has_feature(CPU_FTR_ARCH_300);
+
+ if (split > 1 || hpt_on_radix) {
sip = &split_info;
memset(&split_info, 0, sizeof(split_info));
- split_info.rpr = mfspr(SPRN_RPR);
- split_info.pmmar = mfspr(SPRN_PMMAR);
- split_info.ldbar = mfspr(SPRN_LDBAR);
- split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
split_info.vc[sub] = core_info.vc[sub];
+
+ if (is_power8) {
+ if (split == 2 && (dynamic_mt_modes & 2)) {
+ cmd_bit = HID0_POWER8_1TO2LPAR;
+ stat_bit = HID0_POWER8_2LPARMODE;
+ } else {
+ split = 4;
+ cmd_bit = HID0_POWER8_1TO4LPAR;
+ stat_bit = HID0_POWER8_4LPARMODE;
+ }
+ subcore_size = MAX_SMT_THREADS / split;
+ split_info.rpr = mfspr(SPRN_RPR);
+ split_info.pmmar = mfspr(SPRN_PMMAR);
+ split_info.ldbar = mfspr(SPRN_LDBAR);
+ split_info.subcore_size = subcore_size;
+ } else {
+ split_info.subcore_size = 1;
+ if (hpt_on_radix) {
+ /* Use the split_info for LPCR/LPIDR changes */
+ split_info.lpcr_req = vc->lpcr;
+ split_info.lpidr_req = vc->kvm->arch.lpid;
+ split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+ split_info.do_set = 1;
+ }
+ }
+
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
- for (thr = 0; thr < controlled_threads; ++thr)
+
+ for (thr = 0; thr < controlled_threads; ++thr) {
+ paca[pcpu + thr].kvm_hstate.tid = thr;
+ paca[pcpu + thr].kvm_hstate.napping = 0;
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+ }
- /* Initiate micro-threading (split-core) if required */
+ /* Initiate micro-threading (split-core) on POWER8 if required */
if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
@@ -2772,7 +2813,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
- thr = subcore_thread_map[sub];
+ thr = is_power8 ? subcore_thread_map[sub] : sub;
thr0_done = false;
active |= 1 << thr;
pvc = core_info.vc[sub];
@@ -2799,18 +2840,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the vcore pointer in the PACA of the secondaries.
*/
smp_mb();
- if (cmd_bit)
- split_info.do_nap = 1; /* ask secondaries to nap when done */
/*
* When doing micro-threading, poke the inactive threads as well.
* This gets them to the nap instruction after kvm_do_nap,
* which reduces the time taken to unsplit later.
+ * For POWER9 HPT guest on radix host, we need all the secondary
+ * threads woken up so they can do the LPCR/LPIDR change.
*/
- if (split > 1)
+ if (cmd_bit || hpt_on_radix) {
+ split_info.do_nap = 1; /* ask secondaries to nap when done */
for (thr = 1; thr < threads_per_subcore; ++thr)
if (!(active & (1 << thr)))
kvmppc_ipi_thread(pcpu + thr);
+ }
vc->vcore_state = VCORE_RUNNING;
preempt_disable();
@@ -2844,10 +2887,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_EXITING;
/* wait for secondary threads to finish writing their state to memory */
- kvmppc_wait_for_nap();
+ kvmppc_wait_for_nap(controlled_threads);
/* Return to whole-core mode if we split the core earlier */
- if (split > 1) {
+ if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
unsigned long loops = 0;
@@ -2863,8 +2906,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpu_relax();
++loops;
}
- split_info.do_nap = 0;
+ } else if (hpt_on_radix) {
+ /* Wait for all threads to have seen final sync */
+ for (thr = 1; thr < controlled_threads; ++thr) {
+ while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+ HMT_low();
+ barrier();
+ }
+ HMT_medium();
+ }
}
+ split_info.do_nap = 0;
kvmppc_set_host_core(pcpu);
@@ -3073,6 +3125,25 @@ out:
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
+static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
+{
+ int r = 0;
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&kvm->lock);
+ if (!kvm->arch.mmu_ready) {
+ if (!kvm_is_radix(kvm))
+ r = kvmppc_hv_setup_htab_rma(vcpu);
+ if (!r) {
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvmppc_setup_partition_table(kvm);
+ kvm->arch.mmu_ready = 1;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int n_ceded, i, r;
@@ -3129,15 +3200,15 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
!signal_pending(current)) {
- /* See if the HPT and VRMA are ready to go */
- if (!kvm_is_radix(vcpu->kvm) &&
- !vcpu->kvm->arch.hpte_setup_done) {
+ /* See if the MMU is ready to go */
+ if (!vcpu->kvm->arch.mmu_ready) {
spin_unlock(&vc->lock);
- r = kvmppc_hv_setup_htab_rma(vcpu);
+ r = kvmhv_setup_mmu(vcpu);
spin_lock(&vc->lock);
if (r) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason = 0;
+ kvm_run->fail_entry.
+ hardware_entry_failure_reason = 0;
vcpu->arch.ret = r;
break;
}
@@ -3219,6 +3290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
unsigned long ebb_regs[3] = {}; /* shut up GCC */
unsigned long user_tar = 0;
unsigned int user_vrsave;
+ struct kvm *kvm;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3256,8 +3328,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
return -EINTR;
}
- atomic_inc(&vcpu->kvm->arch.vcpus_running);
- /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+ kvm = vcpu->kvm;
+ atomic_inc(&kvm->arch.vcpus_running);
+ /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
smp_mb();
flush_all_to_thread(current);
@@ -3285,10 +3358,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
trace_kvm_hcall_exit(vcpu, r);
kvmppc_core_prepare_to_enter(vcpu);
} else if (r == RESUME_PAGE_FAULT) {
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ srcu_idx = srcu_read_lock(&kvm->srcu);
r = kvmppc_book3s_hv_page_fault(run, vcpu,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
} else if (r == RESUME_PASSTHROUGH) {
if (WARN_ON(xive_enabled()))
r = H_SUCCESS;
@@ -3308,27 +3381,26 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
mtspr(SPRN_VRSAVE, user_vrsave);
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
- atomic_dec(&vcpu->kvm->arch.vcpus_running);
+ atomic_dec(&kvm->arch.vcpus_running);
return r;
}
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
- int linux_psize)
+ int shift, int sllp)
{
- struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-
- if (!def->shift)
- return;
- (*sps)->page_shift = def->shift;
- (*sps)->slb_enc = def->sllp;
- (*sps)->enc[0].page_shift = def->shift;
- (*sps)->enc[0].pte_enc = def->penc[linux_psize];
+ (*sps)->page_shift = shift;
+ (*sps)->slb_enc = sllp;
+ (*sps)->enc[0].page_shift = shift;
+ (*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
/*
- * Add 16MB MPSS support if host supports it
+ * Add 16MB MPSS support (may get filtered out by userspace)
*/
- if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
- (*sps)->enc[1].page_shift = 24;
- (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+ if (shift != 24) {
+ int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+ if (penc != -1) {
+ (*sps)->enc[1].page_shift = 24;
+ (*sps)->enc[1].pte_enc = penc;
+ }
}
(*sps)++;
}
@@ -3339,13 +3411,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
struct kvm_ppc_one_seg_page_size *sps;
/*
- * Since we don't yet support HPT guests on a radix host,
- * return an error if the host uses radix.
- */
- if (radix_enabled())
- return -EINVAL;
-
- /*
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
* POWER7 doesn't support keys for instruction accesses,
* POWER8 and POWER9 do.
@@ -3353,16 +3418,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
info->data_keys = 32;
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
- info->flags = KVM_PPC_PAGE_SIZES_REAL;
- if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
- info->flags |= KVM_PPC_1T_SEGMENTS;
- info->slb_size = mmu_slb_size;
+ /* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+ info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+ info->slb_size = 32;
/* We only support these sizes for now, and no muti-size segments */
sps = &info->sps[0];
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
- kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+ kvmppc_add_seg_page_size(&sps, 12, 0);
+ kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+ kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
return 0;
}
@@ -3377,7 +3441,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int i, r;
unsigned long n;
- unsigned long *buf;
+ unsigned long *buf, *p;
struct kvm_vcpu *vcpu;
mutex_lock(&kvm->slots_lock);
@@ -3393,8 +3457,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
goto out;
/*
- * Use second half of bitmap area because radix accumulates
- * bits in the first half.
+ * Use second half of bitmap area because both HPT and radix
+ * accumulate bits in the first half.
*/
n = kvm_dirty_bitmap_bytes(memslot);
buf = memslot->dirty_bitmap + n / sizeof(long);
@@ -3407,6 +3471,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
if (r)
goto out;
+ /*
+ * We accumulate dirty bits in the first half of the
+ * memslot's dirty_bitmap area, for when pages are paged
+ * out or modified by the host directly. Pick up these
+ * bits and add them to the map.
+ */
+ p = memslot->dirty_bitmap;
+ for (i = 0; i < n / sizeof(long); ++i)
+ buf[i] |= xchg(&p[i], 0);
+
/* Harvest dirty bits from VPA and DTL updates */
/* Note: we never modify the SLB shadow buffer areas */
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -3438,15 +3512,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
unsigned long npages)
{
- /*
- * For now, if radix_enabled() then we only support radix guests,
- * and in that case we don't need the rmap array.
- */
- if (radix_enabled()) {
- slot->arch.rmap = NULL;
- return 0;
- }
-
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
if (!slot->arch.rmap)
return -ENOMEM;
@@ -3467,8 +3532,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
const struct kvm_memory_slot *new)
{
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
/*
* If we are making a new memslot, it might make
@@ -3478,18 +3541,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
-
- if (npages && old->npages && !kvm_is_radix(kvm)) {
- /*
- * If modifying a memslot, reset all the rmap dirty bits.
- * If this is a new memslot, we don't need to do anything
- * since the rmap array starts out as all zeroes,
- * i.e. no pages are dirty.
- */
- slots = kvm_memslots(kvm);
- memslot = id_to_memslot(slots, mem->slot);
- kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
- }
}
/*
@@ -3545,6 +3596,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
+/*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->lock held.
+ */
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
@@ -3556,10 +3611,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
unsigned long psize, porder;
int srcu_idx;
- mutex_lock(&kvm->lock);
- if (kvm->arch.hpte_setup_done)
- goto out; /* another vcpu beat us to it */
-
/* Allocate hashed page table (if not done already) and reset it */
if (!kvm->arch.hpt.virt) {
int order = KVM_DEFAULT_HPT_ORDER;
@@ -3618,18 +3669,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- } else {
- kvmppc_setup_partition_table(kvm);
}
- /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+ /* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
smp_wmb();
- kvm->arch.hpte_setup_done = 1;
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
out:
- mutex_unlock(&kvm->lock);
return err;
up_out:
@@ -3637,6 +3684,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out_srcu;
}
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+{
+ kvmppc_free_radix(kvm);
+ kvmppc_update_lpcr(kvm, LPCR_VPM1,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvmppc_rmap_reset(kvm);
+ kvm->arch.radix = 0;
+ kvm->arch.process_table = 0;
+ return 0;
+}
+
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+{
+ int err;
+
+ err = kvmppc_init_vm_radix(kvm);
+ if (err)
+ return err;
+
+ kvmppc_free_hpt(&kvm->arch.hpt);
+ kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+ LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+ kvm->arch.radix = 1;
+ return 0;
+}
+
#ifdef CONFIG_KVM_XICS
/*
* Allocate a per-core structure for managing state about which cores are
@@ -3780,10 +3855,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
}
/*
- * For now, if the host uses radix, the guest must be radix.
+ * If the host uses radix, the guest starts out as radix.
*/
if (radix_enabled()) {
kvm->arch.radix = 1;
+ kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
ret = kvmppc_init_vm_radix(kvm);
@@ -3803,7 +3879,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
- if (kvm_is_radix(kvm))
+ if (radix_enabled())
kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
else if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
@@ -3815,10 +3891,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
- * On POWER9, we only need to do this for HPT guests on a radix
- * host, which is not yet supported.
+ * On POWER9, we only need to do this if the "indep_threads_mode"
+ * module parameter has been set to N.
*/
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ kvm->arch.threads_indep = indep_threads_mode;
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
/*
@@ -3858,7 +3936,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
- if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ if (!kvm->arch.threads_indep)
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
@@ -4193,6 +4271,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
unsigned long lpcr;
int radix;
+ int err;
/* If not on a POWER9, reject it */
if (!cpu_has_feature(CPU_FTR_ARCH_300))
@@ -4202,12 +4281,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
return -EINVAL;
- /* We can't change a guest to/from radix yet */
- radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
- if (radix != kvm_is_radix(kvm))
- return -EINVAL;
-
/* GR (guest radix) bit in process_table field must match */
+ radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
if (!!(cfg->process_table & PATB_GR) != radix)
return -EINVAL;
@@ -4215,15 +4290,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
+ /* We can change a guest to/from radix now, if the host is radix */
+ if (radix && !radix_enabled())
+ return -EINVAL;
+
mutex_lock(&kvm->lock);
+ if (radix != kvm_is_radix(kvm)) {
+ if (kvm->arch.mmu_ready) {
+ kvm->arch.mmu_ready = 0;
+ /* order mmu_ready vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.mmu_ready = 1;
+ err = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ if (radix)
+ err = kvmppc_switch_mmu_to_radix(kvm);
+ else
+ err = kvmppc_switch_mmu_to_hpt(kvm);
+ if (err)
+ goto out_unlock;
+ }
+
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
- mutex_unlock(&kvm->lock);
+ err = 0;
- return 0;
+ out_unlock:
+ mutex_unlock(&kvm->lock);
+ return err;
}
static struct kvmppc_ops kvm_ops_hv = {
@@ -4365,4 +4465,3 @@ module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
-
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 90644db9d38e..49a2c7825e04 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
int ptid = local_paca->kvm_hstate.ptid;
struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
- int me, ee, i;
+ int me, ee, i, t;
+ int cpu0;
/* Set our bit in the threads-exiting-guest map in the 0xff00
bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
if ((ee >> 8) == 0)
kvmhv_interrupt_vcore(vc, ee);
}
+
+ /*
+ * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+ * we have to interrupt inactive CPU threads to get them to
+ * restore the host LPCR value.
+ */
+ if (sip->lpcr_req) {
+ if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+ vc = local_paca->kvm_hstate.kvm_vcore;
+ cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+ for (t = 1; t < threads_per_core; ++t) {
+ if (sip->napped[t])
+ kvmhv_rm_send_ipi(cpu0 + t);
+ }
+ }
+ }
}
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -529,6 +546,8 @@ static inline bool is_rm(void)
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
@@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
vcpu->arch.gpr[5] = get_tb();
if (xive_enabled()) {
if (is_rm())
@@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipoll(vcpu, server);
@@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipi(vcpu, server, mfrr);
@@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_cppr(vcpu, cppr);
@@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_eoi(vcpu, xirr);
@@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return xics_rm_h_eoi(vcpu, xirr);
}
#endif /* CONFIG_KVM_XICS */
+
+void kvmppc_bad_interrupt(struct pt_regs *regs)
+{
+ die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
+ panic("Bad KVM trap");
+}
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE 1 /* in real mode */
+#define PHASE_SET_LPCR 2 /* have set LPCR */
+#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */
+#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */
+
+#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+ int thr = local_paca->kvm_hstate.tid;
+
+ sip->lpcr_sync.phase[thr] |= phase;
+ phase = ALL(phase);
+ while ((sip->lpcr_sync.allphases & phase) != phase) {
+ HMT_low();
+ barrier();
+ }
+ HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+ unsigned long rb, set;
+
+ /* wait for every other thread to get to real mode */
+ wait_for_sync(sip, PHASE_REALMODE);
+
+ /* Set LPCR and LPIDR */
+ mtspr(SPRN_LPCR, sip->lpcr_req);
+ mtspr(SPRN_LPID, sip->lpidr_req);
+ isync();
+
+ /* Invalidate the TLB on thread 0 */
+ if (local_paca->kvm_hstate.tid == 0) {
+ sip->do_set = 0;
+ asm volatile("ptesync" : : : "memory");
+ for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+ rb = TLBIEL_INVAL_SET_LPID +
+ (set << TLBIEL_INVAL_SET_SHIFT);
+ asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+ "r" (rb), "r" (0));
+ }
+ asm volatile("ptesync" : : : "memory");
+ }
+
+ /* indicate that we have done so and wait for others */
+ wait_for_sync(sip, PHASE_SET_LPCR);
+ /* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+ smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+ /* we're out of the guest... */
+ wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+ mtspr(SPRN_LPID, 0);
+ mtspr(SPRN_LPCR, sip->host_lpcr);
+ isync();
+
+ if (local_paca->kvm_hstate.tid == 0) {
+ sip->do_restore = 0;
+ smp_wmb(); /* order store of do_restore vs. phase */
+ }
+
+ wait_for_sync(sip, PHASE_RESET_LPCR);
+ smp_mb();
+ local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4efe364f1188..26c11f678fbf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
-/* Update the changed page order field of an rmap entry */
-void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+/* Update the dirty bitmap of a memslot */
+void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+ unsigned long gfn, unsigned long psize)
{
- unsigned long order;
+ unsigned long npages;
- if (!psize)
+ if (!psize || !memslot->dirty_bitmap)
return;
- order = ilog2(psize);
- order <<= KVMPPC_RMAP_CHG_SHIFT;
- if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
- *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+ npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
+ gfn -= memslot->base_gfn;
+ set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
+
+static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
+ unsigned long hpte_v, unsigned long hpte_gr)
+{
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn;
+ unsigned long psize;
+
+ psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
+ gfn = hpte_rpn(hpte_gr, psize);
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+ if (memslot && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot, gfn, psize);
}
-EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
- unsigned long hpte_gr)
+ unsigned long hpte_gr,
+ struct kvm_memory_slot **memslotp,
+ unsigned long *gfnp)
{
struct kvm_memory_slot *memslot;
unsigned long *rmap;
unsigned long gfn;
- gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+ gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+ if (memslotp)
+ *memslotp = memslot;
+ if (gfnp)
+ *gfnp = gfn;
if (!memslot)
return NULL;
@@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unsigned long ptel, head;
unsigned long *rmap;
unsigned long rcbits;
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn;
rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
ptel = rev->guest_rpte |= rcbits;
- rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+ rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
if (!rmap)
return;
lock_rmap(rmap);
@@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
}
*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
+ kvmppc_update_dirty_map(memslot, gfn,
+ kvmppc_actual_pgsz(hpte_v, hpte_r));
unlock_rmap(rmap);
}
@@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- psize = hpte_page_size(pteh, ptel);
+ psize = kvmppc_actual_pgsz(pteh, ptel);
if (!psize)
return H_PARAMETER;
writing = hpte_is_writable(ptel);
@@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_R) {
kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
- rmap = revmap_for_hpte(kvm, v, gr);
+ rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
if (rmap) {
lock_rmap(rmap);
*rmap |= KVMPPC_RMAP_REFERENCED;
@@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
__be64 *hpte;
unsigned long v, r, gr;
struct revmap_entry *rev;
- unsigned long *rmap;
long ret = H_NOT_FOUND;
if (kvm_is_radix(kvm))
@@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
r = be64_to_cpu(hpte[1]);
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_C) {
- unsigned long psize = hpte_page_size(v, r);
hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
eieio();
- rmap = revmap_for_hpte(kvm, v, gr);
- if (rmap) {
- lock_rmap(rmap);
- *rmap |= KVMPPC_RMAP_CHANGED;
- kvmppc_update_rmap_change(rmap, psize);
- unlock_rmap(rmap);
- }
+ kvmppc_set_dirty_from_hpte(kvm, v, gr);
}
}
vcpu->arch.gpr[4] = gr;
@@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
* Check the HPTE again, including base page size
*/
if ((v & valid) && (v & mask) == val &&
- hpte_base_page_size(v, r) == (1ul << pshift))
+ kvmppc_hpte_base_page_shift(v, r) == pshift)
/* Return with the HPTE still locked */
return (hash << 3) + (i >> 1);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 68bf0f14a962..2659844784b8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -31,6 +31,7 @@
#include <asm/tm.h>
#include <asm/opal.h>
#include <asm/xive-regs.h>
+#include <asm/thread_info.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
@@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI
kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+ /* On P9, do LPCR setting, if necessary */
+ ld r3, HSTATE_SPLIT_MODE(r13)
+ cmpdi r3, 0
+ beq 46f
+ lwz r4, KVM_SPLIT_DO_SET(r3)
+ cmpwi r4, 0
+ beq 46f
+ bl kvmhv_p9_set_lpcr
+ nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_hv_entry
@@ -387,6 +401,7 @@ kvm_secondary_got_guest:
ld r6, HSTATE_SPLIT_MODE(r13)
cmpdi r6, 0
beq 63f
+BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_RPR(r6)
mtspr SPRN_RPR, r0
ld r0, KVM_SPLIT_PMMAR(r6)
@@ -394,6 +409,15 @@ kvm_secondary_got_guest:
ld r0, KVM_SPLIT_LDBAR(r6)
mtspr SPRN_LDBAR, r0
isync
+FTR_SECTION_ELSE
+ /* On P9 we use the split_info for coordinating LPCR changes */
+ lwz r4, KVM_SPLIT_DO_SET(r6)
+ cmpwi r4, 0
+ beq 63f
+ mr r3, r6
+ bl kvmhv_p9_set_lpcr
+ nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
63:
/* Order load of vcpu after load of vcore */
lwsync
@@ -464,6 +488,12 @@ kvm_no_guest:
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq kvm_no_guest
+ lwz r0, KVM_SPLIT_DO_SET(r3)
+ cmpwi r0, 0
+ bne kvmhv_do_set
+ lwz r0, KVM_SPLIT_DO_RESTORE(r3)
+ cmpwi r0, 0
+ bne kvmhv_do_restore
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq kvm_no_guest
@@ -476,6 +506,19 @@ kvm_no_guest:
stb r0, HSTATE_HWTHREAD_STATE(r13)
b kvm_no_guest
+kvmhv_do_set:
+ /* Set LPCR, LPIDR etc. on P9 */
+ HMT_MEDIUM
+ bl kvmhv_p9_set_lpcr
+ nop
+ b kvm_no_guest
+
+kvmhv_do_restore:
+ HMT_MEDIUM
+ bl kvmhv_p9_restore_lpcr
+ nop
+ b kvm_no_guest
+
/*
* Here the primary thread is trying to return the core to
* whole-core mode, so we need to nap.
@@ -513,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Set kvm_split_mode.napped[tid] = 1 */
ld r3, HSTATE_SPLIT_MODE(r13)
li r0, 1
- lhz r4, PACAPACAINDEX(r13)
- clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */
+ lbz r4, HSTATE_TID(r13)
addi r4, r4, KVM_SPLIT_NAPPED
stbx r0, r3, r4
/* Check the do_nap flag again after setting napped[] */
@@ -1911,10 +1953,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
19: lis r8,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC,r8
-16: ld r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+ /* On POWER9 with HPT-on-radix we need to wait for all other threads */
+ ld r3, HSTATE_SPLIT_MODE(r13)
+ cmpdi r3, 0
+ beq 47f
+ lwz r8, KVM_SPLIT_DO_RESTORE(r3)
+ cmpwi r8, 0
+ beq 47f
+ stw r12, STACK_SLOT_TRAP(r1)
+ bl kvmhv_p9_restore_lpcr
+ nop
+ lwz r12, STACK_SLOT_TRAP(r1)
+ b 48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ ld r8,KVM_HOST_LPCR(r4)
mtspr SPRN_LPCR,r8
isync
-
+48:
/* load host SLB entries */
BEGIN_MMU_FTR_SECTION
b 0f
@@ -3133,10 +3191,139 @@ kvmppc_restore_tm:
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.
- * For now just spin, but we should do something better.
+ * r12 is (CR << 32) | vector
+ * r13 points to our PACA
+ * r12 is saved in HSTATE_SCRATCH0(r13)
+ * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
+ * r9 is saved in HSTATE_SCRATCH2(r13)
+ * r13 is saved in HSPRG1
+ * cfar is saved in HSTATE_CFAR(r13)
+ * ppr is saved in HSTATE_PPR(r13)
*/
kvmppc_bad_host_intr:
+ /*
+ * Switch to the emergency stack, but start half-way down in
+ * case we were already on it.
+ */
+ mr r9, r1
+ std r1, PACAR1(r13)
+ ld r1, PACAEMERGSP(r13)
+ subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
+ std r9, 0(r1)
+ std r0, GPR0(r1)
+ std r9, GPR1(r1)
+ std r2, GPR2(r1)
+ SAVE_4GPRS(3, r1)
+ SAVE_2GPRS(7, r1)
+ srdi r0, r12, 32
+ clrldi r12, r12, 32
+ std r0, _CCR(r1)
+ std r12, _TRAP(r1)
+ andi. r0, r12, 2
+ beq 1f
+ mfspr r3, SPRN_HSRR0
+ mfspr r4, SPRN_HSRR1
+ mfspr r5, SPRN_HDAR
+ mfspr r6, SPRN_HDSISR
+ b 2f
+1: mfspr r3, SPRN_SRR0
+ mfspr r4, SPRN_SRR1
+ mfspr r5, SPRN_DAR
+ mfspr r6, SPRN_DSISR
+2: std r3, _NIP(r1)
+ std r4, _MSR(r1)
+ std r5, _DAR(r1)
+ std r6, _DSISR(r1)
+ ld r9, HSTATE_SCRATCH2(r13)
+ ld r12, HSTATE_SCRATCH0(r13)
+ GET_SCRATCH0(r0)
+ SAVE_4GPRS(9, r1)
+ std r0, GPR13(r1)
+ SAVE_NVGPRS(r1)
+ ld r5, HSTATE_CFAR(r13)
+ std r5, ORIG_GPR3(r1)
+ mflr r3
+#ifdef CONFIG_RELOCATABLE
+ ld r4, HSTATE_SCRATCH1(r13)
+#else
+ mfctr r4
+#endif
+ mfxer r5
+ lbz r6, PACASOFTIRQEN(r13)
+ std r3, _LINK(r1)
+ std r4, _CTR(r1)
+ std r5, _XER(r1)
+ std r6, SOFTE(r1)
+ ld r2, PACATOC(r13)
+ LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
+ std r3, STACK_FRAME_OVERHEAD-16(r1)
+
+ /*
+ * On POWER9 do a minimal restore of the MMU and call C code,
+ * which will print a message and panic.
+ * XXX On POWER7 and POWER8, we just spin here since we don't
+ * know what the other threads are doing (and we don't want to
+ * coordinate with them) - but at least we now have register state
+ * in memory that we might be able to look at from another CPU.
+ */
+BEGIN_FTR_SECTION
b .
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+ ld r9, HSTATE_KVM_VCPU(r13)
+ ld r10, VCPU_KVM(r9)
+
+ li r0, 0
+ mtspr SPRN_AMR, r0
+ mtspr SPRN_IAMR, r0
+ mtspr SPRN_CIABR, r0
+ mtspr SPRN_DAWRX, r0
+
+ /* Flush the ERAT on radix P9 DD1 guest exit */
+BEGIN_FTR_SECTION
+ PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+
+BEGIN_MMU_FTR_SECTION
+ b 4f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+
+ slbmte r0, r0
+ slbia
+ ptesync
+ ld r8, PACA_SLBSHADOWPTR(r13)
+ .rept SLB_NUM_BOLTED
+ li r3, SLBSHADOW_SAVEAREA
+ LDX_BE r5, r8, r3
+ addi r3, r3, 8
+ LDX_BE r6, r8, r3
+ andis. r7, r5, SLB_ESID_V@h
+ beq 3f
+ slbmte r6, r5
+3: addi r8, r8, 16
+ .endr
+
+4: lwz r7, KVM_HOST_LPID(r10)
+ mtspr SPRN_LPID, r7
+ mtspr SPRN_PID, r0
+ ld r8, KVM_HOST_LPCR(r10)
+ mtspr SPRN_LPCR, r8
+ isync
+ li r0, KVM_GUEST_MODE_NONE
+ stb r0, HSTATE_IN_GUEST(r13)
+
+ /*
+ * Turn on the MMU and jump to C code
+ */
+ bcl 20, 31, .+4
+5: mflr r3
+ addi r3, r3, 9f - 5b
+ ld r4, PACAKMSR(r13)
+ mtspr SPRN_SRR0, r3
+ mtspr SPRN_SRR1, r4
+ rfid
+9: addi r3, r1, STACK_FRAME_OVERHEAD
+ bl kvmppc_bad_interrupt
+ b 9b
/*
* This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 69a09444d46e..d0dc8624198f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
kvmppc_set_pvr_pr(vcpu, sregs->pvr);
vcpu3s->sdr1 = sregs->u.s.sdr1;
+#ifdef CONFIG_PPC_BOOK3S_64
if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+ /* Flush all SLB entries */
+ vcpu->arch.mmu.slbmte(vcpu, 0, 0);
+ vcpu->arch.mmu.slbia(vcpu);
+
for (i = 0; i < 64; i++) {
- vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
- sregs->u.s.ppc64.slb[i].slbe);
+ u64 rb = sregs->u.s.ppc64.slb[i].slbe;
+ u64 rs = sregs->u.s.ppc64.slb[i].slbv;
+
+ if (rb & SLB_ESID_V)
+ vcpu->arch.mmu.slbmte(vcpu, rs, rb);
}
- } else {
+ } else
+#endif
+ {
for (i = 0; i < 16; i++) {
vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
}
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 8a4205fa774f..dae3be5ff42b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
case H_PROTECT:
case H_BULK_REMOVE:
case H_PUT_TCE:
+ case H_PUT_TCE_INDIRECT:
+ case H_STUFF_TCE:
case H_CEDE:
case H_LOGICAL_CI_LOAD:
case H_LOGICAL_CI_STORE:
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index c6c734424c70..423b21393bc9 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
start = vma->vm_pgoff;
end = start +
- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+ vma_pages(vma);
pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1abe6eb51335..6b6c53c42ac9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
- r = !!(hv_enabled && !radix_enabled() &&
- cpu_has_feature(CPU_FTR_ARCH_300));
+ r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
break;
#endif
case KVM_CAP_SYNC_MMU:
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index fd006a272024..f3a9b5a445b6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -685,11 +685,28 @@ struct kvm_s390_crypto {
__u8 dea_kw;
};
+#define APCB0_MASK_SIZE 1
+struct kvm_s390_apcb0 {
+ __u64 apm[APCB0_MASK_SIZE]; /* 0x0000 */
+ __u64 aqm[APCB0_MASK_SIZE]; /* 0x0008 */
+ __u64 adm[APCB0_MASK_SIZE]; /* 0x0010 */
+ __u64 reserved18; /* 0x0018 */
+};
+
+#define APCB1_MASK_SIZE 4
+struct kvm_s390_apcb1 {
+ __u64 apm[APCB1_MASK_SIZE]; /* 0x0000 */
+ __u64 aqm[APCB1_MASK_SIZE]; /* 0x0020 */
+ __u64 adm[APCB1_MASK_SIZE]; /* 0x0040 */
+ __u64 reserved60[4]; /* 0x0060 */
+};
+
struct kvm_s390_crypto_cb {
- __u8 reserved00[72]; /* 0x0000 */
- __u8 dea_wrapping_key_mask[24]; /* 0x0048 */
- __u8 aes_wrapping_key_mask[32]; /* 0x0060 */
- __u8 reserved80[128]; /* 0x0080 */
+ struct kvm_s390_apcb0 apcb0; /* 0x0000 */
+ __u8 reserved20[0x0048 - 0x0020]; /* 0x0020 */
+ __u8 dea_wrapping_key_mask[24]; /* 0x0048 */
+ __u8 aes_wrapping_key_mask[32]; /* 0x0060 */
+ struct kvm_s390_apcb1 apcb1; /* 0x0080 */
};
/*
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 329b2843fee2..fa557372d600 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -213,6 +213,16 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
vcpu->arch.local_int.pending_irqs;
}
+static inline int isc_to_irq_type(unsigned long isc)
+{
+ return IRQ_PEND_IO_ISC_0 + isc;
+}
+
+static inline int irq_type_to_isc(unsigned long irq_type)
+{
+ return irq_type - IRQ_PEND_IO_ISC_0;
+}
+
static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
unsigned long active_mask)
{
@@ -220,7 +230,7 @@ static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
for (i = 0; i <= MAX_ISC; i++)
if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i)))
- active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i));
+ active_mask &= ~(1UL << (isc_to_irq_type(i)));
return active_mask;
}
@@ -901,7 +911,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
fi = &vcpu->kvm->arch.float_int;
spin_lock(&fi->lock);
- isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0];
+ isc_list = &fi->lists[irq_type_to_isc(irq_type)];
inti = list_first_entry_or_null(isc_list,
struct kvm_s390_interrupt_info,
list);
@@ -1074,6 +1084,12 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
* in kvm_vcpu_block without having the waitqueue set (polling)
*/
vcpu->valid_wakeup = true;
+ /*
+ * This is mostly to document, that the read in swait_active could
+ * be moved before other stores, leading to subtle races.
+ * All current users do not store or use an atomic like update
+ */
+ smp_mb__after_atomic();
if (swait_active(&vcpu->wq)) {
/*
* The vcpu gave up the cpu voluntarily, mark it as a good
@@ -1395,7 +1411,7 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm,
list_del_init(&iter->list);
fi->counters[FIRQ_CNTR_IO] -= 1;
if (list_empty(isc_list))
- clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+ clear_bit(isc_to_irq_type(isc), &fi->pending_irqs);
spin_unlock(&fi->lock);
return iter;
}
@@ -1522,7 +1538,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
- set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs);
+ set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
spin_unlock(&fi->lock);
return 0;
}
@@ -2175,6 +2191,8 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr)
return -EINVAL;
if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid)))
return -EFAULT;
+ if (!schid)
+ return -EINVAL;
kfree(kvm_s390_get_io_int(kvm, isc_mask, schid));
/*
* If userspace is conforming to the architecture, we can have at most
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4bc70afe0a10..98ad8b9e0360 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -395,6 +395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_USER_INSTR0:
case KVM_CAP_S390_CMMA_MIGRATION:
case KVM_CAP_S390_AIS:
+ case KVM_CAP_S390_AIS_MIGRATION:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b18b5652e5c5..a311938b63b3 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -443,22 +443,14 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*
* Returns: - 0 on success
* - -EINVAL if the gpa is not valid guest storage
- * - -ENOMEM if out of memory
*/
static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
{
struct page *page;
- hva_t hva;
- int rc;
- hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
- if (kvm_is_error_hva(hva))
+ page = gfn_to_page(kvm, gpa_to_gfn(gpa));
+ if (is_error_page(page))
return -EINVAL;
- rc = get_user_pages_fast(hva, 1, 1, &page);
- if (rc < 0)
- return rc;
- else if (rc != 1)
- return -ENOMEM;
*hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
return 0;
}
@@ -466,11 +458,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
{
- struct page *page;
-
- page = virt_to_page(hpa);
- set_page_dirty_lock(page);
- put_page(page);
+ kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
/* mark the page always as dirty for migration */
mark_page_dirty(kvm, gpa_to_gfn(gpa));
}
@@ -557,7 +545,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
rc = set_validity_icpt(scb_s, 0x003bU);
if (!rc) {
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL)
+ if (rc)
rc = set_validity_icpt(scb_s, 0x0034U);
}
if (rc)
@@ -574,10 +562,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* 256 bytes cannot cross page boundaries */
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL)
+ if (rc) {
rc = set_validity_icpt(scb_s, 0x0080U);
- if (rc)
goto unpin;
+ }
scb_s->itdba = hpa;
}
@@ -592,10 +580,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* if this block gets bigger, we have to shadow it.
*/
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL)
+ if (rc) {
rc = set_validity_icpt(scb_s, 0x1310U);
- if (rc)
goto unpin;
+ }
scb_s->gvrd = hpa;
}
@@ -607,11 +595,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* 64 bytes cannot cross page boundaries */
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL)
+ if (rc) {
rc = set_validity_icpt(scb_s, 0x0043U);
- /* Validity 0x0044 will be checked by SIE */
- if (rc)
goto unpin;
+ }
+ /* Validity 0x0044 will be checked by SIE */
scb_s->riccbd = hpa;
}
if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
@@ -635,10 +623,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* cross page boundaries
*/
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL)
+ if (rc) {
rc = set_validity_icpt(scb_s, 0x10b0U);
- if (rc)
goto unpin;
+ }
scb_s->sdnxo = hpa | sdnxc;
}
return 0;
@@ -663,7 +651,6 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
*
* Returns: - 0 if the scb was pinned.
* - > 0 if control has to be given to guest 2
- * - -ENOMEM if out of memory
*/
static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
gpa_t gpa)
@@ -672,14 +659,13 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
int rc;
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc == -EINVAL) {
+ if (rc) {
rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- if (!rc)
- rc = 1;
+ WARN_ON_ONCE(rc);
+ return 1;
}
- if (!rc)
- vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
- return rc;
+ vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ return 0;
}
/*
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ee23a43386a2..034caa1a084e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -226,6 +226,8 @@ struct x86_emulate_ops {
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d7d856b2d89..1bfb99770c34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1061,6 +1061,11 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1426,4 +1431,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index caec8417539f..8b6780751132 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,11 +70,11 @@
#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
-#define SECONDARY_EXEC_RDRAND 0x00000800
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
-#define SECONDARY_EXEC_RDSEED 0x00010000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d90cdc77e077..8079d141792a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, smbase + 0x8000);
else
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36c90d631096..943acbf00c69 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1301,14 +1301,42 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask;
if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
apic->lapic_timer.timer_mode = timer_mode;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ limit_periodic_timer_frequency(apic);
}
}
@@ -1430,6 +1458,30 @@ static void start_sw_period(struct kvm_lapic *apic)
HRTIMER_MODE_ABS_PINNED);
}
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
static bool set_target_expiration(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1439,27 +1491,13 @@ static bool set_target_expiration(struct kvm_lapic *apic)
apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
- if (!apic->lapic_timer.period)
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
return false;
-
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
}
+ limit_periodic_timer_frequency(apic);
+
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
@@ -1515,6 +1553,9 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
return false;
+ if (!ktimer->tscdeadline)
+ return false;
+
r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
if (r < 0)
return false;
@@ -1738,13 +1779,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
start_apic_timer(apic);
break;
- case APIC_TDCR:
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
if (val & 4)
apic_debug("KVM_WRITE:TDCR %x\n", val);
kvm_lapic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
apic_debug("KVM_WRITE:ESR not zero %x\n", val);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a119b361b8b7..e5e66e5c6640 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -150,6 +150,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
- return __shadow_walk_next(iterator, *iterator->sptep);
+ __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2794,13 +2808,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
kvm_release_pfn_clean(pfn);
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
@@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
@@ -3659,54 +3672,38 @@ exit:
return reserved;
}
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect)
+ u64 fault_address, char *insn, int insn_len)
{
int r = 1;
@@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
default:
trace_kvm_page_fault(fault_address, error_code);
- if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
@@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4918,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
vcpu->arch.gpa_val = cr2;
}
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
- /* Must be RET_MMIO_PF_INVALID. */
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
/*
* Before emulating the instruction, check if the error code
@@ -4993,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5464,10 +5459,8 @@ static struct shrinker mmu_shrinker = {
static void mmu_destroy_caches(void)
{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
@@ -5476,13 +5469,13 @@ int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index efc857615d8e..5b408c0ad612 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect);
+ u64 fault_address, char *insn, int insn_len);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f18d1f8d332b..5abae72266b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
direct_access = gw->pte_access;
@@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return emulate;
+ return ret;
out_gpte_changed:
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
@@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..b71daed3cca2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag)
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
- if (!vcpu)
- return 0;
-
/* Note:
* At this point, the IOMMU should have already set the pending
* bit in the vAPIC backing page. So, we just need to schedule
* in the vcpu.
*/
- if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ if (vcpu)
kvm_vcpu_wake_up(vcpu);
return 0;
@@ -2144,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm)
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len, !npt_enabled);
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
}
static int db_interception(struct vcpu_svm *svm)
@@ -2916,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
return true;
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
-
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
@@ -3072,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
enable_gif(svm);
mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
return true;
}
@@ -3173,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm)
/*
* If VGIF is enabled, the STGI intercept is only added to
- * detect the opening of the NMI window; remove it now.
+ * detect the opening of the SMI/NMI window; remove it now.
*/
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
@@ -4131,7 +4145,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -5393,6 +5407,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5503,6 +5599,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..7c3522a989d0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -486,6 +486,14 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
u64 nested_vmx_vmfunc_controls;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
};
#define POSTED_INTR_ON 0
@@ -900,16 +908,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static int alloc_identity_pagetable(struct kvm *kvm);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -1598,18 +1603,15 @@ static inline void vpid_sync_context(int vpid)
static inline void ept_sync_global(void)
{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
}
static __always_inline void vmcs_check16(unsigned long field)
@@ -2831,8 +2833,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
- } else
- vmx->nested.nested_vmx_ept_caps = 0;
+ }
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -2841,8 +2842,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* Advertise EPTP switching unconditionally
* since we emulate it
*/
- vmx->nested.nested_vmx_vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (enable_ept)
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -2856,8 +2858,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- } else
- vmx->nested.nested_vmx_vpid_caps = 0;
+ }
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -3544,7 +3545,8 @@ static int hardware_enable(void)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
+ if (enable_ept)
+ ept_sync_global();
return 0;
}
@@ -3657,8 +3659,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED |
- SECONDARY_EXEC_RDRAND |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_VMFUNC;
@@ -3679,14 +3681,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -4781,18 +4794,18 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_pfn_t identity_map_pfn;
u32 tmp;
- if (!enable_ept)
- return 0;
-
/* Protect kvm->arch.ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm->arch.ept_identity_pagetable_done))
goto out2;
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- r = alloc_identity_pagetable(kvm);
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm->arch.ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4864,20 +4877,6 @@ out:
return r;
}
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- /* Called with kvm->slots_lock held. */
-
- int r = 0;
-
- BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-
- return r;
-}
-
static int allocate_vpid(void)
{
int vpid;
@@ -5282,13 +5281,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static bool vmx_rdrand_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
}
static bool vmx_rdseed_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
}
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
@@ -5382,30 +5381,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_rdrand_supported()) {
bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND;
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
if (nested) {
if (rdrand_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND;
+ ~SECONDARY_EXEC_RDRAND_EXITING;
}
}
if (vmx_rdseed_supported()) {
bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED;
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
if (nested) {
if (rdseed_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED;
+ ~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5426,7 +5425,7 @@ static void ept_set_mmio_spte_mask(void)
/*
* Sets up the vmcs for emulated real mode.
*/
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
unsigned long a;
@@ -5539,8 +5538,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-
- return 0;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5604,6 +5601,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
setup_msrs(vmx);
@@ -5912,8 +5911,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
- true);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -6747,16 +6745,14 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
- !cpu_has_vmx_ept_mt_wb()) {
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
- if (!cpu_has_vmx_unrestricted_guest())
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
@@ -6776,8 +6772,13 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
- if (!cpu_has_vmx_ple())
+ if (!cpu_has_vmx_ple()) {
ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
@@ -8415,9 +8416,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDRAND:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
case EXIT_REASON_RDSEED:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -9475,7 +9476,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
put_cpu();
}
@@ -9556,11 +9556,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
- if (err)
- goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
@@ -9568,9 +9566,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
}
if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
@@ -11325,6 +11320,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -11421,8 +11418,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (likely(!vmx->fail)) {
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
@@ -11486,7 +11486,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs)
+ if (enable_shadow_vmcs && exit_reason != -1)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
@@ -11510,12 +11510,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
@@ -11938,6 +11939,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, false);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -12063,6 +12112,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
#endif
.setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..34c85aa2e2d1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2006,10 +2006,12 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
-static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_IA32_MCG_STATUS:
@@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return set_msr_mce(vcpu, msr, data);
+ return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4034,10 +4039,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
r = -EFAULT;
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
- goto out;
+ goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
@@ -5275,6 +5286,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
}
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5316,6 +5332,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6426,7 +6443,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6473,9 +6490,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
@@ -6641,13 +6655,20 @@ static void enter_smm(struct kvm_vcpu *vcpu)
u32 cr0;
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
- vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
@@ -6876,17 +6897,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
else {
- /* Enable NMI/IRQ window open exits if needed.
+ /* Enable SMI/NMI/IRQ window open exits if needed.
*
- * SMIs have two cases: 1) they can be nested, and
- * then there is nothing to do here because RSM will
- * cause a vmexit anyway; 2) or the SMI can be pending
- * because inject_pending_event has completed the
- * injection of an IRQ or NMI from the previous vmexit,
- * and then we request an immediate exit to inject the SMI.
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
*/
if (vcpu->arch.smi_pending && !is_smm(vcpu))
- req_immediate_exit = true;
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -7798,18 +7825,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ }
+
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ vcpu->arch.ia32_xss = 0;
+
kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
@@ -7974,16 +8023,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
- struct kvm *kvm;
int r;
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
-
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -8001,7 +8045,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (r < 0)
goto fail_free_pio_data;
- if (irqchip_in_kernel(kvm)) {
+ if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -8023,10 +8067,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
fx_init(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr = 0x0;
- vcpu->arch.pv_time_enabled = false;
-
- vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);