summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 10:14:24 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2011-01-13 10:14:24 -0800
commit55065bc52795faae549abfb912aacc622dd63876 (patch)
tree63683547e41ed459a2a8747eeafb5e969633d54f /arch
parent008d23e4852d78bb2618f2035f8b2110b6a6b968 (diff)
parente5c301428294cb8925667c9ee39f817c4ab1c2c9 (diff)
downloadlinux-55065bc52795faae549abfb912aacc622dd63876.tar.gz
linux-55065bc52795faae549abfb912aacc622dd63876.tar.bz2
linux-55065bc52795faae549abfb912aacc622dd63876.zip
Merge branch 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.38' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (142 commits) KVM: Initialize fpu state in preemptible context KVM: VMX: when entering real mode align segment base to 16 bytes KVM: MMU: handle 'map_writable' in set_spte() function KVM: MMU: audit: allow audit more guests at the same time KVM: Fetch guest cr3 from hardware on demand KVM: Replace reads of vcpu->arch.cr3 by an accessor KVM: MMU: only write protect mappings at pagetable level KVM: VMX: Correct asm constraint in vmcs_load()/vmcs_clear() KVM: MMU: Initialize base_role for tdp mmus KVM: VMX: Optimize atomic EFER load KVM: VMX: Add definitions for more vm entry/exit control bits KVM: SVM: copy instruction bytes from VMCB KVM: SVM: implement enhanced INVLPG intercept KVM: SVM: enhance mov DR intercept handler KVM: SVM: enhance MOV CR intercept handler KVM: SVM: add new SVM feature bit names KVM: cleanup emulate_instruction KVM: move complete_insn_gp() into x86.c KVM: x86: fix CR8 handling KVM guest: Fix kvm clock initialization when it's configured out ...
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/include/asm/kvm_host.h4
-rw-r--r--arch/ia64/kvm/kvm-ia64.c30
-rw-r--r--arch/powerpc/kvm/book3s.c4
-rw-r--r--arch/powerpc/kvm/powerpc.c20
-rw-r--r--arch/s390/kvm/kvm-s390.c23
-rw-r--r--arch/x86/include/asm/kvm_emulate.h35
-rw-r--r--arch/x86/include/asm/kvm_host.h99
-rw-r--r--arch/x86/include/asm/kvm_para.h24
-rw-r--r--arch/x86/include/asm/svm.h57
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/vmx.h15
-rw-r--r--arch/x86/kernel/entry_32.S10
-rw-r--r--arch/x86/kernel/entry_64.S3
-rw-r--r--arch/x86/kernel/i387.c1
-rw-r--r--arch/x86/kernel/kvm.c317
-rw-r--r--arch/x86/kernel/kvmclock.c13
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c367
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h22
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/mmu.c251
-rw-r--r--arch/x86/kvm/mmu_audit.c39
-rw-r--r--arch/x86/kvm/paging_tmpl.h147
-rw-r--r--arch/x86/kvm/svm.c865
-rw-r--r--arch/x86/kvm/trace.h17
-rw-r--r--arch/x86/kvm/vmx.c156
-rw-r--r--arch/x86/kvm/x86.c474
28 files changed, 2015 insertions, 986 deletions
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 2f229e5de498..2689ee54a1c9 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -590,6 +590,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
void kvm_sal_emul(struct kvm_vcpu *vcpu);
+#define __KVM_HAVE_ARCH_VM_ALLOC 1
+struct kvm *kvm_arch_alloc_vm(void);
+void kvm_arch_free_vm(struct kvm *kvm);
+
#endif /* __ASSEMBLY__*/
#endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f56a6316e134..70d224d4264c 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -749,7 +749,7 @@ out:
return r;
}
-static struct kvm *kvm_alloc_kvm(void)
+struct kvm *kvm_arch_alloc_vm(void)
{
struct kvm *kvm;
@@ -760,7 +760,7 @@ static struct kvm *kvm_alloc_kvm(void)
vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
if (!vm_base)
- return ERR_PTR(-ENOMEM);
+ return NULL;
memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
kvm = (struct kvm *)(vm_base +
@@ -806,10 +806,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
#define GUEST_PHYSICAL_RR4 0x2739
#define VMM_INIT_RR 0x1660
-static void kvm_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm)
{
BUG_ON(!kvm);
+ kvm->arch.is_sn2 = ia64_platform_is("sn2");
+
kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
kvm->arch.vmm_init_rr = VMM_INIT_RR;
@@ -823,21 +825,8 @@ static void kvm_init_vm(struct kvm *kvm)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-}
-
-struct kvm *kvm_arch_create_vm(void)
-{
- struct kvm *kvm = kvm_alloc_kvm();
-
- if (IS_ERR(kvm))
- return ERR_PTR(-ENOMEM);
-
- kvm->arch.is_sn2 = ia64_platform_is("sn2");
-
- kvm_init_vm(kvm);
-
- return kvm;
+ return 0;
}
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
@@ -962,7 +951,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
r = kvm_setup_default_irq_routing(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
kvm_ioapic_destroy(kvm);
+ mutex_unlock(&kvm->slots_lock);
goto out;
}
break;
@@ -1357,7 +1348,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -EINVAL;
}
-static void free_kvm(struct kvm *kvm)
+void kvm_arch_free_vm(struct kvm *kvm)
{
unsigned long vm_base = kvm->arch.vm_base;
@@ -1399,9 +1390,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
#endif
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
- kvm_free_physmem(kvm);
- cleanup_srcu_struct(&kvm->srcu);
- free_kvm(kvm);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index e316847c08c0..badc983031b3 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1307,12 +1307,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
int err = -ENOMEM;
unsigned long p;
- vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
+ vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
if (!vcpu_book3s)
goto out;
- memset(vcpu_book3s, 0, sizeof(struct kvmppc_vcpu_book3s));
-
vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
if (!vcpu_book3s->shadow_vcpu)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 38f756f25053..99758460efde 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -145,18 +145,12 @@ void kvm_arch_check_processor_compat(void *rtn)
*(int *)rtn = kvmppc_core_check_processor_compat();
}
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
{
- struct kvm *kvm;
-
- kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
- return kvm;
+ return 0;
}
-static void kvmppc_free_vcpus(struct kvm *kvm)
+void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
struct kvm_vcpu *vcpu;
@@ -176,14 +170,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
{
}
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
- kvmppc_free_vcpus(kvm);
- kvm_free_physmem(kvm);
- cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm);
-}
-
int kvm_dev_ioctl_check_extension(long ext)
{
int r;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 985d825494f1..bade533ba288 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -164,24 +164,18 @@ long kvm_arch_vm_ioctl(struct file *filp,
return r;
}
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
{
- struct kvm *kvm;
int rc;
char debug_name[16];
rc = s390_enable_sie();
if (rc)
- goto out_nokvm;
-
- rc = -ENOMEM;
- kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
- if (!kvm)
- goto out_nokvm;
+ goto out_err;
kvm->arch.sca = (struct sca_block *) get_zeroed_page(GFP_KERNEL);
if (!kvm->arch.sca)
- goto out_nosca;
+ goto out_err;
sprintf(debug_name, "kvm-%u", current->pid);
@@ -195,13 +189,11 @@ struct kvm *kvm_arch_create_vm(void)
debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
VM_EVENT(kvm, 3, "%s", "vm created");
- return kvm;
+ return 0;
out_nodbf:
free_page((unsigned long)(kvm->arch.sca));
-out_nosca:
- kfree(kvm);
-out_nokvm:
- return ERR_PTR(rc);
+out_err:
+ return rc;
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -240,11 +232,8 @@ void kvm_arch_sync_events(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
free_page((unsigned long)(kvm->arch.sca));
debug_unregister(kvm->arch.dbf);
- cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm);
}
/* Section: vcpu related */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
struct x86_emulate_ctxt;
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+};
+
/*
* x86_emulate_ops:
*
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to write to memory.
*/
int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
};
union {
unsigned long *reg;
- unsigned long mem;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
} addr;
union {
unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
- int exception; /* exception that happens during emulation or -1 */
- u32 error_code; /* error code for exception */
- bool error_code_valid;
+ bool have_exception;
+ struct x86_exception exception;
/* decode cache */
struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f702f82aa1eb..aa75f21a9fba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -83,11 +83,14 @@
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
+#define ASYNC_PF_PER_VCPU 64
+
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
+struct kvm_async_pf;
enum kvm_reg {
VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR3,
};
enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ bool prefault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- u32 *error);
+ struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync);
+ struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu *walk_mmu;
- /*
- * This struct is filled with the necessary information to propagate a
- * page fault into the guest
- */
- struct {
- u64 address;
- unsigned error_code;
- bool nested;
- } fault;
-
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask;
+
+ struct {
+ bool halted;
+ gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+ struct gfn_to_hva_cache data;
+ u64 msr_val;
+ u32 id;
+ bool send_user_only;
+ } apf;
};
struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
};
struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
const struct trace_print_flags *exit_reasons_str;
};
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+};
+
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -623,8 +640,15 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2, u16 error_code, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+ int emulation_type, void *insn, int insn_len);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+ int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
* Trap the fault and ignore the instruction if that happens.
*/
-asmlinkage void kvm_handle_fault_on_reboot(void);
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
#define __kvm_handle_fault_on_reboot(insn) \
"666: " insn "\n\t" \
+ "668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
+ "cmpb $0, kvm_rebooting \n\t" \
+ "jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
- "jmp kvm_handle_fault_on_reboot \n\t" \
+ "call kvm_spurious_fault \n\t" \
".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \
_ASM_PTR " 666b, 667b \n\t" \
@@ -799,4 +833,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
* are available. The use of 0x11 and 0x12 is deprecated
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define KVM_MAX_MMU_OP_BATCH 32
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
+
#ifdef __KERNEL__
#include <asm/processor.h>
extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
#ifdef CONFIG_KVM_GUEST
void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
#else
#define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+ return 0;
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u64 reserved_5;
+ u32 clean;
+ u32 reserved_5;
u64 next_rip;
- u8 reserved_6[816];
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u8 reserved_6[800];
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
#define SVM_EVTINJ_VEC_MASK 0xff
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+#define SVM_EXITINFO_REG_MASK 0x0F
+
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_ERR -1
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void coprocessor_error(void);
asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
#define PIN_BASED_NMI_EXITING 0x00000008
#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
/* VMCS Encodings */
enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
+
/*
* Exit Qualifications for MOV for Control Register Access
*/
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 591e60104278..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
CFI_ENDPROC
END(general_protection)
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ RING0_EC_FRAME
+ pushl $do_async_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(apf_page_fault)
+#endif
+
/*
* End of kprobes section
*/
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d3b895f375d3..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1329,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
set_stopped_child_used_math(tsk);
return 0;
}
+EXPORT_SYMBOL_GPL(init_fpu);
/*
* The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
#define MMU_QUEUE_SIZE 1024
+static int kvmapf = 1;
+
+static int parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static struct kvm_para_state *kvm_para_state(void)
{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
{
}
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ wait_queue_head_t wq;
+ u32 token;
+ int cpu;
+ bool halted;
+ struct mm_struct *mm;
+};
+
+static struct kvm_task_sleep_head {
+ spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
+{
+ struct hlist_node *p;
+
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DEFINE_WAIT(wait);
+ int cpu, idle;
+
+ cpu = get_cpu();
+ idle = idle_cpu(cpu);
+ put_cpu();
+
+ spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exist -> wake up was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ spin_unlock(&b->lock);
+ return;
+ }
+
+ n.token = token;
+ n.cpu = smp_processor_id();
+ n.mm = current->active_mm;
+ n.halted = idle || preempt_count() > 1;
+ atomic_inc(&n.mm->mm_count);
+ init_waitqueue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ spin_unlock(&b->lock);
+
+ for (;;) {
+ if (!n.halted)
+ prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ if (!n.halted) {
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+ }
+ if (!n.halted)
+ finish_wait(&n.wq, &wait);
+
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+ hlist_del_init(&n->link);
+ if (!n->mm)
+ return;
+ mmdrop(n->mm);
+ if (n->halted)
+ smp_send_reschedule(n->cpu);
+ else if (waitqueue_active(&n->wq))
+ wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+ int i;
+
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct hlist_node *p, *next;
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ spin_unlock(&b->lock);
+ }
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * async PF was not yet handled.
+ * Add dummy entry for the token.
+ */
+ n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ /*
+ * Allocation failed! Busy wait while other cpu
+ * handles async PF.
+ */
+ spin_unlock(&b->lock);
+ cpu_relax();
+ goto again;
+ }
+ n->token = token;
+ n->cpu = smp_processor_id();
+ n->mm = NULL;
+ init_waitqueue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ } else
+ apf_task_wake_one(n);
+ spin_unlock(&b->lock);
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+ u32 reason = 0;
+
+ if (__get_cpu_var(apf_reason).enabled) {
+ reason = __get_cpu_var(apf_reason).reason;
+ __get_cpu_var(apf_reason).reason = 0;
+ }
+
+ return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ switch (kvm_read_and_reset_pf_reason()) {
+ default:
+ do_page_fault(regs, error_code);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ /* page is swapped out by the host. */
+ kvm_async_pf_task_wait((u32)read_cr2());
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ kvm_async_pf_task_wake((u32)read_cr2());
+ break;
+ }
+}
+
static void kvm_mmu_op(void *buffer, unsigned len)
{
int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
#endif
}
+void __cpuinit kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+ u64 pa = __pa(&__get_cpu_var(apf_reason));
+
+#ifdef CONFIG_PREEMPT
+ pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+ __get_cpu_var(apf_reason).enabled = 1;
+ printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+ smp_processor_id());
+ }
+}
+
+static void kvm_pv_disable_apf(void *unused)
+{
+ if (!__get_cpu_var(apf_reason).enabled)
+ return;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ __get_cpu_var(apf_reason).enabled = 0;
+
+ printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+ smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+#ifdef CONFIG_KVM_CLOCK
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+#endif
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static void kvm_guest_cpu_online(void *dummy)
+{
+ kvm_guest_cpu_init();
+}
+
+static void kvm_guest_cpu_offline(void *dummy)
+{
+ kvm_pv_disable_apf(NULL);
+ apf_task_wake_all();
+}
+
+static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_notify,
+};
+#endif
+
+static void __init kvm_apf_trap_init(void)
+{
+ set_intr_gate(14, &async_page_fault);
+}
+
void __init kvm_guest_init(void)
{
+ int i;
+
if (!kvm_para_available())
return;
paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ spin_lock_init(&async_pf_sleepers[i].lock);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ register_cpu_notifier(&kvm_cpu_notifier);
+#else
+ kvm_guest_cpu_init();
+#endif
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
-}
-#endif
-
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
x86_cpuinit.setup_percpu_clockev =
kvm_setup_secondary_clock;
#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
#include <linux/module.h>
#include <asm/kvm_emulate.h>
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
{
- return base + address_mask(c, reg);
+ return address_mask(c, reg);
}
static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, ops, c->seg_override);
+ return c->seg_override;
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr)
{
- return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- return seg_base(ctxt, ops, VCPU_SREG_SS);
-}
+ struct decode_cache *c = &ctxt->decode;
+ ulong la;
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
- u32 error, bool valid)
-{
- ctxt->exception = vec;
- ctxt->error_code = error;
- ctxt->error_code_valid = valid;
+ la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ if (c->ad_bytes != 8)
+ la &= (u32)-1;
+ return la;
}
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
{
- emulate_exception(ctxt, GP_VECTOR, err, true);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
}
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, PF_VECTOR, 0, true);
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, UD_VECTOR, 0, false);
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
}
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, TS_VECTOR, err, true);
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int emulate_de(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, DE_VECTOR, 0, false);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, NULL);
+ size, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- ulong addr,
+ struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+ rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+ ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+ addr.ea += 2;
+ rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+ ctxt->vcpu, &ctxt->exception);
return rc;
}
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
break;
}
}
- op->addr.mem = modrm_ea;
+ op->addr.mem.ea = modrm_ea;
done:
return rc;
}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- op->addr.mem = insn_fetch(u16, 2, c->eip);
+ op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
- op->addr.mem = insn_fetch(u32, 4, c->eip);
+ op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
- op->addr.mem = insn_fetch(u64, 8, c->eip);
+ op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
else if (c->src.bytes == 4)
sv = (s32)c->src.val & (s32)mask;
- c->dst.addr.mem += (sv >> 3);
+ c->dst.addr.mem.ea += (sv >> 3);
}
/* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct read_cache *mc = &ctxt->decode.mem_read;
- u32 err;
while (size) {
int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
- ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ rc = ops->read_emulated(addr, mc->data + mc->end, n,
+ &ctxt->exception, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct desc_ptr dt;
u16 index = selector >> 3;
int ret;
- u32 err;
ulong addr;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
{
struct desc_ptr dt;
u16 index = selector >> 3;
- u32 err;
ulong addr;
int ret;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct decode_cache *c = &ctxt->decode;
- u32 err;
switch (c->dst.type) {
case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
else
rc = ops->write_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.seg = VCPU_SREG_SS;
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
int rc;
+ struct segmented_address addr;
- rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]),
- dest, len);
+ addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
+ rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
change_mask |= EFLG_IF;
break;
case X86EMUL_MODE_VM86:
- if (iopl < 3) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
change_mask |= EFLG_IF;
break;
default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
-
return rc;
}
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
gva_t cs_addr;
gva_t eip_addr;
u16 cs, eip;
- u32 err;
/* TODO: Add limit checks */
c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
- if (temp_eip & ~0xffff) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
break;
case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
break;
}
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_16 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss16(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ return emulate_gp(ctxt, 0);
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_32 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss32(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ return emulate_gp(ctxt, 0);
}
desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
int reg, struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->addr.mem = register_address(c, base, c->regs[reg]);
+ op->addr.mem.ea = register_address(c, c->regs[reg]);
+ op->addr.mem.seg = seg;
}
static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+ return emulate_gp(ctxt, 0);
ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_IMM;
op->bytes = size;
- op->addr.mem = c->eip;
+ op->addr.mem.ea = c->eip;
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
@@ -2678,7 +2610,7 @@ done:
}
int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
struct operand memop = { .type = OP_NONE };
c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
+ c->fetch.start = c->eip;
+ c->fetch.end = c->fetch.start + insn_len;
+ if (insn_len > 0)
+ memcpy(c->fetch.data, insn, insn_len);
ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
c->execute = opcode.u.execute;
/* Unrecognised? */
- if (c->d == 0 || (c->d & Undefined)) {
- DPRINTF("Cannot emulate %02x\n", c->b);
+ if (c->d == 0 || (c->d & Undefined))
return -1;
- }
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
if (!c->has_seg_override)
set_seg_override(c, VCPU_SREG_DS);
- if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
- memop.addr.mem += seg_override_base(ctxt, ops, c);
+ memop.addr.mem.seg = seg_override(ctxt, ops, c);
if (memop.type == OP_MEM && c->ad_bytes != 8)
- memop.addr.mem = (u32)memop.addr.mem;
+ memop.addr.mem.ea = (u32)memop.addr.mem.ea;
if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem += c->eip;
+ memop.addr.mem.ea += c->eip;
/*
* Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
case SrcSI:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.mem =
- register_address(c, seg_override_base(ctxt, ops, c),
- c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.seg = seg_override(ctxt, ops, c),
c->src.val = 0;
break;
case SrcImmFAddr:
c->src.type = OP_IMM;
- c->src.addr.mem = c->eip;
+ c->src.addr.mem.ea = c->eip;
c->src.bytes = c->op_bytes + 2;
insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
break;
@@ -2944,7 +2876,7 @@ done_prefixes:
break;
case DstImmUByte:
c->dst.type = OP_IMM;
- c->dst.addr.mem = c->eip;
+ c->dst.addr.mem.ea = c->eip;
c->dst.bytes = 1;
c->dst.val = insn_fetch(u8, 1, c->eip);
break;
@@ -2969,9 +2901,9 @@ done_prefixes:
case DstDI:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.mem =
- register_address(c, es_base(ctxt, ops),
- c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.seg = VCPU_SREG_ES;
c->dst.val = 0;
break;
case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, c->src.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
&c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
&c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3215,13 +3147,13 @@ special_insn:
break;
case 0x8c: /* mov r/m, sreg */
if (c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->src.addr.mem;
+ c->dst.val = c->src.addr.mem.ea;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
@@ -3268,7 +3200,6 @@ special_insn:
break;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
goto cmp;
case 0xa8 ... 0xa9: /* test ax, imm */
goto test;
@@ -3363,7 +3294,7 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->dst.val,
c->src.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
break;
case 0xfa: /* cli */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else
ctxt->eflags &= ~X86_EFLAGS_IF;
break;
case 0xfb: /* sti */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else {
ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ string_addr_inc(ctxt, seg_override(ctxt, ops, c),
VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
&c->dst);
if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
ctxt->eip = c->eip;
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
break;
case 5: /* not defined */
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
+ emulate_invlpg(ctxt->vcpu,
+ linear(ctxt, c->src.addr.mem));
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
case 5 ... 7:
case 9 ... 15:
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
case 0x22: /* mov reg, cr */
if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3604,6 +3543,7 @@ twobyte_insn:
~0ULL : ~0U), ctxt->vcpu) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3615,6 +3555,7 @@ twobyte_insn:
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
/* rdmsr */
if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", c->b);
return -1;
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
if (old_ppr != ppr) {
apic_set_reg(apic, APIC_PROCPRI, ppr);
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ if (ppr < old_ppr)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbb04aee8301..9cafbb499813 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
*
*/
+#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
- shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
}
/*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
*/
-static int *slot_largepage_idx(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
- return &slot->lpage_info[level - 2][idx].write_count;
+ return &slot->lpage_info[level - 2][idx];
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count += 1;
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count += 1;
}
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count -= 1;
- WARN_ON(*write_count < 0);
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count -= 1;
+ WARN_ON(linfo->write_count < 0);
}
}
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
int level)
{
struct kvm_memory_slot *slot;
- int *largepage_idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot, level);
- return *largepage_idx;
+ linfo = lpage_info_slot(gfn, slot, level);
+ return linfo->write_count;
}
return 1;
@@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
- unsigned long idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+ linfo = lpage_info_slot(gfn, slot, level);
- return &slot->lpage_info[level - 2][idx].rmap_pde;
+ return &linfo->rmap_pde;
}
/*
@@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ gfn_t gfn = memslot->base_gfn + gfn_offset;
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- unsigned long idx;
- int sh;
-
- sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
- idx = ((memslot->base_gfn+gfn_offset) >> sh) -
- (memslot->base_gfn >> sh);
- ret |= handler(kvm,
- &memslot->lpage_info[j][idx].rmap_pde,
- data);
+ struct kvm_lpage_info *linfo;
+
+ linfo = lpage_info_slot(gfn, memslot,
+ PT_DIRECTORY_LEVEL + j);
+ ret |= handler(kvm, &linfo->rmap_pde, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
@@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync)
+ struct kvm_mmu_page *sp)
{
return 1;
}
@@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
- kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}
@@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writable)
{
- u64 spte;
+ u64 spte, entry = *sptep;
int ret = 0;
/*
@@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = shadow_base_present_pte;
+ spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
- if (reset_host_protection)
+ if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int level, gfn_t gfn, pfn_t pfn)
+ int map_writable, int level, gfn_t gfn, pfn_t pfn,
+ bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+ unsigned pte_access = ACC_ALL;
+
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
- level, gfn, pfn, false, true);
+ level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+ bool prefault)
{
int r;
int level;
pfn_t pfn;
unsigned long mmu_seq;
+ bool map_writable;
level = mapping_level(vcpu, gfn);
@@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, level, gfn, pfn);
+ r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+ prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access, struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vaddr;
}
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
+ u32 error_code, bool prefault)
{
gfn_t gfn;
int r;
@@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
+ error_code & PFERR_WRITE_MASK, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
- u32 error_code)
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ kvm_event_needs_reinjection(vcpu)))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable)
+{
+ bool async;
+
+ *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
+
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ put_page(pfn_to_page(*pfn));
+
+ if (!prefault && can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(gva, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(gva, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ return true;
+ }
+
+ *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
+
+ return false;
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+ bool prefault)
{
pfn_t pfn;
int r;
int level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return 0;
+
+ /* mmio */
if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- level, gfn, pfn);
+ r = __direct_map(vcpu, gpa, write, map_writable,
+ level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
mmu_free_roots(vcpu);
}
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr3;
+ return kvm_read_cr3(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu)
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- vcpu->arch.mmu.inject_page_fault(vcpu);
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ context->base_role.word = 0;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
@@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
}
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len)
{
int r;
enum emulation_result er;
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
goto out;
@@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
if (r)
goto out;
- er = emulate_instruction(vcpu, cr2, error_code, 0);
+ er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
switch (er) {
case EMULATE_DONE:
@@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (is_writable_pte(pt[i]))
- pt[i] &= ~PT_WRITABLE_MASK;
+ update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
-}
-
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
return 1;
}
@@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
#include <linux/ratelimit.h>
-static int audit_point;
-
-#define audit_printk(fmt, args...) \
+#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[audit_point], ##args)
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
if (sp->unsync) {
if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk("unsync sp: %p level = %d\n", sp, level);
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
return;
}
if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in unsync sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in unsync "
+ "sp: %p\n", sp);
return;
}
}
if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in direct sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+ sp);
return;
}
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
- vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
}
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
return;
- audit_printk("no memslot for gfn %llx\n", gfn);
- audit_printk("index %ld of sp (gfn=%llx)\n",
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
(long int)(sptep - rev_sp->spt), rev_sp->gfn);
dump_stack();
return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!*rmapp) {
if (!printk_ratelimit())
return;
- audit_printk("no rmap for writable spte %llx\n", *sptep);
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
dump_stack();
}
}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
- if (audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
}
static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
- audit_printk("shadow page has writable mappings: gfn "
- "%llx role %x\n", sp->gfn, sp->role.word);
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
if (!__ratelimit(&ratelimit_state))
return;
- audit_point = point;
+ vcpu->kvm->arch.audit_point = point;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..53210f1e94c2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
- u32 error_code;
+ struct x86_exception fault;
};
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
return 1;
error:
- walker->error_code = 0;
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = 0;
if (present)
- walker->error_code |= PFERR_PRESENT_MASK;
+ walker->fault.error_code |= PFERR_PRESENT_MASK;
- walker->error_code |= write_fault | user_fault;
+ walker->fault.error_code |= write_fault | user_fault;
if (fetch_fault && mmu->nx)
- walker->error_code |= PFERR_FETCH_MASK;
+ walker->fault.error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
- walker->error_code |= PFERR_RSVD_MASK;
+ walker->fault.error_code |= PFERR_RSVD_MASK;
- vcpu->arch.fault.address = addr;
- vcpu->arch.fault.error_code = walker->error_code;
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
- trace_kvm_mmu_walker_error(walker->error_code);
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;
gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gpte = gptep[i];
- if (!is_present_gpte(gpte) ||
- is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn)
+ int *ptwrite, pfn_t pfn, bool map_writable,
+ bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
- gw->gfn, pfn, false, true);
+ gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
@@ -539,6 +551,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
+ bool map_writable;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -556,8 +569,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+ if (!prefault) {
+ inject_page_fault(vcpu, &walker.fault);
+ /* reset fork detector */
+ vcpu->arch.last_pt_write_count = 0;
+ }
return 0;
}
@@ -568,7 +584,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -581,7 +600,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn);
+ level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -661,7 +680,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +691,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +710,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
@@ -730,12 +750,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if spte is dropped even though guest is
+ * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ * used by guest then tlbs are not flushed, so guest is allowed to access the
+ * freed pages.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@@ -764,31 +791,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;
gfn = gpte_to_gfn(gpte);
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
- || gfn != sp->gfns[i] || !is_present_gpte(gpte)
- || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
- if (is_present_gpte(gpte) || !clear_unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
+ vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
- } else {
- reset_host_protection = 1;
- }
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writable);
}
return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b81a9b7c2ca4..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/kvm_para.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
unsigned long vmexit_rax;
/* cache for intercepts of the guest */
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
@@ -123,7 +126,12 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
u32 *msrpm;
@@ -133,6 +141,7 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
+ u32 apf_reason;
};
#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-static inline bool is_nested(struct vcpu_svm *svm)
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
- return svm->nested.vmcb;
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
}
static inline void enable_gif(struct vcpu_svm *svm)
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm->next_rip = svm->vmcb->control.next_rip;
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
+ if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
- if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
/*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
svm_features = cpuid_edx(SVM_CPUID_FUNC);
- if (!svm_has(SVM_FEATURE_NPT))
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
}
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.tsc_offset += adjustment;
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_save_area *save = &svm->vmcb->save;
svm->vcpu.fpu_active = 1;
+ svm->vcpu.arch.hflags = 0;
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR) |
- (1 << MC_VECTOR);
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_SELECTIVE_CR0) |
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_READ);
+ set_dr_intercept(svm, INTERCEPT_DR1_READ);
+ set_dr_intercept(svm, INTERCEPT_DR2_READ);
+ set_dr_intercept(svm, INTERCEPT_DR3_READ);
+ set_dr_intercept(svm, INTERCEPT_DR4_READ);
+ set_dr_intercept(svm, INTERCEPT_DR5_READ);
+ set_dr_intercept(svm, INTERCEPT_DR6_READ);
+ set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_HLT);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ set_intercept(svm, INTERCEPT_XSETBV);
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_INVLPG));
- control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
- control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
+ clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
}
- force_new_asid(&svm->vcpu);
+ svm->asid_generation = 0;
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
control->pause_filter_count = 3000;
- control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ set_intercept(svm, INTERCEPT_PAUSE);
}
+ mark_all_dirty(svm->vmcb);
+
enable_gif(svm);
}
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
}
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
int i;
++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ load_gs_index(svm->host.gs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, svm->host.gs);
+#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
static void svm_set_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+ set_intercept(svm, INTERCEPT_VINTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+ clr_intercept(svm, INTERCEPT_VINTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
static void update_cr0_intercept(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
+ mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
- vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
- }
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
- svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- }
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
/*
* We are here because we run in nested mode, the host kvm
* intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
= (svm->vmcb->save.cs.attrib
>> SVM_SELECTOR_DPL_SHIFT) & 3;
+ mark_dirty(svm->vmcb, VMCB_SEG);
}
static void update_db_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &=
- ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+ clr_exception_intercept(svm, DB_VECTOR);
+ clr_exception_intercept(svm, BP_VECTOR);
if (svm->nmi_singlestep)
- svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- svm->vmcb->control.intercept_exceptions |=
- 1 << DB_VECTOR;
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- svm->vmcb->control.intercept_exceptions |=
- 1 << BP_VECTOR;
+ set_exception_intercept(svm, BP_VECTOR);
} else
vcpu->guest_debug = 0;
}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
else
svm->vmcb->save.dr7 = vcpu->arch.dr7;
- update_db_intercept(vcpu);
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
+ mark_dirty(svm->vmcb, VMCB_DR);
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
+ update_db_intercept(vcpu);
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
}
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
{
- u64 fault_address;
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u32 error_code;
+ int r = 1;
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+ switch (svm->apf_reason) {
+ default:
+ error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
}
static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
{
int er;
- er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 excp;
-
- if (is_nested(svm)) {
- u32 h_excp, n_excp;
-
- h_excp = svm->nested.hsave->control.intercept_exceptions;
- n_excp = svm->nested.intercept_exceptions;
- h_excp &= ~(1 << NM_VECTOR);
- excp = h_excp | n_excp;
- } else {
- excp = svm->vmcb->control.intercept_exceptions;
- excp &= ~(1 << NM_VECTOR);
- }
- svm->vmcb->control.intercept_exceptions = excp;
+ clr_exception_intercept(svm, NM_VECTOR);
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ svm_flush_tlb(vcpu);
}
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.exit_code = SVM_EXIT_NPF;
svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
- svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+ svm->vmcb->control.exit_info_1 = fault->error_code;
+ svm->vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm);
}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
{
int vmexit;
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
/* This function returns true if it is save to enable the irq window */
static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
/* This function returns true if it is save to enable the nmi window */
static inline bool nested_svm_nmi(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs */
- if (!npt_enabled)
+ /* When we're shadowing, trap PFs, but not async PF */
+ if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr_read & cr_bits)
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
- case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (svm->nested.intercept_cr_write & cr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr_read & dr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (svm->nested.intercept_dr_write & dr_bits)
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
if (svm->nested.intercept_exceptions & excp_bits)
vmexit = NESTED_EXIT_DONE;
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->apf_reason != 0)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
struct vmcb_control_area *dst = &dst_vmcb->control;
struct vmcb_control_area *from = &from_vmcb->control;
- dst->intercept_cr_read = from->intercept_cr_read;
- dst->intercept_cr_write = from->intercept_cr_write;
- dst->intercept_dr_read = from->intercept_dr_read;
- dst->intercept_dr_write = from->intercept_dr_write;
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
dst->intercept_exceptions = from->intercept_exceptions;
dst->intercept = from->intercept;
dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_vmcb)
return 1;
- /* Exit nested SVM mode */
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
svm->nested.vmcb = 0;
/* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.idtr = vmcb->save.idtr;
nested_vmcb->save.efer = svm->vcpu.arch.efer;
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
+ mark_all_dirty(svm->vmcb);
+
nested_svm_unmap(page);
nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
nested_vmcb->control.intercept_exceptions,
nested_vmcb->control.intercept);
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled)
hsave->save.cr3 = vmcb->save.cr3;
else
- hsave->save.cr3 = svm->vcpu.arch.cr3;
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
copy_vmcb_control_area(hsave, vmcb);
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
- svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
- svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
- svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
- svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- force_new_asid(&svm->vcpu);
+ svm_flush_tlb(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
/* We only want the cr8 intercept bits of the guest */
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
/* We don't want to see VMMCALLs from a nested guest */
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
- /*
- * We don't want a nested guest to be more powerful than the guest, so
- * all intercepts are ORed
- */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+ clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_svm_unmap(page);
- /* nested_vmcb is our indicator if nested SVM is activated */
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
svm->nested.vmcb = vmcb_gpa;
enable_gif(svm);
+ mark_all_dirty(svm->vmcb);
+
return true;
}
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+
return 1;
}
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
return 1;
}
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm)
{
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
}
static int invlpg_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ err = kvm_set_cr0(&svm->vcpu, val);
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+ kvm_complete_insn_gp(&svm->vcpu, err);
+
+ return 1;
}
static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
int r;
- r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (svm->nested.vmexit_rip) {
kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
svm->nested.vmexit_rip = 0;
}
- return r == EMULATE_DONE;
+ return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ val = kvm_register_read(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ err = kvm_get_dr(&svm->vcpu, dr, &val);
+ if (!err)
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+
+ return 1;
}
static int cr8_write_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (irqchip_in_kernel(svm->vcpu.kvm)) {
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- return 1;
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ return r;
}
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return 1;
+ return r;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc_offset;
+ struct vmcb *vmcb = get_host_vmcb(svm);
- if (is_nested(svm))
- tsc_offset = svm->nested.hsave->control.tsc_offset;
- else
- tsc_offset = svm->vmcb->control.tsc_offset;
-
- *data = tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset + native_read_tsc();
break;
}
case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!svm_has(SVM_FEATURE_LBRV)) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
return 1;
svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = cr0_write_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_READ_DR4] = emulate_on_interception,
- [SVM_EXIT_READ_DR5] = emulate_on_interception,
- [SVM_EXIT_READ_DR6] = emulate_on_interception,
- [SVM_EXIT_READ_DR7] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
};
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_save_area *save = &svm->vmcb->save;
pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr_read);
- pr_err("cr_write: %04x\n", control->intercept_cr_write);
- pr_err("dr_read: %04x\n", control->intercept_dr_read);
- pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
+ pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
+ pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
+ pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
pr_err("exceptions: %08x\n", control->intercept_exceptions);
pr_err("intercepts: %016llx\n", control->intercept);
pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
}
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu);
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
return 1;
}
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
int vmexit;
trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
if (irr == -1)
return;
if (tpr >= irr)
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
- force_new_asid(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
- save_host_msrs(vcpu);
- savesegment(fs, fs_selector);
- savesegment(gs, gs_selector);
- ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
@@ -3389,19 +3635,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- load_host_msrs(vcpu);
- kvm_load_ldt(ldt_selector);
- loadsegment(fs, fs_selector);
#ifdef CONFIG_X86_64
- load_gs_index(gs_selector);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
- loadsegment(gs, gs_selector);
+ loadsegment(fs, svm->host.fs);
#endif
reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->apf_reason = kvm_read_and_reset_pf_reason();
+
if (npt_enabled) {
vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
}
#undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+ svm_flush_tlb(vcpu);
}
static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_NPT);
/* Also sync guest cr3 here in case we live migrate */
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
}
static int is_disabled(void)
@@ -3494,10 +3747,6 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
- case 0x00000001:
- /* Mask out xsave bit as long as it is not supported by SVM */
- entry->ecx &= ~(bit(X86_FEATURE_XSAVE));
- break;
case 0x80000001:
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
@@ -3511,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
additional features */
/* Support next_rip if host supports it */
- if (svm_has(SVM_FEATURE_NRIP))
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
entry->edx |= SVM_FEATURE_NRIP;
/* Support NPT for the guest if enabled */
@@ -3571,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_WBINVD, "wbinvd" },
{ SVM_EXIT_MONITOR, "monitor" },
{ SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_XSETBV, "xsetbv" },
{ SVM_EXIT_NPF, "npf" },
{ -1, NULL }
};
@@ -3594,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
- if (is_nested(svm))
- svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ set_exception_intercept(svm, NM_VECTOR);
update_cr0_intercept(svm);
}
@@ -3627,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -3667,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+ .get_exit_info = svm_get_exit_info,
.exit_reasons_str = svm_exit_reasons_str,
+
.get_lpage_level = svm_get_lpage_level,
.cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
- TP_ARGS(exit_reason, vcpu),
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
),
- TP_printk("reason %s rip 0x%lx",
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
ftrace_print_symbols_seq(p, __entry->exit_reason,
kvm_x86_ops->exit_reasons_str),
- __entry->guest_rip)
+ __entry->guest_rip, __entry->info1, __entry->info2)
);
/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81fcbe9515c5..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static int __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static bool cpu_has_load_ia32_efer;
+
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
static unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value;
+ unsigned long value = 0;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "=a"(value) : "d"(field) : "cc");
+ : "+a"(value) : "d"(field) : "cc");
return value;
}
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_write64(GUEST_IA32_EFER, guest_val);
+ vmcs_write64(HOST_IA32_EFER, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -1009,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /* Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set, then
+ * the instruction is already executing and RIP has already been
+ * advanced. */
+ if (!yield_on_hlt &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@@ -1035,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+ vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -1305,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
&& tboot_enabled())
return 1;
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
+ && !tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ " activate TXT before enabling KVM\n");
return 1;
+ }
}
return 0;
@@ -1400,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -1416,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
- min = CPU_BASED_HLT_EXITING |
+ min =
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@@ -1429,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
+
+ if (yield_on_hlt)
+ min |= CPU_BASED_HLT_EXITING;
+
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1510,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
return 0;
}
@@ -1683,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xfffff);
+ vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+ " aligned when entering protected mode (seg=%d)",
+ seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1814,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1857,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1937,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+ guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -2725,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -2787,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2814,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+ vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2841,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+ vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2849,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2910,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@@ -3007,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -3026,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
if (vmx->rmode.vm86_active &&
@@ -3098,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3118,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- skip_emulated_instruction(vcpu);
-}
-
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -3143,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = kvm_set_cr0(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 3:
err = kvm_set_cr3(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 4:
err = kvm_set_cr4(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = kvm_register_read(vcpu, reg);
- kvm_set_cr8(vcpu, cr8);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr8(vcpu, cr8);
+ kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
if (cr8_prev <= cr8)
@@ -3176,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- trace_kvm_cr_read(cr, vcpu->arch.cr3);
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
case 8:
@@ -3349,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3377,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3476,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+ return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3592,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
return handle_interrupt_window(&vmx->vcpu);
- err = emulate_instruction(vcpu, 0, 0, 0);
+ err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
ret = 0;
@@ -3649,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3676,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -3686,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu);
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
- /* Access CR3 don't cause VMExit in paging mode, so we need
- * to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4013,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_PDPTR));
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4280,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4320,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+ .get_exit_info = vmx_get_exit_info,
.exit_reasons_str = vmx_exit_reasons_str,
+
.get_lpage_level = vmx_get_lpage_level,
.cpuid_update = vmx_cpuid_update,
@@ -4396,8 +4472,6 @@ static int __init vmx_init(void)
if (enable_ept) {
bypass_guest_pf = 0;
- kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
- VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46a368cb651e..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
+#include <linux/hash.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -155,6 +156,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -326,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
- unsigned error_code = vcpu->arch.fault.error_code;
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = vcpu->arch.fault.address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
- vcpu->arch.mmu.inject_page_fault(vcpu);
-
- vcpu->arch.fault.nested = false;
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -460,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
- offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+ gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
@@ -506,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.cr3))
+ kvm_read_cr3(vcpu)))
return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
+ if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ kvm_clear_async_pf_completion_queue(vcpu);
+
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -595,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
return 1;
if (cr4 & X86_CR4_VMXE)
@@ -615,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return 0;
@@ -650,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
return 1;
vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
@@ -665,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
return 0;
}
-
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (__kvm_set_cr8(vcpu, cr8))
- kvm_inject_gp(vcpu, 0);
-}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -775,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 7
+#define KVM_SAVE_MSRS_BEGIN 8
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -830,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
- kvm_mmu_reset_context(vcpu);
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@@ -1418,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 2:5 are resrved, Should be zero */
+ if (data & 0x3c)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1499,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
break;
}
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1775,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
+ case MSR_KVM_ASYNC_PF_EN:
+ data = vcpu->arch.apf.msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1904,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1922,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2185,6 +2229,11 @@ out:
return r;
}
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@@ -2259,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
+ cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
+ cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -2350,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
+ cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
break;
}
@@ -3169,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memslots *slots, *old_slots;
unsigned long *dirty_bitmap;
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
+ dirty_bitmap = memslot->dirty_bitmap_head;
+ if (memslot->dirty_bitmap == dirty_bitmap)
+ dirty_bitmap += n / sizeof(long);
memset(dirty_bitmap, 0, n);
r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots) {
- vfree(dirty_bitmap);
+ if (!slots)
goto out;
- }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+ slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
@@ -3195,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_unlock(&kvm->mmu_lock);
r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
- vfree(dirty_bitmap);
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
goto out;
- }
- vfree(dirty_bitmap);
} else {
r = -EFAULT;
if (clear_user(log->dirty_bitmap, n))
@@ -3266,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
+ mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -3278,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
+ mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -3557,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
{
gpa_t t_gpa;
- u32 error;
+ struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
- if (t_gpa == UNMAPPED_GVA)
- vcpu->arch.fault.nested = true;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
return t_gpa;
}
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
/* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3630,31 +3682,35 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK, error);
+ access | PFERR_FETCH_MASK,
+ exception);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- error);
+ exception);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
static int kvm_write_guest_virt_system(gva_t addr, void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -3662,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
PFERR_WRITE_MASK,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3688,7 +3742,7 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3701,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3710,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@@ -3735,7 +3789,7 @@ mmio:
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
@@ -3749,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3787,7 +3841,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3795,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ rc = emulator_write_emulated_onepage(addr, val, now, exception,
vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -3803,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ return emulator_write_emulated_onepage(addr, val, bytes, exception,
vcpu);
}
@@ -3821,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3879,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
+ return emulator_write_emulated(addr, new, bytes, exception, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3904,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
if (vcpu->arch.pio.count)
goto data_avail;
- trace_kvm_pio(0, port, size, 1);
+ trace_kvm_pio(0, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 1;
@@ -3932,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
const void *val, unsigned int count,
struct kvm_vcpu *vcpu)
{
- trace_kvm_pio(1, port, size, 1);
+ trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 0;
@@ -3973,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
if (kvm_x86_ops->has_wbinvd_exit()) {
- preempt_disable();
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
- preempt_enable();
+ put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- }
- wbinvd();
+ } else
+ wbinvd();
return X86EMUL_CONTINUE;
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4019,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
value = vcpu->arch.cr2;
break;
case 3:
- value = vcpu->arch.cr3;
+ value = kvm_read_cr3(vcpu);
break;
case 4:
value = kvm_read_cr4(vcpu);
@@ -4053,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- res = __kvm_set_cr8(vcpu, val & 0xfUL);
+ res = kvm_set_cr8(vcpu, val);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4206,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception == PF_VECTOR)
- kvm_propagate_fault(vcpu);
- else if (ctxt->error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_propagate_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
else
- kvm_queue_exception(vcpu, ctxt->exception);
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4267,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
+ int r = EMULATE_DONE;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (!is_guest_mode(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_FAIL;
+ }
kvm_queue_exception(vcpu, UD_VECTOR);
- return EMULATE_FAIL;
+
+ return r;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4302,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
return false;
}
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- u16 error_code,
- int emulation_type)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len)
{
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4323,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.exception = -1;
+ vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false;
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
@@ -4389,7 +4453,7 @@ restart:
}
done:
- if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
@@ -4413,7 +4477,7 @@ done:
return r;
}
-EXPORT_SYMBOL_GPL(emulate_instruction);
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
@@ -4653,7 +4717,6 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
- kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5116,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
}
r = kvm_mmu_reload(vcpu);
@@ -5244,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = 1;
while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5257,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@@ -5278,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
+
+ kvm_check_async_pf_completion(vcpu);
+
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5302,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
+ if (!tsk_used_math(current) && init_fpu(current))
+ return -ENOMEM;
+
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -5313,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- kvm_set_cr8(vcpu, kvm_run->cr8);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
if (vcpu->arch.pio.count || vcpu->mmio_needed) {
if (vcpu->mmio_needed) {
@@ -5323,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_needed = 0;
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r != EMULATE_DONE) {
r = 0;
@@ -5436,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr3 = kvm_read_cr3(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -5504,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -5522,7 +5604,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (sregs->cr4 & X86_CR4_OSXSAVE)
update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -5773,6 +5855,8 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.apf.msr_val = 0;
+
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@@ -5792,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -5881,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
+ kvm_async_pf_hash_reset(vcpu);
+
return 0;
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
@@ -5906,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5921,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
spin_lock_init(&kvm->arch.tsc_write_lock);
- return kvm;
+ return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5939,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
+ }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -5964,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
- cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6051,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
+ || !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6110,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ is_error_page(work->page))
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ if (is_error_page(work->page))
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+ !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ vcpu->arch.apf.halted = false;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return !kvm_event_needs_reinjection(vcpu) &&
+ kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);