summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/paging_tmpl.h
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/paging_tmpl.h')
-rw-r--r--arch/x86/kvm/paging_tmpl.h258
1 files changed, 115 insertions, 143 deletions
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5ec..507e2b844cfa 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+ bool last)
{
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ if (last && !is_dirty_gpte(gpte))
+ access &= ~ACC_WRITE_MASK;
+
#if PTTYPE == 64
if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
return access;
}
+static bool FNAME(is_last_gpte)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t gpte)
+{
+ if (walker->level == PT_PAGE_TABLE_LEVEL)
+ return true;
+
+ if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
+ (PTTYPE == 64 || is_pse(vcpu)))
+ return true;
+
+ if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
+ (mmu->root_level == PT64_ROOT_LEVEL))
+ return true;
+
+ return false;
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gfn_t table_gfn;
unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- bool eperm, present, rsvd_fault;
- int offset, write_fault, user_fault, fetch_fault;
-
- write_fault = access & PFERR_WRITE_MASK;
- user_fault = access & PFERR_USER_MASK;
- fetch_fault = access & PFERR_FETCH_MASK;
+ bool eperm;
+ int offset;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
-walk:
- present = true;
- eperm = rsvd_fault = false;
+retry_walk:
+ eperm = false;
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
@@ -144,10 +165,8 @@ walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte)) {
- present = false;
+ if (!is_present_gpte(pte))
goto error;
- }
--walker->level;
}
#endif
@@ -170,42 +189,31 @@ walk:
real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (unlikely(real_gfn == UNMAPPED_GVA)) {
- present = false;
- break;
- }
+ if (unlikely(real_gfn == UNMAPPED_GVA))
+ goto error;
real_gfn = gpa_to_gfn(real_gfn);
host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
- if (unlikely(kvm_is_error_hva(host_addr))) {
- present = false;
- break;
- }
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
- present = false;
- break;
- }
+ if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+ goto error;
trace_kvm_mmu_paging_element(pte, walker->level);
- if (unlikely(!is_present_gpte(pte))) {
- present = false;
- break;
- }
+ if (unlikely(!is_present_gpte(pte)))
+ goto error;
if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
walker->level))) {
- rsvd_fault = true;
- break;
+ errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
}
- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- eperm = true;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ if (!check_write_user_access(vcpu, write_fault, user_fault,
+ pte))
eperm = true;
#if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
eperm = true;
#endif
- if (!eperm && !rsvd_fault
- && unlikely(!(pte & PT_ACCESSED_MASK))) {
+ if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
int ret;
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
pte, pte|PT_ACCESSED_MASK);
- if (unlikely(ret < 0)) {
- present = false;
- break;
- } else if (ret)
- goto walk;
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
- if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
- ((walker->level == PT_DIRECTORY_LEVEL) &&
- is_large_pte(pte) &&
- (PTTYPE == 64 || is_pse(vcpu))) ||
- ((walker->level == PT_PDPE_LEVEL) &&
- is_large_pte(pte) &&
- mmu->root_level == PT64_ROOT_LEVEL)) {
+ if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
int lvl = walker->level;
gpa_t real_gpa;
gfn_t gfn;
u32 ac;
+ /* check if the kernel is fetching from user page */
+ if (unlikely(pte_access & PT_USER_MASK) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (fetch_fault && !user_fault)
+ eperm = true;
+
gfn = gpte_to_gfn_lvl(pte, lvl);
gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
@@ -266,12 +270,14 @@ walk:
break;
}
- pt_access = pte_access;
+ pt_access &= FNAME(gpte_access)(vcpu, pte, false);
--walker->level;
}
- if (unlikely(!present || eperm || rsvd_fault))
+ if (unlikely(eperm)) {
+ errcode |= PFERR_PRESENT_MASK;
goto error;
+ }
if (write_fault && unlikely(!is_dirty_gpte(pte))) {
int ret;
@@ -279,17 +285,17 @@ walk:
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
pte, pte|PT_DIRTY_MASK);
- if (unlikely(ret < 0)) {
- present = false;
+ if (unlikely(ret < 0))
goto error;
- } else if (ret)
- goto walk;
+ else if (ret)
+ goto retry_walk;
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
return 1;
error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (mmu->nx ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ errcode |= PFERR_FETCH_MASK;
+
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
- walker->fault.error_code = 0;
- if (present)
- walker->fault.error_code |= PFERR_PRESENT_MASK;
-
- walker->fault.error_code |= write_fault | user_fault;
-
- if (fetch_fault && mmu->nx)
- walker->fault.error_code |= PFERR_FETCH_MASK;
- if (rsvd_fault)
- walker->fault.error_code |= PFERR_RSVD_MASK;
-
+ walker->fault.error_code = errcode;
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
pt_element_t gpte)
{
- u64 nonpresent = shadow_trap_nonpresent_pte;
-
if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- nonpresent = shadow_notrap_nonpresent_pte;
+ if (!is_present_gpte(gpte))
goto no_present;
- }
if (!(gpte & PT_ACCESSED_MASK))
goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
return false;
no_present:
- drop_spte(vcpu->kvm, spte, nonpresent);
+ drop_spte(vcpu->kvm, spte);
return true;
}
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return;
}
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
unsigned pte_access;
gfn_t gfn;
pfn_t pfn;
- bool dirty;
if (spte == sptep)
continue;
- if (*spte != shadow_trap_nonpresent_pte)
+ if (is_shadow_present_pte(*spte))
continue;
gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
- if (is_error_pfn(pfn)) {
+ pte_access & ACC_WRITE_MASK);
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ NULL, PT_PAGE_TABLE_LEVEL, gfn,
pfn, true, true);
}
}
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable,
+ int *emulate, pfn_t pfn, bool map_writable,
bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
int top_level;
unsigned direct_access;
struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- direct_access = gw->pt_access & gw->pte_access;
- if (!dirty)
- direct_access &= ~ACC_WRITE_MASK;
+ direct_access = gw->pte_access;
top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
- user_fault, write_fault, dirty, ptwrite, it.level,
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
+ user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
u64 *sptep;
- int write_pt = 0;
+ int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, addr, error_code,
+ mmu_is_nested(vcpu));
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
&map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+ walker.gfn, pfn, walker.pte_access, &r))
+ return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable, prefault);
+ level, &emulate, pfn, map_writable, prefault);
(void)sptep;
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- sptep, *sptep, write_pt);
+ pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+ sptep, *sptep, emulate);
- if (!write_pt)
+ if (!emulate)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
++vcpu->stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
- return write_pt;
+ return emulate;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
u64 *sptep;
int need_flush = 0;
+ vcpu_clear_mmio_info(vcpu, gva);
+
spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (is_shadow_present_pte(*sptep)) {
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
- drop_spte(vcpu->kvm, sptep,
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
need_flush = 1;
- } else
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else if (is_mmio_spte(*sptep))
+ mmu_spte_clear_no_track(sptep);
+
break;
}
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
return gpa;
}
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i, j, offset, r;
- pt_element_t pt[256 / sizeof(pt_element_t)];
- gpa_t pte_gpa;
-
- if (sp->role.direct
- || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
- nonpaging_prefetch_page(vcpu, sp);
- return;
- }
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- if (PTTYPE == 32) {
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
- pte_gpa += offset * sizeof(pt_element_t);
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
- pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
- for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_gpte(pt[j]))
- sp->spt[i+j] = shadow_trap_nonpresent_pte;
- else
- sp->spt[i+j] = shadow_notrap_nonpresent_pte;
- }
-}
-
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gpa_t pte_gpa;
gfn_t gfn;
- if (!is_shadow_present_pte(sp->spt[i]))
+ if (!sp->spt[i])
continue;
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- gfn = gpte_to_gfn(gpte);
-
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+ if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ continue;
+
if (gfn != sp->gfns[i]) {
- drop_spte(vcpu->kvm, &sp->spt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
}