9 files changed, 1443 insertions, 29 deletions
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 82e73e2b953d..09a9e6dfc09f 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
 
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8e245e764c21..54200208bf24 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
 #include <linux/vmalloc.h>
 #include <linux/err.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include <asm/switch_to.h>
@@ -946,3 +947,241 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 		return 0;
 	return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
 }
+
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @dat_protection: pointer to the accumulated DAT protection result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+				  unsigned long *pgt, int *dat_protection,
+				  int *fake)
+{
+	struct gmap *parent = sg->parent;
+	union asce asce;
+	union vaddress vaddr;
+	unsigned long ptr;
+	int rc;
+
+	*fake = 0;
+	*dat_protection = 0;
+	vaddr.addr = saddr;
+	asce.val = sg->orig_asce;
+	ptr = asce.origin * 4096;
+	if (asce.r) {
+		*fake = 1;
+		asce.dt = ASCE_TYPE_REGION1;
+	}
+	switch (asce.dt) {
+	case ASCE_TYPE_REGION1:
+		if (vaddr.rfx01 > asce.tl && !asce.r)
+			return PGM_REGION_FIRST_TRANS;
+		break;
+	case ASCE_TYPE_REGION2:
+		if (vaddr.rfx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.rsx01 > asce.tl)
+			return PGM_REGION_SECOND_TRANS;
+		break;
+	case ASCE_TYPE_REGION3:
+		if (vaddr.rfx || vaddr.rsx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.rtx01 > asce.tl)
+			return PGM_REGION_THIRD_TRANS;
+		break;
+	case ASCE_TYPE_SEGMENT:
+		if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+			return PGM_ASCE_TYPE;
+		if (vaddr.sx01 > asce.tl)
+			return PGM_SEGMENT_TRANSLATION;
+		break;
+	}
+
+	switch (asce.dt) {
+	case ASCE_TYPE_REGION1: {
+		union region1_table_entry rfte;
+
+		if (*fake) {
+			/* offset in 16EB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.rfx << 53UL);
+			rfte.val = ptr;
+			goto shadow_r2t;
+		}
+		rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+		if (rc)
+			return rc;
+		if (rfte.i)
+			return PGM_REGION_FIRST_TRANS;
+		if (rfte.tt != TABLE_TYPE_REGION1)
+			return PGM_TRANSLATION_SPEC;
+		if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+			return PGM_REGION_SECOND_TRANS;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rfte.p;
+		ptr = (unsigned long) rfte.rto << 12UL;
+shadow_r2t:
+		rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+		if (rc)
+			return rc;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_REGION2: {
+		union region2_table_entry rste;
+
+		if (*fake) {
+			/* offset in 8PB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.rsx << 42UL);
+			rste.val = ptr;
+			goto shadow_r3t;
+		}
+		rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+		if (rc)
+			return rc;
+		if (rste.i)
+			return PGM_REGION_SECOND_TRANS;
+		if (rste.tt != TABLE_TYPE_REGION2)
+			return PGM_TRANSLATION_SPEC;
+		if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+			return PGM_REGION_THIRD_TRANS;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rste.p;
+		ptr = (unsigned long) rste.rto << 12UL;
+shadow_r3t:
+		rste.p |= *dat_protection;
+		rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+		if (rc)
+			return rc;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_REGION3: {
+		union region3_table_entry rtte;
+
+		if (*fake) {
+			/* offset in 4TB guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.rtx << 31UL);
+			rtte.val = ptr;
+			goto shadow_sgt;
+		}
+		rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+		if (rc)
+			return rc;
+		if (rtte.i)
+			return PGM_REGION_THIRD_TRANS;
+		if (rtte.tt != TABLE_TYPE_REGION3)
+			return PGM_TRANSLATION_SPEC;
+		if (rtte.cr && asce.p && sg->edat_level >= 2)
+			return PGM_TRANSLATION_SPEC;
+		if (rtte.fc && sg->edat_level >= 2) {
+			*dat_protection |= rtte.fc0.p;
+			*fake = 1;
+			ptr = (unsigned long) rtte.fc1.rfaa << 31UL;
+			rtte.val = ptr;
+			goto shadow_sgt;
+		}
+		if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+			return PGM_SEGMENT_TRANSLATION;
+		if (sg->edat_level >= 1)
+			*dat_protection |= rtte.fc0.p;
+		ptr = (unsigned long) rtte.fc0.sto << 12UL;
+shadow_sgt:
+		rtte.fc0.p |= *dat_protection;
+		rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+		if (rc)
+			return rc;
+		/* fallthrough */
+	}
+	case ASCE_TYPE_SEGMENT: {
+		union segment_table_entry ste;
+
+		if (*fake) {
+			/* offset in 2G guest memory block */
+			ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+			ste.val = ptr;
+			goto shadow_pgt;
+		}
+		rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+		if (rc)
+			return rc;
+		if (ste.i)
+			return PGM_SEGMENT_TRANSLATION;
+		if (ste.tt != TABLE_TYPE_SEGMENT)
+			return PGM_TRANSLATION_SPEC;
+		if (ste.cs && asce.p)
+			return PGM_TRANSLATION_SPEC;
+		*dat_protection |= ste.fc0.p;
+		if (ste.fc && sg->edat_level >= 1) {
+			*fake = 1;
+			ptr = (unsigned long) ste.fc1.sfaa << 20UL;
+			ste.val = ptr;
+			goto shadow_pgt;
+		}
+		ptr = (unsigned long) ste.fc0.pto << 11UL;
+shadow_pgt:
+		ste.fc0.p |= *dat_protection;
+		rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+		if (rc)
+			return rc;
+	}
+	}
+	/* Return the parent address of the page table */
+	*pgt = ptr;
+	return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ *	    - > 0 (pgm exception code) on exceptions while faulting
+ *	    - -EAGAIN if the caller can retry immediately
+ *	    - -EFAULT when accessing invalid guest addresses
+ *	    - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+			  unsigned long saddr)
+{
+	union vaddress vaddr;
+	union page_table_entry pte;
+	unsigned long pgt;
+	int dat_protection, fake;
+	int rc;
+
+	down_read(&sg->mm->mmap_sem);
+	/*
+	 * We don't want any guest-2 tables to change - so the parent
+	 * tables/pointers we read stay valid - unshadowing is however
+	 * always possible - only guest_table_lock protects us.
+	 */
+	ipte_lock(vcpu);
+
+	rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+	if (rc)
+		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+					    &fake);
+
+	vaddr.addr = saddr;
+	if (!rc && fake) {
+		/* pgt is only set on success; offset in 1MB guest block */
+		pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+		goto shadow_page;
+	}
+	if (!rc)
+		rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+	if (!rc && pte.i)
+		rc = PGM_PAGE_TRANSLATION;
+	if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+		rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+	pte.p |= dat_protection;
+	if (!rc)
+		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+	ipte_unlock(vcpu);
+	up_read(&sg->mm->mmap_sem);
+	return rc;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd8159..8756569ad938 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
 int ipte_lock_held(struct kvm_vcpu *vcpu);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+			  unsigned long saddr);
+
 #endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d72c4a877622..ca19627779db 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 		swake_up(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
+	/*
+	 * The VCPU might not be sleeping but is executing the VSIE. Let's
+	 * kick it, so it leaves the SIE to process the request.
+	 */
+	kvm_s390_vsie_kick(vcpu);
 }
 
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0dcf9b8fc12c..03eeeb0ded24 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/mman.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/slab.h>
@@ -98,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
 	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+	{ "instruction_sie", VCPU_STAT(instruction_sie) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
 	{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
 	{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -123,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[16] = {
 	0xffe6000000000000UL,
@@ -141,6 +148,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
 static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
 
 static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 
 /* Section: not file related */
@@ -150,7 +158,8 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+			      unsigned long end);
 
 /*
  * This callback is executed during stop_machine(). All CPUs are therefore
@@ -172,6 +181,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
 			vcpu->arch.sie_block->epoch -= *delta;
 			if (vcpu->arch.cputm_enabled)
 				vcpu->arch.cputm_start += *delta;
+			if (vcpu->arch.vsie_block)
+				vcpu->arch.vsie_block->epoch -= *delta;
 		}
 	}
 	return NOTIFY_OK;
@@ -184,7 +195,9 @@ static struct notifier_block kvm_clock_notifier = {
 int kvm_arch_hardware_setup(void)
 {
 	gmap_notifier.notifier_call = kvm_gmap_notifier;
-	gmap_register_ipte_notifier(&gmap_notifier);
+	gmap_register_pte_notifier(&gmap_notifier);
+	vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+	gmap_register_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_register(&s390_epoch_delta_notifier,
 				       &kvm_clock_notifier);
 	return 0;
@@ -192,7 +205,8 @@ int kvm_arch_hardware_setup(void)
 
 void kvm_arch_hardware_unsetup(void)
 {
-	gmap_unregister_ipte_notifier(&gmap_notifier);
+	gmap_unregister_pte_notifier(&gmap_notifier);
+	gmap_unregister_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
 					 &kvm_clock_notifier);
 }
@@ -250,6 +264,46 @@ static void kvm_s390_cpu_feat_init(void)
 
 	if (MACHINE_HAS_ESOP)
 		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+	/*
+	 * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+	 * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+	 */
+	if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+	    !test_facility(3) || !nested)
+		return;
+	allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+	if (sclp.has_64bscao)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+	if (sclp.has_siif)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+	if (sclp.has_gpere)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+	if (sclp.has_gsls)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+	if (sclp.has_ib)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+	if (sclp.has_cei)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+	if (sclp.has_ibs)
+		allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+	/*
+	 * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+	 * all skey handling functions read/set the skey from the PGSTE
+	 * instead of the real storage key.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+	 * pages being detected as preserved although they are resident.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+	 * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+	 *
+	 * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+	 * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+	 * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+	 *
+	 * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+	 * cannot easily shadow the SCA because of the ipte lock.
+	 */
 }
 
 int kvm_arch_init(void *opaque)
@@ -530,20 +584,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		if (!new_limit)
 			return -EINVAL;
 
-		/* gmap_alloc takes last usable address */
+		/* gmap_create takes last usable address */
 		if (new_limit != KVM_S390_NO_MEM_LIMIT)
 			new_limit -= 1;
 
 		ret = -EBUSY;
 		mutex_lock(&kvm->lock);
 		if (!kvm->created_vcpus) {
-			/* gmap_alloc will round the limit up */
-			struct gmap *new = gmap_alloc(current->mm, new_limit);
+			/* gmap_create will round the limit up */
+			struct gmap *new = gmap_create(current->mm, new_limit);
 
 			if (!new) {
 				ret = -ENOMEM;
 			} else {
-				gmap_free(kvm->arch.gmap);
+				gmap_remove(kvm->arch.gmap);
 				new->private = kvm;
 				kvm->arch.gmap = new;
 				ret = 0;
@@ -1392,7 +1446,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		else
 			kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
 						    sclp.hamax + 1);
-		kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+		kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
 		if (!kvm->arch.gmap)
 			goto out_err;
 		kvm->arch.gmap->private = kvm;
@@ -1404,6 +1458,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.epoch = 0;
 
 	spin_lock_init(&kvm->arch.start_stop_lock);
+	kvm_s390_vsie_init(kvm);
 	KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
 	return 0;
@@ -1425,7 +1480,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 		sca_del_vcpu(vcpu);
 
 	if (kvm_is_ucontrol(vcpu->kvm))
-		gmap_free(vcpu->arch.gmap);
+		gmap_remove(vcpu->arch.gmap);
 
 	if (vcpu->kvm->arch.use_cmma)
 		kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1458,16 +1513,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	debug_unregister(kvm->arch.dbf);
 	free_page((unsigned long)kvm->arch.sie_page2);
 	if (!kvm_is_ucontrol(kvm))
-		gmap_free(kvm->arch.gmap);
+		gmap_remove(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
+	kvm_s390_vsie_destroy(kvm);
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
 /* Section: vcpu related */
 static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+	vcpu->arch.gmap = gmap_create(current->mm, -1UL);
 	if (!vcpu->arch.gmap)
 		return -ENOMEM;
 	vcpu->arch.gmap->private = vcpu->kvm;
@@ -1717,7 +1773,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	save_access_regs(vcpu->arch.host_acrs);
 	restore_access_regs(vcpu->run->s.regs.acrs);
-	gmap_enable(vcpu->arch.gmap);
+	gmap_enable(vcpu->arch.enabled_gmap);
 	atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__start_cpu_timer_accounting(vcpu);
@@ -1730,7 +1786,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__stop_cpu_timer_accounting(vcpu);
 	atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
-	gmap_disable(vcpu->arch.gmap);
+	vcpu->arch.enabled_gmap = gmap_get_enabled();
+	gmap_disable(vcpu->arch.enabled_gmap);
 
 	/* Save guest register state */
 	save_fpu_regs();
@@ -1779,7 +1836,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 		vcpu->arch.gmap = vcpu->kvm->arch.gmap;
 		sca_add_vcpu(vcpu);
 	}
-
+	/* make vcpu_load load the right gmap on the first trigger */
+	vcpu->arch.enabled_gmap = vcpu->arch.gmap;
 }
 
 static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1976,16 +2034,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
 	kvm_s390_vcpu_request(vcpu);
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+			      unsigned long end)
 {
-	int i;
 	struct kvm *kvm = gmap->private;
 	struct kvm_vcpu *vcpu;
+	unsigned long prefix;
+	int i;
 
+	if (gmap_is_shadow(gmap))
+		return;
+	if (start >= 1UL << 31)
+		/* We are only interested in prefix pages */
+		return;
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		/* match against both prefix pages */
-		if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
-			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+		prefix = kvm_s390_get_prefix(vcpu);
+		if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+				   start, end);
 			kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
 		}
 	}
@@ -2264,16 +2331,16 @@ retry:
 		return 0;
 	/*
 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
-	 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+	 * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
 	 * This ensures that the ipte instruction for this request has
 	 * already finished. We might race against a second unmapper that
 	 * wants to set the blocking bit. Lets just retry the request loop.
 	 */
 	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
 		int rc;
-		rc = gmap_ipte_notify(vcpu->arch.gmap,
-				      kvm_s390_get_prefix(vcpu),
-				      PAGE_SIZE * 2);
+		rc = gmap_mprotect_notify(vcpu->arch.gmap,
+					  kvm_s390_get_prefix(vcpu),
+					  PAGE_SIZE * 2, PROT_WRITE);
 		if (rc)
 			return rc;
 		goto retry;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 52aa47e112d8..031f451bb2cf 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+	return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -252,6 +252,14 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
 
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+				 unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 3db3be139992..c77ad2dc334f 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = {
 	[0x10] = handle_set_prefix,
 	[0x11] = handle_store_prefix,
 	[0x12] = handle_store_cpu_address,
+	[0x14] = kvm_s390_handle_vsie,
 	[0x21] = handle_ipte_interlock,
 	[0x29] = handle_iske,
 	[0x2a] = handle_rrbe,
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1f1b..1a252f537081 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
 	const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
 	u16 p_asn, s_asn;
 	psw_t *psw;
-	u32 flags;
+	bool idle;
 
-	flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+	idle = is_vcpu_idle(dst_vcpu);
 	psw = &dst_vcpu->arch.sie_block->gpsw;
 	p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff;  /* Primary ASN */
 	s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff;  /* Secondary ASN */
 
 	/* Inject the emergency signal? */
-	if (!(flags & CPUSTAT_STOPPED)
+	if (!is_vcpu_stopped(dst_vcpu)
 	    || (psw->mask & psw_int_mask) != psw_int_mask
-	    || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
-	    || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+	    || (idle && psw->addr != 0)
+	    || (!idle && (asn == p_asn || asn == s_asn))) {
 		return __inject_sigp_emergency(vcpu, dst_vcpu);
 	} else {
 		*reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 000000000000..6895e7b3be12
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+	struct kvm_s390_sie_block scb_s;	/* 0x0000 */
+	/* the pinned original scb */
+	struct kvm_s390_sie_block *scb_o;	/* 0x0200 */
+	/* the shadow gmap in use by the vsie_page */
+	struct gmap *gmap;			/* 0x0208 */
+	/* address of the last reported fault to guest2 */
+	unsigned long fault_addr;		/* 0x0210 */
+	__u8 reserved[0x0700 - 0x0218];		/* 0x0218 */
+	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
+	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
+} __packed;
+
+/* flag a validity interception with the given reason code in the scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+			     __u16 reason_code)
+{
+	scb->icptcode = ICPT_VALIDITY;
+	scb->ipa = 0x1000;
+	scb->ipb = (__u32) reason_code << 16;
+	return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+	atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+	prefix_unmapped(vsie_page);
+	if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+		atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+	while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+		cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+	atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+	return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* refresh the STOP/IO/EXT intervention requests in the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+	const int mask = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+	int requested = atomic_read(&vsie_page->scb_o->cpuflags) & mask;
+	atomic_t *shadow_flags = &vsie_page->scb_s.cpuflags;
+
+	atomic_andnot(mask, shadow_flags);
+	atomic_or(requested, shadow_flags);
+}
+
+/* shadow (filter and validate) the cpuflags  */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+	/* we don't allow ESA/390 guests */
+	if (!(cpuflags & CPUSTAT_ZARCH))
+		return set_validity_icpt(scb_s, 0x0001U);
+
+	if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+		return set_validity_icpt(scb_s, 0x0001U);
+	else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+		return set_validity_icpt(scb_s, 0x0007U);
+
+	/* intervention requests will be set later */
+	newflags = CPUSTAT_ZARCH;
+	if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+		newflags |= CPUSTAT_GED;
+	if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+		if (cpuflags & CPUSTAT_GED)
+			return set_validity_icpt(scb_s, 0x0001U);
+		newflags |= CPUSTAT_GED2;
+	}
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+		newflags |= cpuflags & CPUSTAT_P;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+		newflags |= cpuflags & CPUSTAT_SM;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+		newflags |= cpuflags & CPUSTAT_IBS;
+
+	atomic_set(&scb_s->cpuflags, newflags);
+	return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+	unsigned long *b1, *b2;
+	u8 ecb3_flags;
+
+	scb_s->crycbd = 0;
+	if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+		return 0;
+	/* format-1 is supported with message-security-assist extension 3 */
+	if (!test_kvm_facility(vcpu->kvm, 76))
+		return 0;
+	/* we may only allow it if enabled for guest 2 */
+	ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+		     (ECB3_AES | ECB3_DEA);
+	if (!ecb3_flags)
+		return 0;
+
+	if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+		return set_validity_icpt(scb_s, 0x003CU);
+	else if (!crycb_addr)
+		return set_validity_icpt(scb_s, 0x0039U);
+
+	/* copy only the wrapping keys, into the fields that are xored below */
+	b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+	if (read_guest_real(vcpu, crycb_addr + 72, b1, 56))
+		return set_validity_icpt(scb_s, 0x0035U);
+
+	scb_s->ecb3 |= ecb3_flags;
+	scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+			CRYCB_FORMAT2;
+
+	/* xor both blocks in one run */
+	b2 = (unsigned long *)
+			    vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+	/* as 56%8 == 0, bitmap_xor won't overwrite any data */
+	bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+	return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	__u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+	scb_s->ibc = 0;
+	/* ibc installed in g2 and requested for g3 */
+	if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+		scb_s->ibc = scb_o->ibc & 0x0fffU;
+		/* take care of the minimum ibc level of the machine */
+		if (scb_s->ibc < min_ibc)
+			scb_s->ibc = min_ibc;
+		/* take care of the maximum ibc level set for the guest */
+		if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+			scb_s->ibc = vcpu->kvm->arch.model.ibc;
+	}
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+	/* interception */
+	scb_o->icptcode = scb_s->icptcode;
+	scb_o->icptstatus = scb_s->icptstatus;
+	scb_o->ipa = scb_s->ipa;
+	scb_o->ipb = scb_s->ipb;
+	scb_o->gbea = scb_s->gbea;
+
+	/* timer */
+	scb_o->cputm = scb_s->cputm;
+	scb_o->ckc = scb_s->ckc;
+	scb_o->todpr = scb_s->todpr;
+
+	/* guest state */
+	scb_o->gpsw = scb_s->gpsw;
+	scb_o->gg14 = scb_s->gg14;
+	scb_o->gg15 = scb_s->gg15;
+	memcpy(scb_o->gcr, scb_s->gcr, 128);
+	scb_o->pp = scb_s->pp;
+
+	/* interrupt intercept */
+	switch (scb_s->icptcode) {
+	case ICPT_PROGI:
+	case ICPT_INSTPROGI:
+	case ICPT_EXTINT:
+		memcpy((void *)((u64)scb_o + 0xc0),
+		       (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+		break;
+	case ICPT_PARTEXEC:
+		/* MVPG only */
+		memcpy((void *)((u64)scb_o + 0xc0),
+		       (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+		break;
+	}
+
+	if (scb_s->ihcpu != 0xffffU)
+		scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ *          - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	bool had_tx = scb_s->ecb & 0x10U;
+	unsigned long new_mso = 0;
+	int rc;
+
+	/* make sure we don't have any leftovers when reusing the scb */
+	scb_s->icptcode = 0;
+	scb_s->eca = 0;
+	scb_s->ecb = 0;
+	scb_s->ecb2 = 0;
+	scb_s->ecb3 = 0;
+	scb_s->ecd = 0;
+	scb_s->fac = 0;
+
+	rc = prepare_cpuflags(vcpu, vsie_page);
+	if (rc)
+		goto out;
+
+	/* timer */
+	scb_s->cputm = scb_o->cputm;
+	scb_s->ckc = scb_o->ckc;
+	scb_s->todpr = scb_o->todpr;
+	scb_s->epoch = scb_o->epoch;
+
+	/* guest state */
+	scb_s->gpsw = scb_o->gpsw;
+	scb_s->gg14 = scb_o->gg14;
+	scb_s->gg15 = scb_o->gg15;
+	memcpy(scb_s->gcr, scb_o->gcr, 128);
+	scb_s->pp = scb_o->pp;
+
+	/* interception / execution handling */
+	scb_s->gbea = scb_o->gbea;
+	scb_s->lctl = scb_o->lctl;
+	scb_s->svcc = scb_o->svcc;
+	scb_s->ictl = scb_o->ictl;
+	/*
+	 * SKEY handling functions can't deal with false setting of PTE invalid
+	 * bits. Therefore we cannot provide interpretation and would later
+	 * have to provide own emulation handlers.
+	 */
+	scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+	scb_s->icpua = scb_o->icpua;
+
+	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+		new_mso = scb_o->mso & 0xfffffffffff00000UL;
+	/* if the hva of the prefix changes, we have to remap the prefix */
+	if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+		prefix_unmapped(vsie_page);
+	/* SIE will do mso/msl validity and exception checks for us */
+	scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+	scb_s->mso = new_mso;
+	scb_s->prefix = scb_o->prefix;
+
+	/* We definitely have to flush the tlb if this scb never ran */
+	if (scb_s->ihcpu != 0xffffU)
+		scb_s->ihcpu = scb_o->ihcpu;
+
+	/* MVPG and Protection Exception Interpretation are always available */
+	scb_s->eca |= scb_o->eca & 0x01002000U;
+	/* Host-protection-interruption introduced with ESOP */
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+		scb_s->ecb |= scb_o->ecb & 0x02U;
+	/* transactional execution */
+	if (test_kvm_facility(vcpu->kvm, 73)) {
+		/* remap the prefix if tx is toggled on */
+		if ((scb_o->ecb & 0x10U) && !had_tx)
+			prefix_unmapped(vsie_page);
+		scb_s->ecb |= scb_o->ecb & 0x10U;
+	}
+	/* SIMD */
+	if (test_kvm_facility(vcpu->kvm, 129)) {
+		scb_s->eca |= scb_o->eca & 0x00020000U;
+		scb_s->ecd |= scb_o->ecd & 0x20000000U;
+	}
+	/* Run-time-Instrumentation */
+	if (test_kvm_facility(vcpu->kvm, 64))
+		scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+		scb_s->eca |= scb_o->eca & 0x00000001U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+		scb_s->eca |= scb_o->eca & 0x40000000U;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+		scb_s->eca |= scb_o->eca & 0x80000000U;
+
+	prepare_ibc(vcpu, vsie_page);
+	rc = shadow_crycb(vcpu, vsie_page);
+out:
+	if (rc)
+		unshadow_scb(vcpu, vsie_page);
+	return rc;
+}
+
+/*
+ * gmap notifier: react to a change in the range given by start and end.
+ *
+ * Every vsie page that currently references @gmap and whose two prefix pages
+ * intersect the range gets its prefix marked as unmapped via
+ * prefix_unmapped_sync(). Non-shadow gmaps and ranges starting at or above
+ * 2 GB are ignored.
+ */
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+				 unsigned long end)
+{
+	struct kvm *kvm = gmap->private;
+	struct vsie_page *vsie_page;
+	unsigned long first, last;
+	struct page *page;
+	int idx;
+
+	/* we only care about shadow gmaps, and there only about prefix pages */
+	if (!gmap_is_shadow(gmap) || start >= 1UL << 31)
+		return;
+
+	/*
+	 * Shadow blocks are only ever added to the list during runtime,
+	 * so every entry we find can be referenced safely.
+	 */
+	for (idx = 0; idx < kvm->arch.vsie.page_count; idx++) {
+		page = READ_ONCE(kvm->arch.vsie.pages[idx]);
+		if (!page)
+			continue;
+		vsie_page = page_to_virt(page);
+		if (READ_ONCE(vsie_page->gmap) != gmap)
+			continue;
+		/* with mso/msl, the prefix lies at an offset */
+		first = vsie_page->scb_s.prefix << GUEST_PREFIX_SHIFT;
+		first += vsie_page->scb_s.mso;
+		last = first + 2 * PAGE_SIZE - 1;
+		if (first <= end && start <= last)
+			prefix_unmapped_sync(vsie_page);
+	}
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * The prefix is flagged as mapped before the pages are faulted in, so that
+ * a concurrent unmapper is noticed. If faulting in fails, the prefix is
+ * flagged as unmapped again. A positive return code or -EFAULT of
+ * kvm_s390_shadow_fault() is converted into validity intercept 0x0037 by
+ * means of set_validity_icpt().
+ *
+ * Returns: - 0 if successfully mapped or already mapped
+ *          - > 0 if control has to be given to guest 2
+ *          - -EAGAIN if the caller can retry immediately
+ *          - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+	int rc;
+
+	if (prefix_is_mapped(vsie_page))
+		return 0;
+
+	/* mark it as mapped so we can catch any concurrent unmappers */
+	prefix_mapped(vsie_page);
+
+	/* with mso/msl, the prefix lies at offset *mso* */
+	prefix += scb_s->mso;
+
+	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+	if (!rc && (scb_s->ecb & 0x10U))
+		rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+					   prefix + PAGE_SIZE);
+	/*
+	 * We don't have to mprotect, we will be called for all unshadows.
+	 * SIE will detect if protection applies and trigger a validity.
+	 */
+	if (rc)
+		prefix_unmapped(vsie_page);
+	if (rc > 0 || rc == -EFAULT)
+		rc = set_validity_icpt(scb_s, 0x0037U);
+	return rc;
+}
+
+/*
+ * Pin the guest page backing @gpa and hand back the matching host address
+ * (including the offset into the page) through @hpa. The page is always
+ * pinned writable.
+ *
+ * Returns: - 0 on success
+ *          - -EINVAL if the gpa is not valid guest storage
+ *          - -ENOMEM if no page could be pinned
+ *          - any negative error code reported by get_user_pages_fast()
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+	const hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+	struct page *pinned;
+	int nr_pinned;
+
+	if (kvm_is_error_hva(hva))
+		return -EINVAL;
+
+	/* exactly one page, writable */
+	nr_pinned = get_user_pages_fast(hva, 1, 1, &pinned);
+	if (nr_pinned < 0)
+		return nr_pinned;
+	if (nr_pinned != 1)
+		return -ENOMEM;
+
+	*hpa = (hpa_t) page_to_virt(pinned) + (gpa & ~PAGE_MASK);
+	return 0;
+}
+
+/*
+ * Release a page that was pinned via pin_guest_page(). The page is marked
+ * dirty and the guest frame of @gpa is reported as dirty as well.
+ */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+	struct page *pinned = virt_to_page(hpa);
+
+	set_page_dirty_lock(pinned);
+	put_page(pinned);
+	/* unconditionally report the frame as dirty for migration */
+	mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/*
+ * Counterpart of pin_blocks(): every block whose host address is still
+ * recorded in the shadow scb is unpinned (marking it dirty) and the recorded
+ * address is cleared. The guest address is recomputed from the original scb.
+ */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm *kvm = vcpu->kvm;
+	hpa_t host;
+	gpa_t guest;
+
+	/* sca */
+	host = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+	if (host) {
+		guest = scb_o->scaol & ~0xfUL;
+		if (test_kvm_cpu_feat(kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+			guest |= (u64) scb_o->scaoh << 32;
+		unpin_guest_page(kvm, guest, host);
+		scb_s->scaol = 0;
+		scb_s->scaoh = 0;
+	}
+
+	/* itdba */
+	host = scb_s->itdba;
+	if (host) {
+		guest = scb_o->itdba & ~0xffUL;
+		unpin_guest_page(kvm, guest, host);
+		scb_s->itdba = 0;
+	}
+
+	/* gvrd */
+	host = scb_s->gvrd;
+	if (host) {
+		guest = scb_o->gvrd & ~0x1ffUL;
+		unpin_guest_page(kvm, guest, host);
+		scb_s->gvrd = 0;
+	}
+
+	/* riccbd */
+	host = scb_s->riccbd;
+	if (host) {
+		guest = scb_o->riccbd & ~0x3fUL;
+		unpin_guest_page(kvm, guest, host);
+		scb_s->riccbd = 0;
+	}
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * The host address of every pinned block is stored in the matching field of
+ * the shadow scb (scaoh/scaol, itdba, gvrd, riccbd). unpin_blocks() uses
+ * exactly these fields to find and release the pinned pages again, so each
+ * block has to be recorded in its own field. On any failure, everything
+ * pinned so far is unpinned before returning.
+ *
+ * Returns: - 0 if all blocks were pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	hpa_t hpa;
+	gpa_t gpa;
+	int rc = 0;
+
+	gpa = scb_o->scaol & ~0xfUL;
+	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+		gpa |= (u64) scb_o->scaoh << 32;
+	if (gpa) {
+		if (!(gpa & ~0x1fffUL))
+			rc = set_validity_icpt(scb_s, 0x0038U);
+		else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+			rc = set_validity_icpt(scb_s, 0x0011U);
+		else if ((gpa & PAGE_MASK) !=
+			 ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+			rc = set_validity_icpt(scb_s, 0x003bU);
+		if (!rc) {
+			rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+			if (rc == -EINVAL)
+				rc = set_validity_icpt(scb_s, 0x0034U);
+		}
+		if (rc)
+			goto unpin;
+		scb_s->scaoh = (u32)((u64)hpa >> 32);
+		scb_s->scaol = (u32)(u64)hpa;
+	}
+
+	gpa = scb_o->itdba & ~0xffUL;
+	if (gpa && (scb_s->ecb & 0x10U)) {
+		/*
+		 * The mask has to be 64 bit wide: a 32 bit mask would be
+		 * zero-extended and addresses >= 4 GB with small lower
+		 * 32 bits would wrongly be detected as lying below 8k.
+		 */
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x0080U);
+			goto unpin;
+		}
+		/* 256 bytes cannot cross page boundaries */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x0080U);
+		if (rc)
+			goto unpin;
+		scb_s->itdba = hpa;
+	}
+
+	gpa = scb_o->gvrd & ~0x1ffUL;
+	if (gpa && (scb_s->eca & 0x00020000U) &&
+	    !(scb_s->ecd & 0x20000000U)) {
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x1310U);
+			goto unpin;
+		}
+		/*
+		 * 512 bytes vector registers cannot cross page boundaries
+		 * if this block gets bigger, we have to shadow it.
+		 */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x1310U);
+		if (rc)
+			goto unpin;
+		scb_s->gvrd = hpa;
+	}
+
+	gpa = scb_o->riccbd & ~0x3fUL;
+	if (gpa && (scb_s->ecb3 & 0x01U)) {
+		if (!(gpa & ~0x1fffUL)) {
+			rc = set_validity_icpt(scb_s, 0x0043U);
+			goto unpin;
+		}
+		/* 64 bytes cannot cross page boundaries */
+		rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+		if (rc == -EINVAL)
+			rc = set_validity_icpt(scb_s, 0x0043U);
+		/* Validity 0x0044 will be checked by SIE */
+		if (rc)
+			goto unpin;
+		/*
+		 * Record the address in riccbd (not gvrd): otherwise the
+		 * pinned gvrd address gets overwritten and unpin_blocks()
+		 * never finds the riccbd page.
+		 */
+		scb_s->riccbd = hpa;
+	}
+	return 0;
+unpin:
+	unpin_blocks(vcpu, vsie_page);
+	return rc;
+}
+
+/*
+ * Unpin the scb provided by guest 2 (if one is pinned), marking it as dirty,
+ * and forget its host address.
+ */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+		      gpa_t gpa)
+{
+	if (vsie_page->scb_o)
+		unpin_guest_page(vcpu->kvm, gpa, (hpa_t) vsie_page->scb_o);
+	vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb that guest 2 provided at @gpa and remember its host address in
+ * vsie_page->scb_o. If the gpa is not valid guest storage, an addressing
+ * exception is injected into guest 2 instead.
+ *
+ * Returns: - 0 if the scb was pinned.
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 on error (e.g. -ENOMEM if out of memory)
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+		   gpa_t gpa)
+{
+	hpa_t hpa;
+	int rc;
+
+	rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+	if (!rc) {
+		vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+		return 0;
+	}
+	if (rc != -EINVAL)
+		return rc;
+
+	/* invalid guest storage: guest 2 gets an addressing exception */
+	rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	return rc ? rc : 1;
+}
+
+/*
+ * Inject a program interrupt with the given code for @vaddr into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+			bool write_flag)
+{
+	struct kvm_s390_pgm_info pgm = {
+		.code = code,
+		.exc_access_id = 0, /* always primary */
+		.op_access_id = 0, /* not MVPG */
+	};
+	int rc;
+
+	/* 0-51: virtual address */
+	pgm.trans_exc_code = vaddr & 0xfffffffffffff000UL;
+	/* 52-53: store / fetch */
+	pgm.trans_exc_code |= (((unsigned int) !write_flag) + 1) << 10;
+	/* 62-63: asce id stays 0 (always primary) */
+	if (code == PGM_PROTECTION)
+		pgm.trans_exc_code |= 0x4UL;
+
+	rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+	if (rc)
+		return rc;
+	return 1;
+}
+
+/*
+ * Handle a fault that occurred while executing the vsie on a gmap shadow.
+ *
+ * Protection exceptions are forwarded to guest 2 directly. Everything else
+ * is tried to be resolved via kvm_s390_shadow_fault(); if that reports a
+ * program interruption code, the fault is injected into guest 2 and the
+ * faulting address is remembered in vsie_page->fault_addr.
+ *
+ * Returns: - 0 if the fault was resolved
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	int rc;
+
+	/* protection exceptions can be forwarded right away */
+	if (current->thread.gmap_int_code == PGM_PROTECTION)
+		return inject_fault(vcpu, PGM_PROTECTION,
+				    current->thread.gmap_addr, 1);
+
+	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+				   current->thread.gmap_addr);
+	if (rc <= 0)
+		return rc;
+
+	/* guest 2 has to resolve it, remember the address for the retry */
+	rc = inject_fault(vcpu, rc, current->thread.gmap_addr,
+			  current->thread.gmap_write_flag);
+	if (rc >= 0)
+		vsie_page->fault_addr = current->thread.gmap_addr;
+	return rc;
+}
+
+/*
+ * If the previous fault had to be handed to guest 2, try to fault the
+ * remembered address in again before entering the SIE. This saves one
+ * superfluous SIE entry that would exit right away.
+ *
+ * Errors are ignored on purpose: the next SIE fault will go through the
+ * regular fault handling. The remembered address is always reset.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+			      struct vsie_page *vsie_page)
+{
+	if (vsie_page->fault_addr != 0)
+		kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+				      vsie_page->fault_addr);
+
+	vsie_page->fault_addr = 0;
+}
+
+/* Reset the intercept code of the shadow scb, i.e. no intercept is pending. */
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+	scb_s->icptcode = 0;
+}
+
+/*
+ * Rewind the psw of the shadow scb by the length of the intercepted
+ * instruction and clear the vsie icpt, so execution can be retried.
+ */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	int ilen;
+
+	if (scb_s->icptstatus & 1) {
+		/* EXECUTE: the length is provided in the icptstatus */
+		ilen = (scb_s->icptstatus >> 4) & 0x6;
+		if (ilen == 0)
+			ilen = 4;
+	} else {
+		ilen = insn_length(scb_s->ipa >> 8);
+	}
+
+	scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+	clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the facility list provided by guest 2.
+ *
+ * Only done if guest 2 provided a list and facility 7 is available. In that
+ * case, the intercepted instruction is retried with the list copied into the
+ * vsie page. A list that cannot be read results in validity icpt 0x1090.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ *          - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	__u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+	/* leave the icpt for guest 2 if there is nothing we can shadow */
+	if (!fac || !test_kvm_facility(vcpu->kvm, 7))
+		return 0;
+
+	retry_vsie_icpt(vsie_page);
+	if (read_guest_real(vcpu, fac, &vsie_page->fac,
+			    sizeof(vsie_page->fac)))
+		return set_validity_icpt(scb_s, 0x1090U);
+	scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+	return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Before entering the SIE via sie64a(), the previously remembered fault is
+ * retried, pending rescheduling and machine checks are handled and the srcu
+ * read lock is dropped; it is reacquired right after leaving the SIE.
+ *
+ * A positive return code of sie64a() is treated as success, -EFAULT is
+ * passed on to handle_fault(). Afterwards the intercept code of the shadow
+ * scb is inspected: instruction intercepts with ipa 0xb2b0 are given to
+ * handle_stfle(), stop intercepts that guest 2 did not request are cleared.
+ * NOTE(review): for validity intercepts, the topmost nibble of ipa is
+ * incremented unless it is already 0xf - the meaning of that nibble is not
+ * visible here, confirm against the SIE documentation.
+ *
+ * Returns: - 0 everything went fine
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	int rc;
+
+	handle_last_fault(vcpu, vsie_page);
+
+	if (need_resched())
+		schedule();
+	if (test_cpu_flag(CIF_MCCK_PENDING))
+		s390_handle_mcck();
+
+	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+	local_irq_disable();
+	kvm_guest_enter();
+	local_irq_enable();
+
+	rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+	local_irq_disable();
+	kvm_guest_exit();
+	local_irq_enable();
+	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	if (rc > 0)
+		rc = 0; /* we could still have an icpt */
+	else if (rc == -EFAULT)
+		return handle_fault(vcpu, vsie_page);
+
+	switch (scb_s->icptcode) {
+	case ICPT_INST:
+		if (scb_s->ipa == 0xb2b0)
+			rc = handle_stfle(vcpu, vsie_page);
+		break;
+	case ICPT_STOP:
+		/* stop not requested by g2 - must have been a kick */
+		if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+			clear_vsie_icpt(vsie_page);
+		break;
+	case ICPT_VALIDITY:
+		if ((scb_s->ipa & 0xf000) != 0xf000)
+			scb_s->ipa += 0x1000;
+		break;
+	}
+	return rc;
+}
+
+/*
+ * Drop the reference to the gmap shadow of the vsie page (if any), forget
+ * the shadow and mark the prefix as unmapped.
+ */
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+	struct gmap *shadow = vsie_page->gmap;
+
+	if (shadow)
+		gmap_put(shadow);
+	WRITE_ONCE(vsie_page->gmap, NULL);
+	prefix_unmapped(vsie_page);
+}
+
+/*
+ * Make sure the vsie page holds a gmap shadow matching the current ASCE
+ * (control register 1) and EDAT level of the VCPU.
+ *
+ * A still valid shadow is reused. Otherwise the old one is released (which
+ * also marks the prefix as unmapped) and a new shadow is created.
+ *
+ * Returns: - 0 if a valid shadow is attached to the vsie page
+ *          - < 0 the error reported by gmap_shadow()
+ */
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+			       struct vsie_page *vsie_page)
+{
+	unsigned long asce;
+	union ctlreg0 cr0;
+	struct gmap *gmap;
+	int edat = 0;
+
+	asce = vcpu->arch.sie_block->gcr[1];
+	cr0.val = vcpu->arch.sie_block->gcr[0];
+	/* level 1 needs facility 8, level 2 facility 78 on top of that */
+	if (cr0.edat && test_kvm_facility(vcpu->kvm, 8))
+		edat = test_kvm_facility(vcpu->kvm, 78) ? 2 : 1;
+
+	/*
+	 * ASCE or EDAT could have changed since last icpt, or the gmap
+	 * we're holding has been unshadowed. Only a gmap that is still
+	 * valid can be reused, everything else has to be replaced.
+	 */
+	if (!vsie_page->gmap ||
+	    !gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
+		/* drops the old shadow - if any - and unmaps the prefix */
+		release_gmap_shadow(vsie_page);
+		gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+		if (IS_ERR(gmap))
+			return PTR_ERR(gmap);
+		gmap->private = vcpu->kvm;
+		WRITE_ONCE(vsie_page->gmap, gmap);
+	}
+	return 0;
+}
+
+/*
+ * Announce the shadow scb at the VCPU, e.g. so the VCPU can be kicked out
+ * of the vsie, and adjust the epoch of the shadow scb.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+				struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *shadow = &vsie_page->scb_s;
+
+	WRITE_ONCE(vcpu->arch.vsie_block, shadow);
+	/*
+	 * Simulate wait state: external calls then have to kick the vcpu
+	 * and therefore the vsie.
+	 */
+	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	/*
+	 * The g3 epoch is relative to the g2 epoch. On tod clock changes,
+	 * kvm_sync_clock adjusts the epoch automatically.
+	 */
+	preempt_disable();
+	shadow->epoch += vcpu->kvm->arch.epoch;
+	preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ *
+ * Reverts register_shadow_scb(): the simulated wait state is removed from
+ * the cpuflags of the VCPU and vcpu->arch.vsie_block is reset to NULL.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+	atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * The vsie is reentered until an error or an intercept occurs, a signal is
+ * pending or the VCPU has a deliverable interrupt. -EAGAIN simply results
+ * in another round. -EFAULT is additionally presented as an addressing
+ * exception program intercept in the shadow scb.
+ *
+ * Returns: - 0 if no errors occurred
+ *          - > 0 if control has to be given to guest 2
+ *          - < 0 if an error occurred (e.g. -ENOMEM if out of memory)
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	int rc;
+
+	do {
+		rc = acquire_gmap_shadow(vcpu, vsie_page);
+		if (!rc)
+			rc = map_prefix(vcpu, vsie_page);
+		if (!rc) {
+			gmap_enable(vsie_page->gmap);
+			update_intervention_requests(vsie_page);
+			rc = do_vsie_run(vcpu, vsie_page);
+			gmap_enable(vcpu->arch.gmap);
+		}
+		atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+		/* the caller of the failing step asked for a retry */
+		if (rc == -EAGAIN)
+			rc = 0;
+	} while (!rc && !scb_s->icptcode && !signal_pending(current) &&
+		 !kvm_s390_vcpu_has_irq(vcpu, 0));
+
+	if (rc != -EFAULT)
+		return rc;
+
+	/*
+	 * Addressing exceptions are always presented as intercepts.
+	 * As addressing exceptions are suppressing and our guest 3 PSW
+	 * points at the responsible instruction, we have to
+	 * forward the PSW and set the ilc. If we can't read guest 3
+	 * instruction, we can use an arbitrary ilc. Let's always use
+	 * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+	 * memory. (we could also fake the shadow so the hardware
+	 * handles it).
+	 */
+	scb_s->icptcode = ICPT_PROGI;
+	scb_s->iprcc = PGM_ADDRESSING;
+	scb_s->pgmilc = 4;
+	scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+	return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * A page reference count of 1 means "unused", 2 means "in use by one VCPU".
+ * page->index holds the scb address the page is registered for in the radix
+ * tree, or ~0UL while the page has no entry in the tree. ~0UL can never be a
+ * valid scb address, as scb addresses are aligned to 512 bytes.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ *          - NULL if the scb address could not be registered, e.g. because
+ *            the same scb address is already used by another VCPU
+ *          - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+	struct vsie_page *vsie_page;
+	struct page *page;
+	int nr_vcpus;
+
+	rcu_read_lock();
+	page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+	rcu_read_unlock();
+	if (page) {
+		if (page_ref_inc_return(page) == 2) {
+			if (page->index == addr)
+				return page_to_virt(page);
+			/*
+			 * We raced with someone reusing + putting this vsie
+			 * page for another scb before we grabbed it.
+			 */
+		}
+		page_ref_dec(page);
+	}
+
+	/*
+	 * We want at least #online_vcpus shadows, so every VCPU can execute
+	 * the VSIE in parallel.
+	 */
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+	mutex_lock(&kvm->arch.vsie.mutex);
+	if (kvm->arch.vsie.page_count < nr_vcpus) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+		if (!page) {
+			mutex_unlock(&kvm->arch.vsie.mutex);
+			return ERR_PTR(-ENOMEM);
+		}
+		page_ref_inc(page);
+		kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+		kvm->arch.vsie.page_count++;
+	} else {
+		/* reuse an existing entry that belongs to nobody */
+		while (true) {
+			page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+			if (page_ref_inc_return(page) == 2)
+				break;
+			page_ref_dec(page);
+			kvm->arch.vsie.next++;
+			kvm->arch.vsie.next %= nr_vcpus;
+		}
+		/* only drop the tree entry if this page really owns one */
+		if (page->index != ~0UL)
+			radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+					  page->index >> 9);
+	}
+	/*
+	 * Mark the page as not residing in the tree until the insertion
+	 * succeeded. Otherwise a failed insertion would leave us with the
+	 * address of somebody else's entry, which a later reuse of this
+	 * page would then delete from the tree.
+	 */
+	page->index = ~0UL;
+	/* double use of the same address (or the insertion itself failed) */
+	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+		page_ref_dec(page);
+		mutex_unlock(&kvm->arch.vsie.mutex);
+		return NULL;
+	}
+	page->index = addr;
+	mutex_unlock(&kvm->arch.vsie.mutex);
+
+	/* the page is (re)used for a new scb: start with a clean state */
+	vsie_page = page_to_virt(page);
+	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+	release_gmap_shadow(vsie_page);
+	vsie_page->fault_addr = 0;
+	vsie_page->scb_s.ihcpu = 0xffffU;
+	return vsie_page;
+}
+
+/*
+ * Give back a vsie page acquired via get_vsie_page() by dropping the page
+ * reference that was taken there.
+ */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+	unsigned long pfn = __pa(vsie_page) >> PAGE_SHIFT;
+
+	page_ref_dec(pfn_to_page(pfn));
+}
+
+/*
+ * Handle the SIE instruction for a VCPU: pin and shadow the scb given by
+ * the instruction operand, pin the blocks it references, run the vsie and
+ * tear everything down again in reverse order.
+ *
+ * Returns: - 0 if handled; this includes all cases in which a helper
+ *            reported that control has to be given to guest 2 (> 0)
+ *          - -EOPNOTSUPP if KVM_S390_VM_CPU_FEAT_SIEF2 is not available
+ *          - the return code of kvm_s390_inject_program_int() if the
+ *            instruction was issued in problem state or the scb address is
+ *            not aligned to 512 bytes
+ *          - < 0 if an error occurred
+ */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+	struct vsie_page *vsie_page;
+	unsigned long scb_addr;
+	int rc;
+
+	vcpu->stat.instruction_sie++;
+	if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+		return -EOPNOTSUPP;
+	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+	BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+	scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+	/* the scb has to be aligned to 512 bytes */
+	if (unlikely(scb_addr & 0x1ffUL))
+		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+	if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+		return 0;
+
+	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+	if (IS_ERR(vsie_page))
+		return PTR_ERR(vsie_page);
+	/* double use of sie control block - simply do nothing */
+	if (!vsie_page)
+		return 0;
+
+	rc = pin_scb(vcpu, vsie_page, scb_addr);
+	if (rc)
+		goto release_page;
+	rc = shadow_scb(vcpu, vsie_page);
+	if (rc)
+		goto release_scb;
+	rc = pin_blocks(vcpu, vsie_page);
+	if (rc)
+		goto release_shadow;
+	register_shadow_scb(vcpu, vsie_page);
+	rc = vsie_run(vcpu, vsie_page);
+	unregister_shadow_scb(vcpu);
+	unpin_blocks(vcpu, vsie_page);
+release_shadow:
+	unshadow_scb(vcpu, vsie_page);
+release_scb:
+	unpin_scb(vcpu, vsie_page, scb_addr);
+release_page:
+	put_vsie_page(vcpu->kvm, vsie_page);
+
+	/* positive return codes only matter internally */
+	if (rc < 0)
+		return rc;
+	return 0;
+}
+
+/*
+ * Set up the vsie data structures of a vm (radix tree and mutex).
+ * To be called when a vm is initialized.
+ */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+	INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+	mutex_init(&kvm->arch.vsie.mutex);
+}
+
+/*
+ * Tear down the vsie data structures of a vm: every vsie page is removed
+ * from the page list, its gmap shadow is released, its radix tree entry is
+ * deleted and the page is freed. To be called when a vm is destroyed.
+ */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+	struct page *page;
+	int idx;
+
+	mutex_lock(&kvm->arch.vsie.mutex);
+	for (idx = 0; idx < kvm->arch.vsie.page_count; idx++) {
+		page = kvm->arch.vsie.pages[idx];
+		kvm->arch.vsie.pages[idx] = NULL;
+		release_gmap_shadow(page_to_virt(page));
+		/* drop the radix tree entry of this page */
+		radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+				  page->index >> 9);
+		__free_page(page);
+	}
+	kvm->arch.vsie.page_count = 0;
+	mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+/*
+ * Kick a VCPU out of the vsie, if it has a shadow scb registered: further
+ * SIE entries are blocked via PROG_BLOCK_SIE and, if the scb is currently
+ * marked as being in SIE, a stop intercept is requested.
+ */
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+	struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+	if (!scb)
+		return;
+
+	/*
+	 * Even if the VCPU lets go of the shadow sie block reference, it is
+	 * still valid in the cache. So we can safely kick it.
+	 */
+	atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+	if (scb->prog0c & PROG_IN_SIE)
+		atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+}