From 9a99649f2a89fdfc9dde5d5401675561567bf99a Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Fri, 29 Jan 2016 15:13:30 +0100
Subject: s390/pci: remove pdev pointer from arch data

For each PCI function we need to maintain arch specific data in
struct zpci_dev which also contains a pointer to struct pci_dev.

When a function is registered or deregistered (which is triggered by PCI
common code) we need to adjust that pointer which could interfere with
the machine check handler (triggered by FW) using zpci_dev->pdev.

Since multiple instances of the same pdev could exist at a time this can't
be solved with locking.

Fix that by ditching the pdev pointer and use a bus walk to reach
struct pci_dev (only one instance of a pdev can be registered at the bus
at a time).

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pci.h |  3 +--
 arch/s390/pci/pci.c         |  6 +-----
 arch/s390/pci/pci_debug.c   |  5 ++---
 arch/s390/pci/pci_dma.c     | 21 ++++++++++++---------
 arch/s390/pci/pci_event.c   | 13 +++++++++++--
 5 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index c873e682b67f..dc763eae7c49 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -66,7 +66,6 @@ struct s390_domain;
 
 /* Private data per function */
 struct zpci_dev {
-	struct pci_dev	*pdev;
 	struct pci_bus	*bus;
 	struct list_head entry;		/* list of all zpci_devices, needed for hotplug, etc. */
 
@@ -192,7 +191,7 @@ int zpci_fmb_disable_device(struct zpci_dev *);
 /* Debug */
 int zpci_debug_init(void);
 void zpci_debug_exit(void);
-void zpci_debug_init_device(struct zpci_dev *);
+void zpci_debug_init_device(struct zpci_dev *, const char *);
 void zpci_debug_exit_device(struct zpci_dev *);
 void zpci_debug_info(struct zpci_dev *, struct seq_file *);
 
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 8f19c8f9d660..f76d01e69fb8 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -637,11 +637,9 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
 
 int pcibios_add_device(struct pci_dev *pdev)
 {
-	struct zpci_dev *zdev = to_zpci(pdev);
 	struct resource *res;
 	int i;
 
-	zdev->pdev = pdev;
 	pdev->dev.groups = zpci_attr_groups;
 	zpci_map_resources(pdev);
 
@@ -664,8 +662,7 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
 {
 	struct zpci_dev *zdev = to_zpci(pdev);
 
-	zdev->pdev = pdev;
-	zpci_debug_init_device(zdev);
+	zpci_debug_init_device(zdev, dev_name(&pdev->dev));
 	zpci_fmb_enable_device(zdev);
 
 	return pci_enable_resources(pdev, mask);
@@ -677,7 +674,6 @@ void pcibios_disable_device(struct pci_dev *pdev)
 
 	zpci_fmb_disable_device(zdev);
 	zpci_debug_exit_device(zdev);
-	zdev->pdev = NULL;
 }
 
 #ifdef CONFIG_HIBERNATE_CALLBACKS
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 4129b0a5fd78..c555de3d12d6 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -128,10 +128,9 @@ static const struct file_operations debugfs_pci_perf_fops = {
 	.release = single_release,
 };
 
-void zpci_debug_init_device(struct zpci_dev *zdev)
+void zpci_debug_init_device(struct zpci_dev *zdev, const char *name)
 {
-	zdev->debugfs_dev = debugfs_create_dir(dev_name(&zdev->pdev->dev),
-					       debugfs_root);
+	zdev->debugfs_dev = debugfs_create_dir(name, debugfs_root);
 	if (IS_ERR(zdev->debugfs_dev))
 		zdev->debugfs_dev = NULL;
 
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 4638b93c7632..a06ce8037cec 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -217,27 +217,29 @@ void dma_cleanup_tables(unsigned long *table)
 	dma_free_cpu_table(table);
 }
 
-static unsigned long __dma_alloc_iommu(struct zpci_dev *zdev,
+static unsigned long __dma_alloc_iommu(struct device *dev,
 				       unsigned long start, int size)
 {
+	struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
 	unsigned long boundary_size;
 
-	boundary_size = ALIGN(dma_get_seg_boundary(&zdev->pdev->dev) + 1,
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 			      PAGE_SIZE) >> PAGE_SHIFT;
 	return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages,
 				start, size, 0, boundary_size, 0);
 }
 
-static unsigned long dma_alloc_iommu(struct zpci_dev *zdev, int size)
+static unsigned long dma_alloc_iommu(struct device *dev, int size)
 {
+	struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
 	unsigned long offset, flags;
 	int wrap = 0;
 
 	spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
-	offset = __dma_alloc_iommu(zdev, zdev->next_bit, size);
+	offset = __dma_alloc_iommu(dev, zdev->next_bit, size);
 	if (offset == -1) {
 		/* wrap-around */
-		offset = __dma_alloc_iommu(zdev, 0, size);
+		offset = __dma_alloc_iommu(dev, 0, size);
 		wrap = 1;
 	}
 
@@ -251,8 +253,9 @@ static unsigned long dma_alloc_iommu(struct zpci_dev *zdev, int size)
 	return offset;
 }
 
-static void dma_free_iommu(struct zpci_dev *zdev, unsigned long offset, int size)
+static void dma_free_iommu(struct device *dev, unsigned long offset, int size)
 {
+	struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
 	unsigned long flags;
 
 	spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags);
@@ -293,7 +296,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
 
 	/* This rounds up number of pages based on size and offset */
 	nr_pages = iommu_num_pages(pa, size, PAGE_SIZE);
-	iommu_page_index = dma_alloc_iommu(zdev, nr_pages);
+	iommu_page_index = dma_alloc_iommu(dev, nr_pages);
 	if (iommu_page_index == -1) {
 		ret = -ENOSPC;
 		goto out_err;
@@ -319,7 +322,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
 	return dma_addr + (offset & ~PAGE_MASK);
 
 out_free:
-	dma_free_iommu(zdev, iommu_page_index, nr_pages);
+	dma_free_iommu(dev, iommu_page_index, nr_pages);
 out_err:
 	zpci_err("map error:\n");
 	zpci_err_dma(ret, pa);
@@ -346,7 +349,7 @@ static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
 
 	atomic64_add(npages, &zdev->unmapped_pages);
 	iommu_page_index = (dma_addr - zdev->start_dma) >> PAGE_SHIFT;
-	dma_free_iommu(zdev, iommu_page_index, npages);
+	dma_free_iommu(dev, iommu_page_index, npages);
 }
 
 static void *s390_dma_alloc(struct device *dev, size_t size,
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index b0e04751c5d5..fb2a9a560fdc 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -46,11 +46,14 @@ struct zpci_ccdf_avail {
 static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
 {
 	struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
-	struct pci_dev *pdev = zdev ? zdev->pdev : NULL;
+	struct pci_dev *pdev = NULL;
 
 	zpci_err("error CCDF:\n");
 	zpci_err_hex(ccdf, sizeof(*ccdf));
 
+	if (zdev)
+		pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
+
 	pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
 	       pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
 
@@ -58,6 +61,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
 		return;
 
 	pdev->error_state = pci_channel_io_perm_failure;
+	pci_dev_put(pdev);
 }
 
 void zpci_event_error(void *data)
@@ -69,9 +73,12 @@ void zpci_event_error(void *data)
 static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 {
 	struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
-	struct pci_dev *pdev = zdev ? zdev->pdev : NULL;
+	struct pci_dev *pdev = NULL;
 	int ret;
 
+	if (zdev)
+		pdev = pci_get_slot(zdev->bus, ZPCI_DEVFN);
+
 	pr_info("%s: Event 0x%x reconfigured PCI function 0x%x\n",
 		pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
 	zpci_err("avail CCDF:\n");
@@ -138,6 +145,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 	default:
 		break;
 	}
+	if (pdev)
+		pci_dev_put(pdev);
 }
 
 void zpci_event_availability(void *data)
-- 
cgit v1.2.3


From 2cfc5f9ce7f5e17553e84d36ea9563e677e369d1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 2 Feb 2016 14:40:40 +0100
Subject: s390/xor: optimized xor routing using the XC instruction

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/xor.h |  21 ++++++-
 arch/s390/lib/Makefile      |   2 +-
 arch/s390/lib/xor.c         | 134 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 arch/s390/lib/xor.c

(limited to 'arch')

diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h
index c82eb12a5b18..c988df744a70 100644
--- a/arch/s390/include/asm/xor.h
+++ b/arch/s390/include/asm/xor.h
@@ -1 +1,20 @@
-#include <asm-generic/xor.h>
+/*
+ * Optimited xor routines
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+#ifndef _ASM_S390_XOR_H
+#define _ASM_S390_XOR_H
+
+extern struct xor_block_template xor_block_xc;
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES				\
+do {							\
+	xor_speed(&xor_block_xc);			\
+} while (0)
+
+#define XOR_SELECT_TEMPLATE(FASTEST)	(&xor_block_xc)
+
+#endif /* _ASM_S390_XOR_H */
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 0e8fefe5b0ce..1d1af31e8354 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,7 +3,7 @@
 #
 
 lib-y += delay.o string.o uaccess.o find.o
-obj-y += mem.o
+obj-y += mem.o xor.o
 lib-$(CONFIG_SMP) += spinlock.o
 lib-$(CONFIG_KPROBES) += probes.o
 lib-$(CONFIG_UPROBES) += probes.o
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
new file mode 100644
index 000000000000..7d94e3ec34a9
--- /dev/null
+++ b/arch/s390/lib/xor.c
@@ -0,0 +1,134 @@
+/*
+ * Optimized xor_block operation for RAID4/5
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/raid/xor.h>
+
+static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	asm volatile(
+		"	larl	1,2f\n"
+		"	aghi	%0,-1\n"
+		"	jm	3f\n"
+		"	srlg	0,%0,8\n"
+		"	ltgr	0,0\n"
+		"	jz	1f\n"
+		"0:	xc	0(256,%1),0(%2)\n"
+		"	la	%1,256(%1)\n"
+		"	la	%2,256(%2)\n"
+		"	brctg	0,0b\n"
+		"1:	ex	%0,0(1)\n"
+		"	j	3f\n"
+		"2:	xc	0(1,%1),0(%2)\n"
+		"3:\n"
+		: : "d" (bytes), "a" (p1), "a" (p2)
+		: "0", "1", "cc", "memory");
+}
+
+static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		     unsigned long *p3)
+{
+	asm volatile(
+		"	larl	1,2f\n"
+		"	aghi	%0,-1\n"
+		"	jm	3f\n"
+		"	srlg	0,%0,8\n"
+		"	ltgr	0,0\n"
+		"	jz	1f\n"
+		"0:	xc	0(256,%1),0(%2)\n"
+		"	xc	0(256,%1),0(%3)\n"
+		"	la	%1,256(%1)\n"
+		"	la	%2,256(%2)\n"
+		"	la	%3,256(%3)\n"
+		"	brctg	0,0b\n"
+		"1:	ex	%0,0(1)\n"
+		"	ex	%0,6(1)\n"
+		"	j	3f\n"
+		"2:	xc	0(1,%1),0(%2)\n"
+		"	xc	0(1,%1),0(%3)\n"
+		"3:\n"
+		: "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3)
+		: : "0", "1", "cc", "memory");
+}
+
+static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		     unsigned long *p3, unsigned long *p4)
+{
+	asm volatile(
+		"	larl	1,2f\n"
+		"	aghi	%0,-1\n"
+		"	jm	3f\n"
+		"	srlg	0,%0,8\n"
+		"	ltgr	0,0\n"
+		"	jz	1f\n"
+		"0:	xc	0(256,%1),0(%2)\n"
+		"	xc	0(256,%1),0(%3)\n"
+		"	xc	0(256,%1),0(%4)\n"
+		"	la	%1,256(%1)\n"
+		"	la	%2,256(%2)\n"
+		"	la	%3,256(%3)\n"
+		"	la	%4,256(%4)\n"
+		"	brctg	0,0b\n"
+		"1:	ex	%0,0(1)\n"
+		"	ex	%0,6(1)\n"
+		"	ex	%0,12(1)\n"
+		"	j	3f\n"
+		"2:	xc	0(1,%1),0(%2)\n"
+		"	xc	0(1,%1),0(%3)\n"
+		"	xc	0(1,%1),0(%4)\n"
+		"3:\n"
+		: "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4)
+		: : "0", "1", "cc", "memory");
+}
+
+static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+		     unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	/* Get around a gcc oddity */
+	register unsigned long *reg7 asm ("7") = p5;
+
+	asm volatile(
+		"	larl	1,2f\n"
+		"	aghi	%0,-1\n"
+		"	jm	3f\n"
+		"	srlg	0,%0,8\n"
+		"	ltgr	0,0\n"
+		"	jz	1f\n"
+		"0:	xc	0(256,%1),0(%2)\n"
+		"	xc	0(256,%1),0(%3)\n"
+		"	xc	0(256,%1),0(%4)\n"
+		"	xc	0(256,%1),0(%5)\n"
+		"	la	%1,256(%1)\n"
+		"	la	%2,256(%2)\n"
+		"	la	%3,256(%3)\n"
+		"	la	%4,256(%4)\n"
+		"	la	%5,256(%5)\n"
+		"	brctg	0,0b\n"
+		"1:	ex	%0,0(1)\n"
+		"	ex	%0,6(1)\n"
+		"	ex	%0,12(1)\n"
+		"	ex	%0,18(1)\n"
+		"	j	3f\n"
+		"2:	xc	0(1,%1),0(%2)\n"
+		"	xc	0(1,%1),0(%3)\n"
+		"	xc	0(1,%1),0(%4)\n"
+		"	xc	0(1,%1),0(%5)\n"
+		"3:\n"
+		: "+d" (bytes), "+a" (p1), "+a" (p2), "+a" (p3), "+a" (p4),
+		  "+a" (reg7)
+		: : "0", "1", "cc", "memory");
+}
+
+struct xor_block_template xor_block_xc = {
+	.name = "xc",
+	.do_2 = xor_xc_2,
+	.do_3 = xor_xc_3,
+	.do_4 = xor_xc_4,
+	.do_5 = xor_xc_5,
+};
+EXPORT_SYMBOL(xor_block_xc);
-- 
cgit v1.2.3


From 007ccec53da35528bd06fa0063da55b1311054c1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 4 Feb 2016 12:24:46 +0100
Subject: s390/pageattr: do a single TLB flush for change_page_attr

The change of the access rights for an address range in the kernel
address space is currently done with a loop of IPTE + a store of the
modified PTE. Between the IPTE and the store the PTE will be invalid,
this intermediate state can cause problems with concurrent accesses.

Consider a change of a kernel area from read-write to read-only, a
concurrent reader of that area should be fine but with the invalid
PTE it might get an unexpected exception.

Remove the IPTEs for each PTE and do a global flush after all PTEs
have been modified.

Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/pageattr.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 749c98407b41..f2a5c29a97e9 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -65,19 +65,17 @@ static pte_t *walk_page_table(unsigned long addr)
 static void change_page_attr(unsigned long addr, int numpages,
 			     pte_t (*set) (pte_t))
 {
-	pte_t *ptep, pte;
+	pte_t *ptep;
 	int i;
 
 	for (i = 0; i < numpages; i++) {
 		ptep = walk_page_table(addr);
 		if (WARN_ON_ONCE(!ptep))
 			break;
-		pte = *ptep;
-		pte = set(pte);
-		__ptep_ipte(addr, ptep);
-		*ptep = pte;
+		*ptep = set(*ptep);
 		addr += PAGE_SIZE;
 	}
+	__tlb_flush_kernel();
 }
 
 int set_memory_ro(unsigned long addr, int numpages)
-- 
cgit v1.2.3


From 77ec8a545136272ca3056ca3759836df0cfee5e5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 1 Feb 2016 14:12:08 +0100
Subject: s390/stacktrace: use nosched instead of savesched parameter

Use the inversed "nosched" logic like all other architectures.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Tested-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/stacktrace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 8f64ebd63767..3872d14fe1a1 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -14,7 +14,7 @@ static unsigned long save_context_stack(struct stack_trace *trace,
 					unsigned long sp,
 					unsigned long low,
 					unsigned long high,
-					int savesched)
+					int nosched)
 {
 	struct stack_frame *sf;
 	struct pt_regs *regs;
@@ -46,7 +46,7 @@ static unsigned long save_context_stack(struct stack_trace *trace,
 			return sp;
 		regs = (struct pt_regs *)sp;
 		addr = regs->psw.addr;
-		if (savesched || !in_sched_functions(addr)) {
+		if (!nosched || !in_sched_functions(addr)) {
 			if (!trace->skip)
 				trace->entries[trace->nr_entries++] = addr;
 			else
@@ -66,13 +66,13 @@ static void __save_stack_trace(struct stack_trace *trace, unsigned long sp)
 	frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
 	new_sp = save_context_stack(trace, sp,
 			S390_lowcore.panic_stack + frame_size - PAGE_SIZE,
-			S390_lowcore.panic_stack + frame_size, 1);
+			S390_lowcore.panic_stack + frame_size, 0);
 	new_sp = save_context_stack(trace, new_sp,
 			S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
-			S390_lowcore.async_stack + frame_size, 1);
+			S390_lowcore.async_stack + frame_size, 0);
 	save_context_stack(trace, new_sp,
 			   S390_lowcore.thread_info,
-			   S390_lowcore.thread_info + THREAD_SIZE, 1);
+			   S390_lowcore.thread_info + THREAD_SIZE, 0);
 }
 
 void save_stack_trace(struct stack_trace *trace)
@@ -98,7 +98,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 	}
 	low = (unsigned long) task_stack_page(tsk);
 	high = (unsigned long) task_pt_regs(tsk);
-	save_context_stack(trace, sp, low, high, 0);
+	save_context_stack(trace, sp, low, high, 1);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
-- 
cgit v1.2.3


From 76737ce17ab4c88f14e4452076610474108246d7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 31 Jan 2016 17:06:16 +0100
Subject: s390: add current_stack_pointer() helper function

Implement current_stack_pointer() helper function and use it
everywhere, instead of having several different inline assembly
variants.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Tested-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/processor.h |  8 ++++++++
 arch/s390/kernel/dumpstack.c      | 16 ++++++++--------
 arch/s390/kernel/irq.c            |  3 +--
 arch/s390/kernel/stacktrace.c     |  9 +++------
 4 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 1c4fe129486d..073e18bc1676 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -203,6 +203,14 @@ unsigned long get_wchan(struct task_struct *p);
 /* Has task runtime instrumentation enabled ? */
 #define is_ri_task(tsk) (!!(tsk)->thread.ri_cb)
 
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long sp;
+
+	asm volatile("la %0,0(15)" : "=a" (sp));
+	return sp;
+}
+
 static inline unsigned short stap(void)
 {
 	unsigned short cpu_address;
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 02bd02ff648b..feda48f243ef 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -67,12 +67,11 @@ static void show_trace(struct task_struct *task, unsigned long *stack)
 {
 	const unsigned long frame_size =
 		STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
-	register unsigned long __r15 asm ("15");
 	unsigned long sp;
 
 	sp = (unsigned long) stack;
 	if (!sp)
-		sp = task ? task->thread.ksp : __r15;
+		sp = task ? task->thread.ksp : current_stack_pointer();
 	printk("Call Trace:\n");
 #ifdef CONFIG_CHECK_STACK
 	sp = __show_trace(sp,
@@ -95,15 +94,16 @@ static void show_trace(struct task_struct *task, unsigned long *stack)
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-	register unsigned long *__r15 asm ("15");
 	unsigned long *stack;
 	int i;
 
-	if (!sp)
-		stack = task ? (unsigned long *) task->thread.ksp : __r15;
-	else
-		stack = sp;
-
+	stack = sp;
+	if (!stack) {
+		if (!task)
+			stack = (unsigned long *)current_stack_pointer();
+		else
+			stack = (unsigned long *)task->thread.ksp;
+	}
 	for (i = 0; i < 20; i++) {
 		if (((addr_t) stack & (THREAD_SIZE-1)) == 0)
 			break;
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index f41d5208aaf7..c373a1d41d10 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -164,8 +164,7 @@ void do_softirq_own_stack(void)
 {
 	unsigned long old, new;
 
-	/* Get current stack pointer. */
-	asm volatile("la %0,0(15)" : "=a" (old));
+	old = current_stack_pointer();
 	/* Check against async. stack address range. */
 	new = S390_lowcore.async_stack;
 	if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) {
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 3872d14fe1a1..7dd8360b1c39 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -77,10 +77,9 @@ static void __save_stack_trace(struct stack_trace *trace, unsigned long sp)
 
 void save_stack_trace(struct stack_trace *trace)
 {
-	register unsigned long r15 asm ("15");
 	unsigned long sp;
 
-	sp = r15;
+	sp = current_stack_pointer();
 	__save_stack_trace(trace, sp);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
@@ -92,10 +91,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 	unsigned long sp, low, high;
 
 	sp = tsk->thread.ksp;
-	if (tsk == current) {
-		/* Get current stack pointer. */
-		asm volatile("la %0,0(15)" : "=a" (sp));
-	}
+	if (tsk == current)
+		sp = current_stack_pointer();
 	low = (unsigned long) task_stack_page(tsk);
 	high = (unsigned long) task_pt_regs(tsk);
 	save_context_stack(trace, sp, low, high, 1);
-- 
cgit v1.2.3


From ca0fd75a54cd4b1b1957bfa5611644f5b8b8465e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 31 Jan 2016 17:15:56 +0100
Subject: s390/dumpstack: add missing ri bit to show_registers() output

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/dumpstack.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index feda48f243ef..9a4852b7a1da 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -142,7 +142,8 @@ void show_registers(struct pt_regs *regs)
 	       mask_bits(regs, PSW_MASK_MCHECK), mask_bits(regs, PSW_MASK_WAIT),
 	       mask_bits(regs, PSW_MASK_PSTATE), mask_bits(regs, PSW_MASK_ASC),
 	       mask_bits(regs, PSW_MASK_CC), mask_bits(regs, PSW_MASK_PM));
-	printk(" EA:%x", mask_bits(regs, PSW_MASK_EA | PSW_MASK_BA));
+	printk(" RI:%x EA:%x", mask_bits(regs, PSW_MASK_RI),
+	       mask_bits(regs, PSW_MASK_EA | PSW_MASK_BA));
 	printk("\n%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
 	       regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
 	printk("           %016lx %016lx %016lx %016lx\n",
-- 
cgit v1.2.3


From 1f021ea0da3bc499387d87e31aeb1e36d2db052b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 31 Jan 2016 17:36:59 +0100
Subject: s390/dumpstack: use bit fields to decode psw mask in show_registers()

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/dumpstack.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 9a4852b7a1da..e1c786d23989 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -121,13 +121,9 @@ static void show_last_breaking_event(struct pt_regs *regs)
 	printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]);
 }
 
-static inline int mask_bits(struct pt_regs *regs, unsigned long bits)
-{
-	return (regs->psw.mask & bits) / ((~bits + 1) & bits);
-}
-
 void show_registers(struct pt_regs *regs)
 {
+	struct psw_bits *psw = &psw_bits(regs->psw);
 	char *mode;
 
 	mode = user_mode(regs) ? "User" : "Krnl";
@@ -136,14 +132,9 @@ void show_registers(struct pt_regs *regs)
 		printk(" (%pSR)", (void *)regs->psw.addr);
 	printk("\n");
 	printk("           R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x "
-	       "P:%x AS:%x CC:%x PM:%x", mask_bits(regs, PSW_MASK_PER),
-	       mask_bits(regs, PSW_MASK_DAT), mask_bits(regs, PSW_MASK_IO),
-	       mask_bits(regs, PSW_MASK_EXT), mask_bits(regs, PSW_MASK_KEY),
-	       mask_bits(regs, PSW_MASK_MCHECK), mask_bits(regs, PSW_MASK_WAIT),
-	       mask_bits(regs, PSW_MASK_PSTATE), mask_bits(regs, PSW_MASK_ASC),
-	       mask_bits(regs, PSW_MASK_CC), mask_bits(regs, PSW_MASK_PM));
-	printk(" RI:%x EA:%x", mask_bits(regs, PSW_MASK_RI),
-	       mask_bits(regs, PSW_MASK_EA | PSW_MASK_BA));
+	       "P:%x AS:%x CC:%x PM:%x", psw->r, psw->t, psw->i, psw->e,
+	       psw->key, psw->m, psw->w, psw->p, psw->as, psw->cc, psw->pm);
+	printk(" RI:%x EA:%x", psw->ri, psw->eaba);
 	printk("\n%s GPRS: %016lx %016lx %016lx %016lx\n", mode,
 	       regs->gprs[0], regs->gprs[1], regs->gprs[2], regs->gprs[3]);
 	printk("           %016lx %016lx %016lx %016lx\n",
-- 
cgit v1.2.3


From 3c2c126a675bd8a95d5ed186fe9cd788f438bff1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 5 Feb 2016 15:37:22 +0100
Subject: s390/mm: remove unnecessary indirection with pgste_update_all

The first parameter of pgste_update_all is a pointer to a pte.
Simplify the code by passing the pte value.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 64ead8091248..21e95cdcf8f6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -680,15 +680,15 @@ static inline void pgste_set(pte_t *ptep, pgste_t pgste)
 #endif
 }
 
-static inline pgste_t pgste_update_all(pte_t *ptep, pgste_t pgste,
+static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
 				       struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
 	unsigned long address, bits, skey;
 
-	if (!mm_use_skey(mm) || pte_val(*ptep) & _PAGE_INVALID)
+	if (!mm_use_skey(mm) || pte_val(pte) & _PAGE_INVALID)
 		return pgste;
-	address = pte_val(*ptep) & PAGE_MASK;
+	address = pte_val(pte) & PAGE_MASK;
 	skey = (unsigned long) page_get_storage_key(address);
 	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
 	/* Transfer page changed & referenced bit to guest bits in pgste */
@@ -1067,7 +1067,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long addr, pte_t *ptep)
 {
 	pgste_t pgste;
-	pte_t pte, oldpte;
+	pte_t pte;
 	int young;
 
 	if (mm_has_pgste(vma->vm_mm)) {
@@ -1075,17 +1075,16 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 		pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste);
 	}
 
-	oldpte = pte = *ptep;
+	pte = *ptep;
 	ptep_flush_direct(vma->vm_mm, addr, ptep);
 	young = pte_young(pte);
-	pte = pte_mkold(pte);
 
 	if (mm_has_pgste(vma->vm_mm)) {
-		pgste = pgste_update_all(&oldpte, pgste, vma->vm_mm);
-		pgste = pgste_set_pte(ptep, pgste, pte);
+		pgste = pgste_update_all(pte, pgste, vma->vm_mm);
+		pgste = pgste_set_pte(ptep, pgste, pte_mkold(pte));
 		pgste_set_unlock(ptep, pgste);
 	} else
-		*ptep = pte;
+		*ptep = pte_mkold(pte);
 
 	return young;
 }
@@ -1127,7 +1126,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	pte_val(*ptep) = _PAGE_INVALID;
 
 	if (mm_has_pgste(mm)) {
-		pgste = pgste_update_all(&pte, pgste, mm);
+		pgste = pgste_update_all(pte, pgste, mm);
 		pgste_set_unlock(ptep, pgste);
 	}
 	return pte;
@@ -1150,7 +1149,7 @@ static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
 	ptep_flush_lazy(mm, address, ptep);
 
 	if (mm_has_pgste(mm)) {
-		pgste = pgste_update_all(&pte, pgste, mm);
+		pgste = pgste_update_all(pte, pgste, mm);
 		pgste_set(ptep, pgste);
 	}
 	return pte;
@@ -1191,7 +1190,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 		if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
 		    _PGSTE_GPS_USAGE_UNUSED)
 			pte_val(pte) |= _PAGE_UNUSED;
-		pgste = pgste_update_all(&pte, pgste, vma->vm_mm);
+		pgste = pgste_update_all(pte, pgste, vma->vm_mm);
 		pgste_set_unlock(ptep, pgste);
 	}
 	return pte;
@@ -1223,7 +1222,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 	pte_val(*ptep) = _PAGE_INVALID;
 
 	if (!full && mm_has_pgste(mm)) {
-		pgste = pgste_update_all(&pte, pgste, mm);
+		pgste = pgste_update_all(pte, pgste, mm);
 		pgste_set_unlock(ptep, pgste);
 	}
 	return pte;
-- 
cgit v1.2.3


From 758d39ebd3d5666edb3b1c339f7f138c349ff8bf Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 9 Feb 2016 12:58:54 +0100
Subject: s390/dumpstack: merge all four stack tracers

We have four different stack tracers of which three had bugs. So it's
time to merge them to a single stack tracer which allows to specify a
call back function which will be called for each step.

This patch changes behavior a bit:

- the "nosched" and "in_sched_functions" check within
  save_stack_trace_tsk did work only for the last stack frame within a
  context. Now it considers the check for each stack frame like it
  should.

- both the oprofile variant and the perf_events variant did save a
  return address twice if a zero back chain was detected, which
  indicates an interrupt frame. The new dump_trace function will call
  the oprofile and perf_events backends with the psw address that is
  contained within the corresponding pt_regs structure instead.

- the original show_trace and save_context_stack functions did already
  use the psw address of the pt_regs structure if a zero back chain
  was detected. However now we ignore the psw address if it is a user
  space address. After all we trace the kernel stack and not the user
  space stack. This way we also get rid of the garbage user space
  address in case of warnings and / or panic call traces.

So this should make life easier since now there is only one stack
tracer left which we can break.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/processor.h |  4 ++
 arch/s390/kernel/dumpstack.c      | 63 +++++++++++++++++------------
 arch/s390/kernel/perf_event.c     | 56 +++-----------------------
 arch/s390/kernel/stacktrace.c     | 84 ++++++++++-----------------------------
 arch/s390/oprofile/Makefile       |  2 +-
 arch/s390/oprofile/backtrace.c    | 78 ------------------------------------
 arch/s390/oprofile/init.c         | 20 +++++++++-
 7 files changed, 89 insertions(+), 218 deletions(-)
 delete mode 100644 arch/s390/oprofile/backtrace.c

(limited to 'arch')

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 073e18bc1676..d6fd22ea270d 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -184,6 +184,10 @@ struct task_struct;
 struct mm_struct;
 struct seq_file;
 
+typedef int (*dump_trace_func_t)(void *data, unsigned long address);
+void dump_trace(dump_trace_func_t func, void *data,
+		struct task_struct *task, unsigned long sp);
+
 void show_cacheinfo(struct seq_file *m);
 
 /* Free all resources held by a thread. */
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index e1c786d23989..2150b0139a0b 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -19,28 +19,28 @@
 #include <asm/ipl.h>
 
 /*
- * For show_trace we have tree different stack to consider:
+ * For dump_trace we have tree different stack to consider:
  *   - the panic stack which is used if the kernel stack has overflown
  *   - the asynchronous interrupt stack (cpu related)
  *   - the synchronous kernel stack (process related)
- * The stack trace can start at any of the three stack and can potentially
+ * The stack trace can start at any of the three stacks and can potentially
  * touch all of them. The order is: panic stack, async stack, sync stack.
  */
 static unsigned long
-__show_trace(unsigned long sp, unsigned long low, unsigned long high)
+__dump_trace(dump_trace_func_t func, void *data, unsigned long sp,
+	     unsigned long low, unsigned long high)
 {
 	struct stack_frame *sf;
 	struct pt_regs *regs;
-	unsigned long addr;
 
 	while (1) {
 		if (sp < low || sp > high - sizeof(*sf))
 			return sp;
 		sf = (struct stack_frame *) sp;
-		addr = sf->gprs[8];
-		printk("([<%016lx>] %pSR)\n", addr, (void *)addr);
 		/* Follow the backchain. */
 		while (1) {
+			if (func(data, sf->gprs[8]))
+				return sp;
 			low = sp;
 			sp = sf->back_chain;
 			if (!sp)
@@ -48,45 +48,58 @@ __show_trace(unsigned long sp, unsigned long low, unsigned long high)
 			if (sp <= low || sp > high - sizeof(*sf))
 				return sp;
 			sf = (struct stack_frame *) sp;
-			addr = sf->gprs[8];
-			printk(" [<%016lx>] %pSR\n", addr, (void *)addr);
 		}
 		/* Zero backchain detected, check for interrupt frame. */
 		sp = (unsigned long) (sf + 1);
 		if (sp <= low || sp > high - sizeof(*regs))
 			return sp;
 		regs = (struct pt_regs *) sp;
-		addr = regs->psw.addr;
-		printk(" [<%016lx>] %pSR\n", addr, (void *)addr);
+		if (!user_mode(regs)) {
+			if (func(data, regs->psw.addr))
+				return sp;
+		}
 		low = sp;
 		sp = regs->gprs[15];
 	}
 }
 
-static void show_trace(struct task_struct *task, unsigned long *stack)
+void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task,
+		unsigned long sp)
 {
-	const unsigned long frame_size =
-		STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
-	unsigned long sp;
+	unsigned long frame_size;
 
-	sp = (unsigned long) stack;
-	if (!sp)
-		sp = task ? task->thread.ksp : current_stack_pointer();
-	printk("Call Trace:\n");
+	frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
 #ifdef CONFIG_CHECK_STACK
-	sp = __show_trace(sp,
+	sp = __dump_trace(func, data, sp,
 			  S390_lowcore.panic_stack + frame_size - 4096,
 			  S390_lowcore.panic_stack + frame_size);
 #endif
-	sp = __show_trace(sp,
+	sp = __dump_trace(func, data, sp,
 			  S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
 			  S390_lowcore.async_stack + frame_size);
 	if (task)
-		__show_trace(sp, (unsigned long) task_stack_page(task),
-			     (unsigned long) task_stack_page(task) + THREAD_SIZE);
+		__dump_trace(func, data, sp,
+			     (unsigned long)task_stack_page(task),
+			     (unsigned long)task_stack_page(task) + THREAD_SIZE);
 	else
-		__show_trace(sp, S390_lowcore.thread_info,
+		__dump_trace(func, data, sp,
+			     S390_lowcore.thread_info,
 			     S390_lowcore.thread_info + THREAD_SIZE);
+}
+EXPORT_SYMBOL_GPL(dump_trace);
+
+static int show_address(void *data, unsigned long address)
+{
+	printk("([<%016lx>] %pSR)\n", address, (void *)address);
+	return 0;
+}
+
+static void show_trace(struct task_struct *task, unsigned long sp)
+{
+	if (!sp)
+		sp = task ? task->thread.ksp : current_stack_pointer();
+	printk("Call Trace:\n");
+	dump_trace(show_address, NULL, task, sp);
 	if (!task)
 		task = current;
 	debug_show_held_locks(task);
@@ -112,7 +125,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
 		printk("%016lx ", *stack++);
 	}
 	printk("\n");
-	show_trace(task, sp);
+	show_trace(task, (unsigned long)sp);
 }
 
 static void show_last_breaking_event(struct pt_regs *regs)
@@ -152,7 +165,7 @@ void show_regs(struct pt_regs *regs)
 	show_registers(regs);
 	/* Show stack backtrace if pt_regs is from kernel mode */
 	if (!user_mode(regs))
-		show_trace(NULL, (unsigned long *) regs->gprs[15]);
+		show_trace(NULL, regs->gprs[15]);
 	show_last_breaking_event(regs);
 }
 
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 0943b11a2f6e..a1b708643a2c 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -222,64 +222,20 @@ static int __init service_level_perf_register(void)
 }
 arch_initcall(service_level_perf_register);
 
-/* See also arch/s390/kernel/traps.c */
-static unsigned long __store_trace(struct perf_callchain_entry *entry,
-				   unsigned long sp,
-				   unsigned long low, unsigned long high)
+static int __perf_callchain_kernel(void *data, unsigned long address)
 {
-	struct stack_frame *sf;
-	struct pt_regs *regs;
-
-	while (1) {
-		if (sp < low || sp > high - sizeof(*sf))
-			return sp;
-		sf = (struct stack_frame *) sp;
-		perf_callchain_store(entry, sf->gprs[8]);
-		/* Follow the backchain. */
-		while (1) {
-			low = sp;
-			sp = sf->back_chain;
-			if (!sp)
-				break;
-			if (sp <= low || sp > high - sizeof(*sf))
-				return sp;
-			sf = (struct stack_frame *) sp;
-			perf_callchain_store(entry, sf->gprs[8]);
-		}
-		/* Zero backchain detected, check for interrupt frame. */
-		sp = (unsigned long) (sf + 1);
-		if (sp <= low || sp > high - sizeof(*regs))
-			return sp;
-		regs = (struct pt_regs *) sp;
-		perf_callchain_store(entry, sf->gprs[8]);
-		low = sp;
-		sp = regs->gprs[15];
-	}
+	struct perf_callchain_entry *entry = data;
+
+	perf_callchain_store(entry, address);
+	return 0;
 }
 
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	unsigned long head, frame_size;
-	struct stack_frame *head_sf;
-
 	if (user_mode(regs))
 		return;
-
-	frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
-	head = regs->gprs[15];
-	head_sf = (struct stack_frame *) head;
-
-	if (!head_sf || !head_sf->back_chain)
-		return;
-
-	head = head_sf->back_chain;
-	head = __store_trace(entry, head,
-			     S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
-			     S390_lowcore.async_stack + frame_size);
-
-	__store_trace(entry, head, S390_lowcore.thread_info,
-		      S390_lowcore.thread_info + THREAD_SIZE);
+	dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]);
 }
 
 /* Perf defintions for PMU event attributes in sysfs */
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 7dd8360b1c39..44f84b23d4e5 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -10,69 +10,31 @@
 #include <linux/kallsyms.h>
 #include <linux/module.h>
 
-static unsigned long save_context_stack(struct stack_trace *trace,
-					unsigned long sp,
-					unsigned long low,
-					unsigned long high,
-					int nosched)
+static int __save_address(void *data, unsigned long address, int nosched)
 {
-	struct stack_frame *sf;
-	struct pt_regs *regs;
-	unsigned long addr;
+	struct stack_trace *trace = data;
 
-	while(1) {
-		if (sp < low || sp > high)
-			return sp;
-		sf = (struct stack_frame *)sp;
-		while(1) {
-			addr = sf->gprs[8];
-			if (!trace->skip)
-				trace->entries[trace->nr_entries++] = addr;
-			else
-				trace->skip--;
-			if (trace->nr_entries >= trace->max_entries)
-				return sp;
-			low = sp;
-			sp = sf->back_chain;
-			if (!sp)
-				break;
-			if (sp <= low || sp > high - sizeof(*sf))
-				return sp;
-			sf = (struct stack_frame *)sp;
-		}
-		/* Zero backchain detected, check for interrupt frame. */
-		sp = (unsigned long)(sf + 1);
-		if (sp <= low || sp > high - sizeof(*regs))
-			return sp;
-		regs = (struct pt_regs *)sp;
-		addr = regs->psw.addr;
-		if (!nosched || !in_sched_functions(addr)) {
-			if (!trace->skip)
-				trace->entries[trace->nr_entries++] = addr;
-			else
-				trace->skip--;
-		}
-		if (trace->nr_entries >= trace->max_entries)
-			return sp;
-		low = sp;
-		sp = regs->gprs[15];
+	if (nosched && in_sched_functions(address))
+		return 0;
+	if (trace->skip > 0) {
+		trace->skip--;
+		return 0;
 	}
+	if (trace->nr_entries < trace->max_entries) {
+		trace->entries[trace->nr_entries++] = address;
+		return 0;
+	}
+	return 1;
 }
 
-static void __save_stack_trace(struct stack_trace *trace, unsigned long sp)
+static int save_address(void *data, unsigned long address)
 {
-	unsigned long new_sp, frame_size;
+	return __save_address(data, address, 0);
+}
 
-	frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
-	new_sp = save_context_stack(trace, sp,
-			S390_lowcore.panic_stack + frame_size - PAGE_SIZE,
-			S390_lowcore.panic_stack + frame_size, 0);
-	new_sp = save_context_stack(trace, new_sp,
-			S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
-			S390_lowcore.async_stack + frame_size, 0);
-	save_context_stack(trace, new_sp,
-			   S390_lowcore.thread_info,
-			   S390_lowcore.thread_info + THREAD_SIZE, 0);
+static int save_address_nosched(void *data, unsigned long address)
+{
+	return __save_address(data, address, 1);
 }
 
 void save_stack_trace(struct stack_trace *trace)
@@ -80,7 +42,7 @@ void save_stack_trace(struct stack_trace *trace)
 	unsigned long sp;
 
 	sp = current_stack_pointer();
-	__save_stack_trace(trace, sp);
+	dump_trace(save_address, trace, NULL, sp);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
@@ -88,14 +50,12 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-	unsigned long sp, low, high;
+	unsigned long sp;
 
 	sp = tsk->thread.ksp;
 	if (tsk == current)
 		sp = current_stack_pointer();
-	low = (unsigned long) task_stack_page(tsk);
-	high = (unsigned long) task_pt_regs(tsk);
-	save_context_stack(trace, sp, low, high, 1);
+	dump_trace(save_address_nosched, trace, tsk, sp);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
@@ -106,7 +66,7 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 	unsigned long sp;
 
 	sp = kernel_stack_pointer(regs);
-	__save_stack_trace(trace, sp);
+	dump_trace(save_address, trace, NULL, sp);
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
index 1bd23017191e..496e4a7ee00e 100644
--- a/arch/s390/oprofile/Makefile
+++ b/arch/s390/oprofile/Makefile
@@ -6,5 +6,5 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
 		oprofilefs.o oprofile_stats.o  \
 		timer_int.o )
 
-oprofile-y :=	$(DRIVER_OBJS) init.o backtrace.o
+oprofile-y :=	$(DRIVER_OBJS) init.o
 oprofile-y +=	hwsampler.o
diff --git a/arch/s390/oprofile/backtrace.c b/arch/s390/oprofile/backtrace.c
deleted file mode 100644
index 1884e1759529..000000000000
--- a/arch/s390/oprofile/backtrace.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * S390 Version
- *   Copyright IBM Corp. 2005
- *   Author(s): Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
- */
-
-#include <linux/oprofile.h>
-
-#include <asm/processor.h> /* for struct stack_frame */
-
-static unsigned long
-__show_trace(unsigned int *depth, unsigned long sp,
-	     unsigned long low, unsigned long high)
-{
-	struct stack_frame *sf;
-	struct pt_regs *regs;
-
-	while (*depth) {
-		if (sp < low || sp > high - sizeof(*sf))
-			return sp;
-		sf = (struct stack_frame *) sp;
-		(*depth)--;
-		oprofile_add_trace(sf->gprs[8]);
-
-		/* Follow the backchain.  */
-		while (*depth) {
-			low = sp;
-			sp = sf->back_chain;
-			if (!sp)
-				break;
-			if (sp <= low || sp > high - sizeof(*sf))
-				return sp;
-			sf = (struct stack_frame *) sp;
-			(*depth)--;
-			oprofile_add_trace(sf->gprs[8]);
-
-		}
-
-		if (*depth == 0)
-			break;
-
-		/* Zero backchain detected, check for interrupt frame.  */
-		sp = (unsigned long) (sf + 1);
-		if (sp <= low || sp > high - sizeof(*regs))
-			return sp;
-		regs = (struct pt_regs *) sp;
-		(*depth)--;
-		oprofile_add_trace(sf->gprs[8]);
-		low = sp;
-		sp = regs->gprs[15];
-	}
-	return sp;
-}
-
-void s390_backtrace(struct pt_regs * const regs, unsigned int depth)
-{
-	unsigned long head, frame_size;
-	struct stack_frame* head_sf;
-
-	if (user_mode(regs))
-		return;
-
-	frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
-	head = regs->gprs[15];
-	head_sf = (struct stack_frame*)head;
-
-	if (!head_sf->back_chain)
-		return;
-
-	head = head_sf->back_chain;
-
-	head = __show_trace(&depth, head,
-			    S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
-			    S390_lowcore.async_stack + frame_size);
-
-	__show_trace(&depth, head, S390_lowcore.thread_info,
-		     S390_lowcore.thread_info + THREAD_SIZE);
-}
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index 9cfa2ffaa9d6..d85d1e6b1988 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -20,8 +20,6 @@
 
 #include "../../../drivers/oprofile/oprof.h"
 
-extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth);
-
 #include "hwsampler.h"
 #include "op_counter.h"
 
@@ -494,6 +492,24 @@ static void oprofile_hwsampler_exit(void)
 	hwsampler_shutdown();
 }
 
+static int __s390_backtrace(void *data, unsigned long address)
+{
+	unsigned int *depth = data;
+
+	if (*depth == 0)
+		return 1;
+	(*depth)--;
+	oprofile_add_trace(address);
+	return 0;
+}
+
+static void s390_backtrace(struct pt_regs *regs, unsigned int depth)
+{
+	if (user_mode(regs))
+		return;
+	dump_trace(__s390_backtrace, &depth, NULL, regs->gprs[15]);
+}
+
 int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
 	ops->backtrace = s390_backtrace;
-- 
cgit v1.2.3


From 13c6a790d77c3fe81e1a25524d1f9115b0da9404 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 10 Feb 2016 16:47:14 +0100
Subject: s390/mm: correct comment about segment table entries

The comment describing the bit encoding for segment table entries
is incorrect in regard to the read and write bits. The segment
read bit is 0x0002 and write is 0x0001, not the other way around.

Reported-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 21e95cdcf8f6..7be9ae808a8b 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -298,15 +298,15 @@ static inline int is_module_addr(void *addr)
 
 /*
  * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
- *				dy..R...I...wr
+ *				dy..R...I...rw
  * prot-none, clean, old	00..1...1...00
  * prot-none, clean, young	01..1...1...00
  * prot-none, dirty, old	10..1...1...00
  * prot-none, dirty, young	11..1...1...00
- * read-only, clean, old	00..1...1...01
- * read-only, clean, young	01..1...0...01
- * read-only, dirty, old	10..1...1...01
- * read-only, dirty, young	11..1...0...01
+ * read-only, clean, old	00..1...1...10
+ * read-only, clean, young	01..1...0...10
+ * read-only, dirty, old	10..1...1...10
+ * read-only, dirty, young	11..1...0...10
  * read-write, clean, old	00..1...1...11
  * read-write, clean, young	01..1...0...11
  * read-write, dirty, old	10..0...1...11
-- 
cgit v1.2.3


From bb3aa614014e11aea2c1aab528752e12b54df62d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 19 Feb 2016 10:46:08 +0100
Subject: s390: add z13s model number to z13 elf platform

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig        | 8 ++++----
 arch/s390/kernel/setup.c | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 3be9c832dec1..683af90efe86 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -254,12 +254,12 @@ config MARCH_ZEC12
 	  older machines.
 
 config MARCH_Z13
-	bool "IBM z13"
+	bool "IBM z13s and z13"
 	select HAVE_MARCH_Z13_FEATURES
 	help
-	  Select this to enable optimizations for IBM z13 (2964 series).
-	  The kernel will be slightly faster but will not work on older
-	  machines.
+	  Select this to enable optimizations for IBM z13s and z13 (2965 and
+	  2964 series). The kernel will be slightly faster but will not work on
+	  older machines.
 
 endchoice
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 9220db5c996a..b1fbcc07c871 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -779,6 +779,7 @@ static int __init setup_hwcaps(void)
 		strcpy(elf_platform, "zEC12");
 		break;
 	case 0x2964:
+	case 0x2965:
 		strcpy(elf_platform, "z13");
 		break;
 	}
-- 
cgit v1.2.3


From 993e0681084c8e84dd870bffedec9410778dfa87 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 19 Feb 2016 10:47:48 +0100
Subject: s390/oprofile: add z13/z13s model numbers

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/oprofile/init.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index d85d1e6b1988..791935a65800 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -454,6 +454,7 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops)
 		case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break;
 		case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break;
 		case 0x2827: case 0x2828: ops->cpu_type = "s390/zEC12"; break;
+		case 0x2964: case 0x2965: ops->cpu_type = "s390/z13"; break;
 		default: return -ENODEV;
 		}
 	}
-- 
cgit v1.2.3


From 443a813304ec36d4e81264b6a452a412a6b3ad9b Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 24 Feb 2016 10:18:50 +0100
Subject: s390/kvm: simplify set_guest_storage_key

Git commit ab3f285f227fec62868037e9b1b1fd18294a83b8
"KVM: s390/mm: try a cow on read only pages for key ops"
added a fixup_user_fault to set_guest_storage_key force a copy on
write if the page is mapped read-only. This is supposed to fix the
problem of differing storage keys for shared mappings, e.g. the
empty_zero_page.
But if the storage key is set before the pte is mapped the storage
key update is done on the pgste. A later fault will happily map the
shared page with the key from the pgste.

Eventually git commit 2faee8ff9dc6f4bfe46f6d2d110add858140fb20
"s390/mm: prevent and break zero page mappings in case of storage keys"
fixed this problem for the empty_zero_page. The commit makes sure that
guests enabled for storage keys will not use the empty_zero_page at all.

As the call to fixup_user_fault in set_guest_storage_key depends on the
order of the storage key operation vs. the fault that maps the pte
it does not really fix anything. Just remove it.

Reviewed-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/mm/pgtable.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5109827883ac..6acd7174fe75 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -809,30 +809,13 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 	spinlock_t *ptl;
 	pgste_t old, new;
 	pte_t *ptep;
-	bool unlocked;
 
 	down_read(&mm->mmap_sem);
-retry:
-	unlocked = false;
 	ptep = get_locked_pte(mm, addr, &ptl);
 	if (unlikely(!ptep)) {
 		up_read(&mm->mmap_sem);
 		return -EFAULT;
 	}
-	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
-	     (pte_val(*ptep) & _PAGE_PROTECT)) {
-		pte_unmap_unlock(ptep, ptl);
-		/*
-		 * We do not really care about unlocked. We will retry either
-		 * way. But this allows fixup_user_fault to enable userfaultfd.
-		 */
-		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
-				     &unlocked)) {
-			up_read(&mm->mmap_sem);
-			return -EFAULT;
-		}
-		goto retry;
-	}
 
 	new = old = pgste_get_lock(ptep);
 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
-- 
cgit v1.2.3


From ee8479bb97dc790904e9afad503ae833e28ed8d3 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 25 Feb 2016 10:28:49 +0100
Subject: s390/dis: use correct escape sequence for '%' character

The double escape character sequence introduced with commit
272fa59ccb4f ("s390/dis: Fix handling of format specifiers") is not
necessary anymore since commit 561e10300269 ("s390/dis: Fix printing
of the register numbers").

Instead this now generates an extra '%' character:

lg      %%r1,160(%%r11)

So fix this and basically revert 272fa59ccb4f.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/dis.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 62973efd214a..8cb9bfdd3ea8 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -1920,23 +1920,16 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
 			}
 			if (separator)
 				ptr += sprintf(ptr, "%c", separator);
-			/*
-			 * Use four '%' characters below because of the
-			 * following two conversions:
-			 *
-			 *  1) sprintf: %%%%r -> %%r
-			 *  2) printk : %%r   -> %r
-			 */
 			if (operand->flags & OPERAND_GPR)
-				ptr += sprintf(ptr, "%%%%r%i", value);
+				ptr += sprintf(ptr, "%%r%i", value);
 			else if (operand->flags & OPERAND_FPR)
-				ptr += sprintf(ptr, "%%%%f%i", value);
+				ptr += sprintf(ptr, "%%f%i", value);
 			else if (operand->flags & OPERAND_AR)
-				ptr += sprintf(ptr, "%%%%a%i", value);
+				ptr += sprintf(ptr, "%%a%i", value);
 			else if (operand->flags & OPERAND_CR)
-				ptr += sprintf(ptr, "%%%%c%i", value);
+				ptr += sprintf(ptr, "%%c%i", value);
 			else if (operand->flags & OPERAND_VR)
-				ptr += sprintf(ptr, "%%%%v%i", value);
+				ptr += sprintf(ptr, "%%v%i", value);
 			else if (operand->flags & OPERAND_PCREL)
 				ptr += sprintf(ptr, "%lx", (signed int) value
 								      + addr);
-- 
cgit v1.2.3


From 5d7eccecf8621e3cf5adcec9cf80aa444b4610d4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 24 Feb 2016 14:27:46 +0100
Subject: s390/fault: merge report_user_fault implementations

We have two close to identical report_user_fault functions.
Add a parameter to one and get rid of the other one in order
to reduce code duplication.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/setup.h |  2 ++
 arch/s390/kernel/traps.c      | 21 ++-------------------
 arch/s390/mm/fault.c          | 15 ++++++++-------
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 69837225119e..c0f0efbb6ab5 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -101,6 +101,8 @@ extern void pfault_fini(void);
 #define pfault_fini()		do { } while (0)
 #endif /* CONFIG_PFAULT */
 
+void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault);
+
 extern void cmma_init(void);
 
 extern void (*_machine_restart)(char *command);
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 017eb03daee2..dd97a3e8a34a 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -22,8 +22,6 @@
 #include <asm/fpu/api.h>
 #include "entry.h"
 
-int show_unhandled_signals = 1;
-
 static inline void __user *get_trap_ip(struct pt_regs *regs)
 {
 	unsigned long address;
@@ -35,21 +33,6 @@ static inline void __user *get_trap_ip(struct pt_regs *regs)
 	return (void __user *) (address - (regs->int_code >> 16));
 }
 
-static inline void report_user_fault(struct pt_regs *regs, int signr)
-{
-	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
-		return;
-	if (!unhandled_signal(current, signr))
-		return;
-	if (!printk_ratelimit())
-		return;
-	printk("User process fault: interruption code %04x ilc:%d ",
-	       regs->int_code & 0xffff, regs->int_code >> 17);
-	print_vma_addr("in ", regs->psw.addr);
-	printk("\n");
-	show_regs(regs);
-}
-
 int is_valid_bugaddr(unsigned long addr)
 {
 	return 1;
@@ -65,7 +48,7 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
 		info.si_code = si_code;
 		info.si_addr = get_trap_ip(regs);
 		force_sig_info(si_signo, &info, current);
-		report_user_fault(regs, si_signo);
+		report_user_fault(regs, si_signo, 0);
         } else {
                 const struct exception_table_entry *fixup;
 		fixup = search_exception_tables(regs->psw.addr);
@@ -111,7 +94,7 @@ NOKPROBE_SYMBOL(do_per_trap);
 void default_trap_handler(struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
-		report_user_fault(regs, SIGSEGV);
+		report_user_fault(regs, SIGSEGV, 0);
 		do_exit(SIGSEGV);
 	} else
 		die(regs, "Unknown program exception");
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 791a4146052c..64b3ad1d6575 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -183,6 +183,8 @@ static void dump_fault_info(struct pt_regs *regs)
 {
 	unsigned long asce;
 
+	pr_alert("Failing address: %016lx TEID: %016lx\n",
+		 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
 	pr_alert("Fault in ");
 	switch (regs->int_parm_long & 3) {
 	case 3:
@@ -218,7 +220,9 @@ static void dump_fault_info(struct pt_regs *regs)
 	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
 }
 
-static inline void report_user_fault(struct pt_regs *regs, long signr)
+int show_unhandled_signals = 1;
+
+void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
 {
 	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
 		return;
@@ -230,9 +234,8 @@ static inline void report_user_fault(struct pt_regs *regs, long signr)
 	       regs->int_code & 0xffff, regs->int_code >> 17);
 	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
 	printk(KERN_CONT "\n");
-	printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n",
-	       regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
-	dump_fault_info(regs);
+	if (is_mm_fault)
+		dump_fault_info(regs);
 	show_regs(regs);
 }
 
@@ -244,7 +247,7 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 {
 	struct siginfo si;
 
-	report_user_fault(regs, SIGSEGV);
+	report_user_fault(regs, SIGSEGV, 1);
 	si.si_signo = SIGSEGV;
 	si.si_code = si_code;
 	si.si_addr = (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK);
@@ -272,8 +275,6 @@ static noinline void do_no_context(struct pt_regs *regs)
 	else
 		printk(KERN_ALERT "Unable to handle kernel paging request"
 		       " in virtual user address space\n");
-	printk(KERN_ALERT "failing address: %016lx TEID: %016lx\n",
-	       regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
 	dump_fault_info(regs);
 	die(regs, "Oops");
 	do_exit(SIGKILL);
-- 
cgit v1.2.3


From b1685ab9bd3ae14830acac8ffdc7aafc0fb416e3 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Mon, 29 Feb 2016 16:05:35 +0100
Subject: s390/cpumf: Improve guest detection heuristics

commit e22cf8ca6f75 ("s390/cpumf: rework program parameter setting
to detect guest samples") requires guest changes to get proper
guest/host. We can do better: We can use the primary asn value,
which is set on all Linux variants to compare this with the host
pp value.
We now have the following cases:
1. Guest using PP
host sample:  gpp == 0, asn == hpp --> host
guest sample: gpp != 0 --> guest
2. Guest not using PP
host sample:  gpp == 0, asn == hpp --> host
guest sample: gpp == 0, asn != hpp --> guest

As soon as the host no longer sets CR4, we must back out
this heuristics - let's add a comment in switch_to.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S        | 1 +
 arch/s390/kernel/perf_cpum_sf.c | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index cd5a191381b9..4ba688c9d5a6 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -186,6 +186,7 @@ ENTRY(__switch_to)
 	stg	%r5,__LC_THREAD_INFO		# store thread info of next
 	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next
+	/* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */
 	lctl	%c4,%c4,__TASK_pid(%r3)		# load pid to control reg. 4
 	mvc	__LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 3d8da1e742c2..1a43474df541 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1022,10 +1022,13 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 	/*
 	 * A non-zero guest program parameter indicates a guest
 	 * sample.
-	 * Note that some early samples might be misaccounted to
-	 * the host.
+	 * Note that some early samples or samples from guests without
+	 * lpp usage would be misaccounted to the host. We use the asn
+	 * value as a heuristic to detect most of these guest samples.
+	 * If the value differs from the host hpp value, we assume
+	 * it to be a KVM guest.
 	 */
-	if (sfr->basic.gpp)
+	if (sfr->basic.gpp || sfr->basic.prim_asn != (u16) sfr->basic.hpp)
 		sde_regs->in_guest = 1;
 
 	overflow = 0;
-- 
cgit v1.2.3


From f369b98ee30df4b4eaf57e2aab081529c2fad6d9 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 2 Mar 2016 10:14:08 +0100
Subject: s390/percpu: remove this_cpu_cmpxchg_double_4

git commit 26f15caaf993 ("s390/cmpxchg: simplify cmpxchg_double")
removed support for cmpxchg_double for two consecutive four byte
values, for which it would generate a cds instruction.

However I forgot to remove the corresponding define in our percpu
header file, which means that this_cpu_cmpxchg_double would now
incorrectly generate a cdsg instruction if being used on a double four
byte location. Therefore remove the percpu define as well.

There is currently no user and therefore no bug fixed with
this. Obviously any such user could and should simply use cmpxchg.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/percpu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 6d6556ca24aa..90240dfef76a 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -178,7 +178,6 @@
 	ret__;								\
 })
 
-#define this_cpu_cmpxchg_double_4 arch_this_cpu_cmpxchg_double
 #define this_cpu_cmpxchg_double_8 arch_this_cpu_cmpxchg_double
 
 #include <asm-generic/percpu.h>
-- 
cgit v1.2.3


From baebc70a4db86515d55ff1f226088a8e7f5821a0 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 3 Mar 2016 20:49:57 -0800
Subject: s390: Use pr_warn instead of pr_warning

Convert the uses of pr_warning to pr_warn so there are fewer
uses of the old pr_warning.

Miscellanea:

o Align arguments
o Coalesce formats

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/cpcmd.c |  3 +--
 arch/s390/kernel/debug.c |  6 ++----
 arch/s390/kernel/time.c  |  6 ++----
 arch/s390/mm/extmem.c    | 16 +++++++---------
 4 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index 7f768914fb4f..7f48e568ac64 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -96,8 +96,7 @@ int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 			(((unsigned long)response + rlen) >> 31)) {
 		lowbuf = kmalloc(rlen, GFP_KERNEL | GFP_DMA);
 		if (!lowbuf) {
-			pr_warning("The cpcmd kernel function failed to "
-				   "allocate a response buffer\n");
+			pr_warn("The cpcmd kernel function failed to allocate a response buffer\n");
 			return -ENOMEM;
 		}
 		spin_lock_irqsave(&cpcmd_lock, flags);
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index c890a5589e59..aa12de72fd47 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -699,8 +699,7 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area,
 	/* Since debugfs currently does not support uid/gid other than root, */
 	/* we do not allow gid/uid != 0 until we get support for that. */
 	if ((uid != 0) || (gid != 0))
-		pr_warning("Root becomes the owner of all s390dbf files "
-			   "in sysfs\n");
+		pr_warn("Root becomes the owner of all s390dbf files in sysfs\n");
 	BUG_ON(!initialized);
 	mutex_lock(&debug_mutex);
 
@@ -1307,8 +1306,7 @@ debug_input_level_fn(debug_info_t * id, struct debug_view *view,
 		new_level = debug_get_uint(str);
 	}
 	if(new_level < 0) {
-		pr_warning("%s is not a valid level for a debug "
-			   "feature\n", str);
+		pr_warn("%s is not a valid level for a debug feature\n", str);
 		rc = -EINVAL;
 	} else {
 		debug_set_level(id, new_level);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 99f84ac31307..c4e5f183f225 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -499,8 +499,7 @@ static void etr_reset(void)
 		if (etr_port0_online && etr_port1_online)
 			set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
 	} else if (etr_port0_online || etr_port1_online) {
-		pr_warning("The real or virtual hardware system does "
-			   "not provide an ETR interface\n");
+		pr_warn("The real or virtual hardware system does not provide an ETR interface\n");
 		etr_port0_online = etr_port1_online = 0;
 	}
 }
@@ -1464,8 +1463,7 @@ static void __init stp_reset(void)
 	if (rc == 0)
 		set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
 	else if (stp_online) {
-		pr_warning("The real or virtual hardware system does "
-			   "not provide an STP interface\n");
+		pr_warn("The real or virtual hardware system does not provide an STP interface\n");
 		free_page((unsigned long) stp_page);
 		stp_page = NULL;
 		stp_online = 0;
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index a1bf4ad8925d..02042b6b66bf 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -265,7 +265,7 @@ query_segment_type (struct dcss_segment *seg)
 		goto out_free;
 	}
 	if (diag_cc > 1) {
-		pr_warning("Querying a DCSS type failed with rc=%ld\n", vmrc);
+		pr_warn("Querying a DCSS type failed with rc=%ld\n", vmrc);
 		rc = dcss_diag_translate_rc (vmrc);
 		goto out_free;
 	}
@@ -457,8 +457,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
 		goto out_resource;
 	}
 	if (diag_cc > 1) {
-		pr_warning("Loading DCSS %s failed with rc=%ld\n", name,
-			   end_addr);
+		pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
 		rc = dcss_diag_translate_rc(end_addr);
 		dcss_diag(&purgeseg_scode, seg->dcss_name,
 				&dummy, &dummy);
@@ -574,8 +573,7 @@ segment_modify_shared (char *name, int do_nonshared)
 		goto out_unlock;
 	}
 	if (atomic_read (&seg->ref_count) != 1) {
-		pr_warning("DCSS %s is in use and cannot be reloaded\n",
-			   name);
+		pr_warn("DCSS %s is in use and cannot be reloaded\n", name);
 		rc = -EAGAIN;
 		goto out_unlock;
 	}
@@ -588,8 +586,8 @@ segment_modify_shared (char *name, int do_nonshared)
 			seg->res->flags |= IORESOURCE_READONLY;
 
 	if (request_resource(&iomem_resource, seg->res)) {
-		pr_warning("DCSS %s overlaps with used memory resources "
-			   "and cannot be reloaded\n", name);
+		pr_warn("DCSS %s overlaps with used memory resources and cannot be reloaded\n",
+			name);
 		rc = -EBUSY;
 		kfree(seg->res);
 		goto out_del_mem;
@@ -607,8 +605,8 @@ segment_modify_shared (char *name, int do_nonshared)
 		goto out_del_res;
 	}
 	if (diag_cc > 1) {
-		pr_warning("Reloading DCSS %s failed with rc=%ld\n", name,
-			   end_addr);
+		pr_warn("Reloading DCSS %s failed with rc=%ld\n",
+			name, end_addr);
 		rc = dcss_diag_translate_rc(end_addr);
 		goto out_del_res;
 	}
-- 
cgit v1.2.3


From 988b86e69ded17f0f1209fd3ef1c4c7f1567dcc1 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 13 Jan 2016 12:54:28 +0100
Subject: s390/pci: add ioctl interface for CLP

Provide a user space interface to issue call logical-processor instructions.
Only selected CLP commands are allowed, enough to get the full overview of
the installed PCI functions.

Reviewed-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/clp.h      |  27 +++++
 arch/s390/include/asm/pci_clp.h  |  30 +----
 arch/s390/include/uapi/asm/clp.h |  28 +++++
 arch/s390/pci/pci_clp.c          | 247 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 293 insertions(+), 39 deletions(-)
 create mode 100644 arch/s390/include/uapi/asm/clp.h

(limited to 'arch')

diff --git a/arch/s390/include/asm/clp.h b/arch/s390/include/asm/clp.h
index a0e71a501f7c..5687d62fb0cb 100644
--- a/arch/s390/include/asm/clp.h
+++ b/arch/s390/include/asm/clp.h
@@ -4,14 +4,23 @@
 /* CLP common request & response block size */
 #define CLP_BLK_SIZE			PAGE_SIZE
 
+#define CLP_LPS_BASE	0
+#define CLP_LPS_PCI	2
+
 struct clp_req_hdr {
 	u16 len;
 	u16 cmd;
+	u32 fmt		: 4;
+	u32 reserved1	: 28;
+	u64 reserved2;
 } __packed;
 
 struct clp_rsp_hdr {
 	u16 len;
 	u16 rsp;
+	u32 fmt		: 4;
+	u32 reserved1	: 28;
+	u64 reserved2;
 } __packed;
 
 /* CLP Response Codes */
@@ -25,4 +34,22 @@ struct clp_rsp_hdr {
 #define CLP_RC_NODATA			0x0080	/* No data available */
 #define CLP_RC_FC_UNKNOWN		0x0100	/* Function code not recognized */
 
+/* Store logical-processor characteristics request */
+struct clp_req_slpc {
+	struct clp_req_hdr hdr;
+} __packed;
+
+struct clp_rsp_slpc {
+	struct clp_rsp_hdr hdr;
+	u32 reserved2[4];
+	u32 lpif[8];
+	u32 reserved3[8];
+	u32 lpic[8];
+} __packed;
+
+struct clp_req_rsp_slpc {
+	struct clp_req_slpc request;
+	struct clp_rsp_slpc response;
+} __packed;
+
 #endif
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index dd78f92f1cce..e75c64cbcf08 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -49,9 +49,6 @@ struct clp_fh_list_entry {
 /* List PCI functions request */
 struct clp_req_list_pci {
 	struct clp_req_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u64 resume_token;
 	u64 reserved2;
 } __packed;
@@ -59,9 +56,6 @@ struct clp_req_list_pci {
 /* List PCI functions response */
 struct clp_rsp_list_pci {
 	struct clp_rsp_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u64 resume_token;
 	u32 reserved2;
 	u16 max_fn;
@@ -73,9 +67,6 @@ struct clp_rsp_list_pci {
 /* Query PCI function request */
 struct clp_req_query_pci {
 	struct clp_req_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u32 fh;				/* function handle */
 	u32 reserved2;
 	u64 reserved3;
@@ -84,9 +75,6 @@ struct clp_req_query_pci {
 /* Query PCI function response */
 struct clp_rsp_query_pci {
 	struct clp_rsp_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64			: 64;
 	u16 vfn;			/* virtual fn number */
 	u16			:  7;
 	u16 util_str_avail	:  1;	/* utility string available? */
@@ -108,21 +96,15 @@ struct clp_rsp_query_pci {
 /* Query PCI function group request */
 struct clp_req_query_pci_grp {
 	struct clp_req_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
-	u32			: 24;
+	u32 reserved2		: 24;
 	u32 pfgid		:  8;	/* function group id */
-	u32 reserved2;
-	u64 reserved3;
+	u32 reserved3;
+	u64 reserved4;
 } __packed;
 
 /* Query PCI function group response */
 struct clp_rsp_query_pci_grp {
 	struct clp_rsp_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u16			:  4;
 	u16 noi			: 12;	/* number of interrupts */
 	u8 version;
@@ -141,9 +123,6 @@ struct clp_rsp_query_pci_grp {
 /* Set PCI function request */
 struct clp_req_set_pci {
 	struct clp_req_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u32 fh;				/* function handle */
 	u16 reserved2;
 	u8 oc;				/* operation controls */
@@ -154,9 +133,6 @@ struct clp_req_set_pci {
 /* Set PCI function response */
 struct clp_rsp_set_pci {
 	struct clp_rsp_hdr hdr;
-	u32 fmt			:  4;	/* cmd request block format */
-	u32			: 28;
-	u64 reserved1;
 	u32 fh;				/* function handle */
 	u32 reserved3;
 	u64 reserved4;
diff --git a/arch/s390/include/uapi/asm/clp.h b/arch/s390/include/uapi/asm/clp.h
new file mode 100644
index 000000000000..ab72d9d24373
--- /dev/null
+++ b/arch/s390/include/uapi/asm/clp.h
@@ -0,0 +1,28 @@
+/*
+ * ioctl interface for /dev/clp
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#ifndef _ASM_CLP_H
+#define _ASM_CLP_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+struct clp_req {
+	unsigned int c : 1;
+	unsigned int r : 1;
+	unsigned int lps : 6;
+	unsigned int cmd : 8;
+	unsigned int : 16;
+	unsigned int reserved;
+	__u64 data_p;
+};
+
+#define CLP_IOCTL_MAGIC 'c'
+
+#define CLP_SYNC _IOWR(CLP_IOCTL_MAGIC, 0xC1, struct clp_req)
+
+#endif
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index d6e411ed8b1f..21591ddb4c1f 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -8,13 +8,19 @@
 #define KMSG_COMPONENT "zpci"
 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
 
+#include <linux/compat.h>
 #include <linux/kernel.h>
+#include <linux/miscdevice.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
+#include <linux/uaccess.h>
 #include <asm/pci_debug.h>
 #include <asm/pci_clp.h>
+#include <asm/compat.h>
+#include <asm/clp.h>
+#include <uapi/asm/clp.h>
 
 static inline void zpci_err_clp(unsigned int rsp, int rc)
 {
@@ -27,21 +33,43 @@ static inline void zpci_err_clp(unsigned int rsp, int rc)
 }
 
 /*
- * Call Logical Processor
- * Retry logic is handled by the caller.
+ * Call Logical Processor with c=1, lps=0 and command 1
+ * to get the bit mask of installed logical processors
  */
-static inline u8 clp_instr(void *data)
+static inline int clp_get_ilp(unsigned long *ilp)
+{
+	unsigned long mask;
+	int cc = 3;
+
+	asm volatile (
+		"	.insn	rrf,0xb9a00000,%[mask],%[cmd],8,0\n"
+		"0:	ipm	%[cc]\n"
+		"	srl	%[cc],28\n"
+		"1:\n"
+		EX_TABLE(0b, 1b)
+		: [cc] "+d" (cc), [mask] "=d" (mask) : [cmd] "a" (1)
+		: "cc");
+	*ilp = mask;
+	return cc;
+}
+
+/*
+ * Call Logical Processor with c=0, the give constant lps and an lpcb request.
+ */
+static inline int clp_req(void *data, unsigned int lps)
 {
 	struct { u8 _[CLP_BLK_SIZE]; } *req = data;
 	u64 ignored;
-	u8 cc;
+	int cc = 3;
 
 	asm volatile (
-		"	.insn	rrf,0xb9a00000,%[ign],%[req],0x0,0x2\n"
-		"	ipm	%[cc]\n"
+		"	.insn	rrf,0xb9a00000,%[ign],%[req],0,%[lps]\n"
+		"0:	ipm	%[cc]\n"
 		"	srl	%[cc],28\n"
-		: [cc] "=d" (cc), [ign] "=d" (ignored), "+m" (*req)
-		: [req] "a" (req)
+		"1:\n"
+		EX_TABLE(0b, 1b)
+		: [cc] "+d" (cc), [ign] "=d" (ignored), "+m" (*req)
+		: [req] "a" (req), [lps] "i" (lps)
 		: "cc");
 	return cc;
 }
@@ -90,7 +118,7 @@ static int clp_query_pci_fngrp(struct zpci_dev *zdev, u8 pfgid)
 	rrb->response.hdr.len = sizeof(rrb->response);
 	rrb->request.pfgid = pfgid;
 
-	rc = clp_instr(rrb);
+	rc = clp_req(rrb, CLP_LPS_PCI);
 	if (!rc && rrb->response.hdr.rsp == CLP_RC_OK)
 		clp_store_query_pci_fngrp(zdev, &rrb->response);
 	else {
@@ -143,7 +171,7 @@ static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
 	rrb->response.hdr.len = sizeof(rrb->response);
 	rrb->request.fh = fh;
 
-	rc = clp_instr(rrb);
+	rc = clp_req(rrb, CLP_LPS_PCI);
 	if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
 		rc = clp_store_query_pci_fn(zdev, &rrb->response);
 		if (rc)
@@ -214,7 +242,7 @@ static int clp_set_pci_fn(u32 *fh, u8 nr_dma_as, u8 command)
 		rrb->request.oc = command;
 		rrb->request.ndas = nr_dma_as;
 
-		rc = clp_instr(rrb);
+		rc = clp_req(rrb, CLP_LPS_PCI);
 		if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
 			retries--;
 			if (retries < 0)
@@ -280,7 +308,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb,
 		rrb->request.resume_token = resume_token;
 
 		/* Get PCI function handle list */
-		rc = clp_instr(rrb);
+		rc = clp_req(rrb, CLP_LPS_PCI);
 		if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
 			zpci_err("List PCI FN:\n");
 			zpci_err_clp(rrb->response.hdr.rsp, rc);
@@ -391,3 +419,198 @@ int clp_rescan_pci_devices_simple(void)
 	clp_free_block(rrb);
 	return rc;
 }
+
+static int clp_base_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb)
+{
+	unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
+
+	if (lpcb->request.hdr.len != sizeof(lpcb->request) ||
+	    lpcb->response.hdr.len > limit)
+		return -EINVAL;
+	return clp_req(lpcb, CLP_LPS_BASE) ? -EOPNOTSUPP : 0;
+}
+
+static int clp_base_command(struct clp_req *req, struct clp_req_hdr *lpcb)
+{
+	switch (lpcb->cmd) {
+	case 0x0001: /* store logical-processor characteristics */
+		return clp_base_slpc(req, (void *) lpcb);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int clp_pci_slpc(struct clp_req *req, struct clp_req_rsp_slpc *lpcb)
+{
+	unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
+
+	if (lpcb->request.hdr.len != sizeof(lpcb->request) ||
+	    lpcb->response.hdr.len > limit)
+		return -EINVAL;
+	return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0;
+}
+
+static int clp_pci_list(struct clp_req *req, struct clp_req_rsp_list_pci *lpcb)
+{
+	unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
+
+	if (lpcb->request.hdr.len != sizeof(lpcb->request) ||
+	    lpcb->response.hdr.len > limit)
+		return -EINVAL;
+	if (lpcb->request.reserved2 != 0)
+		return -EINVAL;
+	return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0;
+}
+
+static int clp_pci_query(struct clp_req *req,
+			 struct clp_req_rsp_query_pci *lpcb)
+{
+	unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
+
+	if (lpcb->request.hdr.len != sizeof(lpcb->request) ||
+	    lpcb->response.hdr.len > limit)
+		return -EINVAL;
+	if (lpcb->request.reserved2 != 0 || lpcb->request.reserved3 != 0)
+		return -EINVAL;
+	return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0;
+}
+
+static int clp_pci_query_grp(struct clp_req *req,
+			     struct clp_req_rsp_query_pci_grp *lpcb)
+{
+	unsigned long limit = PAGE_SIZE - sizeof(lpcb->request);
+
+	if (lpcb->request.hdr.len != sizeof(lpcb->request) ||
+	    lpcb->response.hdr.len > limit)
+		return -EINVAL;
+	if (lpcb->request.reserved2 != 0 || lpcb->request.reserved3 != 0 ||
+	    lpcb->request.reserved4 != 0)
+		return -EINVAL;
+	return clp_req(lpcb, CLP_LPS_PCI) ? -EOPNOTSUPP : 0;
+}
+
+static int clp_pci_command(struct clp_req *req, struct clp_req_hdr *lpcb)
+{
+	switch (lpcb->cmd) {
+	case 0x0001: /* store logical-processor characteristics */
+		return clp_pci_slpc(req, (void *) lpcb);
+	case 0x0002: /* list PCI functions */
+		return clp_pci_list(req, (void *) lpcb);
+	case 0x0003: /* query PCI function */
+		return clp_pci_query(req, (void *) lpcb);
+	case 0x0004: /* query PCI function group */
+		return clp_pci_query_grp(req, (void *) lpcb);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int clp_normal_command(struct clp_req *req)
+{
+	struct clp_req_hdr *lpcb;
+	void __user *uptr;
+	int rc;
+
+	rc = -EINVAL;
+	if (req->lps != 0 && req->lps != 2)
+		goto out;
+
+	rc = -ENOMEM;
+	lpcb = clp_alloc_block(GFP_KERNEL);
+	if (!lpcb)
+		goto out;
+
+	rc = -EFAULT;
+	uptr = (void __force __user *)(unsigned long) req->data_p;
+	if (copy_from_user(lpcb, uptr, PAGE_SIZE) != 0)
+		goto out_free;
+
+	rc = -EINVAL;
+	if (lpcb->fmt != 0 || lpcb->reserved1 != 0 || lpcb->reserved2 != 0)
+		goto out_free;
+
+	switch (req->lps) {
+	case 0:
+		rc = clp_base_command(req, lpcb);
+		break;
+	case 2:
+		rc = clp_pci_command(req, lpcb);
+		break;
+	}
+	if (rc)
+		goto out_free;
+
+	rc = -EFAULT;
+	if (copy_to_user(uptr, lpcb, PAGE_SIZE) != 0)
+		goto out_free;
+
+	rc = 0;
+
+out_free:
+	clp_free_block(lpcb);
+out:
+	return rc;
+}
+
+static int clp_immediate_command(struct clp_req *req)
+{
+	void __user *uptr;
+	unsigned long ilp;
+	int exists;
+
+	if (req->cmd > 1 || clp_get_ilp(&ilp) != 0)
+		return -EINVAL;
+
+	uptr = (void __force __user *)(unsigned long) req->data_p;
+	if (req->cmd == 0) {
+		/* Command code 0: test for a specific processor */
+		exists = test_bit_inv(req->lps, &ilp);
+		return put_user(exists, (int __user *) uptr);
+	}
+	/* Command code 1: return bit mask of installed processors */
+	return put_user(ilp, (unsigned long __user *) uptr);
+}
+
+static long clp_misc_ioctl(struct file *filp, unsigned int cmd,
+			   unsigned long arg)
+{
+	struct clp_req req;
+	void __user *argp;
+
+	if (cmd != CLP_SYNC)
+		return -EINVAL;
+
+	argp = is_compat_task() ? compat_ptr(arg) : (void __user *) arg;
+	if (copy_from_user(&req, argp, sizeof(req)))
+		return -EFAULT;
+	if (req.r != 0)
+		return -EINVAL;
+	return req.c ? clp_immediate_command(&req) : clp_normal_command(&req);
+}
+
+static int clp_misc_release(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static const struct file_operations clp_misc_fops = {
+	.owner = THIS_MODULE,
+	.open = nonseekable_open,
+	.release = clp_misc_release,
+	.unlocked_ioctl = clp_misc_ioctl,
+	.compat_ioctl = clp_misc_ioctl,
+	.llseek = no_llseek,
+};
+
+static struct miscdevice clp_misc_device = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "clp",
+	.fops = &clp_misc_fops,
+};
+
+static int __init clp_misc_init(void)
+{
+	return misc_register(&clp_misc_device);
+}
+
+device_initcall(clp_misc_init);
-- 
cgit v1.2.3


From ebde765c0e85f48534f98779b22349bf00761b61 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:08:09 +0100
Subject: s390/mm: uninline ptep_xxx functions from pgtable.h

The code in the various ptep_xxx functions has grown quite large,
consolidate them to four out-of-line functions:
  ptep_xchg_direct to exchange a pte with another with immediate flushing
  ptep_xchg_lazy to exchange a pte with another in a batched update
  ptep_modify_prot_start to begin a protection flags update
  ptep_modify_prot_commit to commit a protection flags update

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 374 ++++++----------------------------------
 arch/s390/kvm/kvm-s390.c        |   2 +-
 arch/s390/mm/pgtable.c          | 295 +++++++++++++++++++++++++++----
 3 files changed, 318 insertions(+), 353 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7be9ae808a8b..d102c4e23f91 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -680,69 +680,8 @@ static inline void pgste_set(pte_t *ptep, pgste_t pgste)
 #endif
 }
 
-static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
-				       struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	unsigned long address, bits, skey;
-
-	if (!mm_use_skey(mm) || pte_val(pte) & _PAGE_INVALID)
-		return pgste;
-	address = pte_val(pte) & PAGE_MASK;
-	skey = (unsigned long) page_get_storage_key(address);
-	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
-	/* Transfer page changed & referenced bit to guest bits in pgste */
-	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
-	/* Copy page access key and fetch protection bit to pgste */
-	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
-#endif
-	return pgste;
-
-}
-
-static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
-				 struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	unsigned long address;
-	unsigned long nkey;
-
-	if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID)
-		return;
-	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
-	address = pte_val(entry) & PAGE_MASK;
-	/*
-	 * Set page access key and fetch protection bit from pgste.
-	 * The guest C/R information is still in the PGSTE, set real
-	 * key C/R to 0.
-	 */
-	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
-	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
-	page_set_storage_key(address, nkey, 0);
-#endif
-}
-
-static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
-{
-	if ((pte_val(entry) & _PAGE_PRESENT) &&
-	    (pte_val(entry) & _PAGE_WRITE) &&
-	    !(pte_val(entry) & _PAGE_INVALID)) {
-		if (!MACHINE_HAS_ESOP) {
-			/*
-			 * Without enhanced suppression-on-protection force
-			 * the dirty bit on for all writable ptes.
-			 */
-			pte_val(entry) |= _PAGE_DIRTY;
-			pte_val(entry) &= ~_PAGE_PROTECT;
-		}
-		if (!(pte_val(entry) & _PAGE_PROTECT))
-			/* This pte allows write access, set user-dirty */
-			pgste_val(pgste) |= PGSTE_UC_BIT;
-	}
-	*ptep = entry;
-	return pgste;
-}
+bool pgste_test_and_clear_dirty(struct mm_struct *, unsigned long address);
+void ptep_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *);
 
 /**
  * struct gmap_struct - guest address space
@@ -791,47 +730,11 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
 int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
 void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
 void __gmap_zap(struct gmap *, unsigned long gaddr);
-bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *);
 
 
 void gmap_register_ipte_notifier(struct gmap_notifier *);
 void gmap_unregister_ipte_notifier(struct gmap_notifier *);
 int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
-void gmap_do_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *);
-
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
-					unsigned long addr,
-					pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	if (pgste_val(pgste) & PGSTE_IN_BIT) {
-		pgste_val(pgste) &= ~PGSTE_IN_BIT;
-		gmap_do_ipte_notify(mm, addr, ptep);
-	}
-#endif
-	return pgste;
-}
-
-/*
- * Certain architectures need to do special things when PTEs
- * within a page table are directly modified.  Thus, the following
- * hook is made available.
- */
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t entry)
-{
-	pgste_t pgste;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
-		pgste_set_key(ptep, pgste, entry, mm);
-		pgste = pgste_set_pte(ptep, pgste, entry);
-		pgste_set_unlock(ptep, pgste);
-	} else {
-		*ptep = entry;
-	}
-}
 
 /*
  * query functions pte_write/pte_dirty/pte_young only work if
@@ -998,95 +901,30 @@ static inline void __ptep_ipte_range(unsigned long address, int nr, pte_t *ptep)
 	} while (nr != 255);
 }
 
-static inline void ptep_flush_direct(struct mm_struct *mm,
-				     unsigned long address, pte_t *ptep)
-{
-	int active, count;
-
-	if (pte_val(*ptep) & _PAGE_INVALID)
-		return;
-	active = (mm == current->active_mm) ? 1 : 0;
-	count = atomic_add_return(0x10000, &mm->context.attach_count);
-	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
-	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
-		__ptep_ipte_local(address, ptep);
-	else
-		__ptep_ipte(address, ptep);
-	atomic_sub(0x10000, &mm->context.attach_count);
-}
-
-static inline void ptep_flush_lazy(struct mm_struct *mm,
-				   unsigned long address, pte_t *ptep)
-{
-	int active, count;
-
-	if (pte_val(*ptep) & _PAGE_INVALID)
-		return;
-	active = (mm == current->active_mm) ? 1 : 0;
-	count = atomic_add_return(0x10000, &mm->context.attach_count);
-	if ((count & 0xffff) <= active) {
-		pte_val(*ptep) |= _PAGE_INVALID;
-		mm->context.flush_mm = 1;
-	} else
-		__ptep_ipte(address, ptep);
-	atomic_sub(0x10000, &mm->context.attach_count);
-}
-
 /*
- * Get (and clear) the user dirty bit for a pte.
+ * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
+ * both clear the TLB for the unmapped pte. The reason is that
+ * ptep_get_and_clear is used in common code (e.g. change_pte_range)
+ * to modify an active pte. The sequence is
+ *   1) ptep_get_and_clear
+ *   2) set_pte_at
+ *   3) flush_tlb_range
+ * On s390 the tlb needs to get flushed with the modification of the pte
+ * if the pte is active. The only way how this can be implemented is to
+ * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
+ * is a nop.
  */
-static inline int ptep_test_and_clear_user_dirty(struct mm_struct *mm,
-						 unsigned long addr,
-						 pte_t *ptep)
-{
-	pgste_t pgste;
-	pte_t pte;
-	int dirty;
-
-	if (!mm_has_pgste(mm))
-		return 0;
-	pgste = pgste_get_lock(ptep);
-	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
-	pgste_val(pgste) &= ~PGSTE_UC_BIT;
-	pte = *ptep;
-	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
-		__ptep_ipte(addr, ptep);
-		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
-			pte_val(pte) |= _PAGE_PROTECT;
-		else
-			pte_val(pte) |= _PAGE_INVALID;
-		*ptep = pte;
-	}
-	pgste_set_unlock(ptep, pgste);
-	return dirty;
-}
+pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t);
+pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t);
 
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long addr, pte_t *ptep)
 {
-	pgste_t pgste;
-	pte_t pte;
-	int young;
-
-	if (mm_has_pgste(vma->vm_mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste);
-	}
-
-	pte = *ptep;
-	ptep_flush_direct(vma->vm_mm, addr, ptep);
-	young = pte_young(pte);
-
-	if (mm_has_pgste(vma->vm_mm)) {
-		pgste = pgste_update_all(pte, pgste, vma->vm_mm);
-		pgste = pgste_set_pte(ptep, pgste, pte_mkold(pte));
-		pgste_set_unlock(ptep, pgste);
-	} else
-		*ptep = pte_mkold(pte);
+	pte_t pte = *ptep;
 
-	return young;
+	pte = ptep_xchg_direct(vma->vm_mm, addr, ptep, pte_mkold(pte));
+	return pte_young(pte);
 }
 
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -1096,104 +934,22 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 	return ptep_test_and_clear_young(vma, address, ptep);
 }
 
-/*
- * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
- * both clear the TLB for the unmapped pte. The reason is that
- * ptep_get_and_clear is used in common code (e.g. change_pte_range)
- * to modify an active pte. The sequence is
- *   1) ptep_get_and_clear
- *   2) set_pte_at
- *   3) flush_tlb_range
- * On s390 the tlb needs to get flushed with the modification of the pte
- * if the pte is active. The only way how this can be implemented is to
- * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
- * is a nop.
- */
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
-				       unsigned long address, pte_t *ptep)
+				       unsigned long addr, pte_t *ptep)
 {
-	pgste_t pgste;
-	pte_t pte;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(mm, address, ptep, pgste);
-	}
-
-	pte = *ptep;
-	ptep_flush_lazy(mm, address, ptep);
-	pte_val(*ptep) = _PAGE_INVALID;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_update_all(pte, pgste, mm);
-		pgste_set_unlock(ptep, pgste);
-	}
-	return pte;
+	return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
 }
 
 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
-					   unsigned long address,
-					   pte_t *ptep)
-{
-	pgste_t pgste;
-	pte_t pte;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste_ipte_notify(mm, address, ptep, pgste);
-	}
-
-	pte = *ptep;
-	ptep_flush_lazy(mm, address, ptep);
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_update_all(pte, pgste, mm);
-		pgste_set(ptep, pgste);
-	}
-	return pte;
-}
-
-static inline void ptep_modify_prot_commit(struct mm_struct *mm,
-					   unsigned long address,
-					   pte_t *ptep, pte_t pte)
-{
-	pgste_t pgste;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get(ptep);
-		pgste_set_key(ptep, pgste, pte, mm);
-		pgste = pgste_set_pte(ptep, pgste, pte);
-		pgste_set_unlock(ptep, pgste);
-	} else
-		*ptep = pte;
-}
+pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t);
 
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
-				     unsigned long address, pte_t *ptep)
+				     unsigned long addr, pte_t *ptep)
 {
-	pgste_t pgste;
-	pte_t pte;
-
-	if (mm_has_pgste(vma->vm_mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste);
-	}
-
-	pte = *ptep;
-	ptep_flush_direct(vma->vm_mm, address, ptep);
-	pte_val(*ptep) = _PAGE_INVALID;
-
-	if (mm_has_pgste(vma->vm_mm)) {
-		if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
-		    _PGSTE_GPS_USAGE_UNUSED)
-			pte_val(pte) |= _PAGE_UNUSED;
-		pgste = pgste_update_all(pte, pgste, vma->vm_mm);
-		pgste_set_unlock(ptep, pgste);
-	}
-	return pte;
+	return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
 }
 
 /*
@@ -1205,80 +961,52 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
  */
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
-					    unsigned long address,
+					    unsigned long addr,
 					    pte_t *ptep, int full)
 {
-	pgste_t pgste;
-	pte_t pte;
-
-	if (!full && mm_has_pgste(mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(mm, address, ptep, pgste);
-	}
-
-	pte = *ptep;
-	if (!full)
-		ptep_flush_lazy(mm, address, ptep);
-	pte_val(*ptep) = _PAGE_INVALID;
-
-	if (!full && mm_has_pgste(mm)) {
-		pgste = pgste_update_all(pte, pgste, mm);
-		pgste_set_unlock(ptep, pgste);
+	if (full) {
+		pte_t pte = *ptep;
+		*ptep = __pte(_PAGE_INVALID);
+		return pte;
 	}
-	return pte;
+	return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
 }
 
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline pte_t ptep_set_wrprotect(struct mm_struct *mm,
-				       unsigned long address, pte_t *ptep)
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
 {
-	pgste_t pgste;
 	pte_t pte = *ptep;
 
-	if (pte_write(pte)) {
-		if (mm_has_pgste(mm)) {
-			pgste = pgste_get_lock(ptep);
-			pgste = pgste_ipte_notify(mm, address, ptep, pgste);
-		}
-
-		ptep_flush_lazy(mm, address, ptep);
-		pte = pte_wrprotect(pte);
-
-		if (mm_has_pgste(mm)) {
-			pgste = pgste_set_pte(ptep, pgste, pte);
-			pgste_set_unlock(ptep, pgste);
-		} else
-			*ptep = pte;
-	}
-	return pte;
+	if (pte_write(pte))
+		ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
 }
 
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 static inline int ptep_set_access_flags(struct vm_area_struct *vma,
-					unsigned long address, pte_t *ptep,
+					unsigned long addr, pte_t *ptep,
 					pte_t entry, int dirty)
 {
-	pgste_t pgste;
-	pte_t oldpte;
-
-	oldpte = *ptep;
-	if (pte_same(oldpte, entry))
+	if (pte_same(*ptep, entry))
 		return 0;
-	if (mm_has_pgste(vma->vm_mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste);
-	}
+	ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
+	return 1;
+}
 
-	ptep_flush_direct(vma->vm_mm, address, ptep);
+void set_pte_pgste_at(struct mm_struct *, unsigned long, pte_t *, pte_t);
 
-	if (mm_has_pgste(vma->vm_mm)) {
-		if (pte_val(oldpte) & _PAGE_INVALID)
-			pgste_set_key(ptep, pgste, entry, vma->vm_mm);
-		pgste = pgste_set_pte(ptep, pgste, entry);
-		pgste_set_unlock(ptep, pgste);
-	} else
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified.  Thus, the following
+ * hook is made available.
+ */
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep, pte_t entry)
+{
+	if (mm_has_pgste(mm))
+		set_pte_pgste_at(mm, addr, ptep, entry);
+	else
 		*ptep = entry;
-	return 1;
 }
 
 /*
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4af21c771f9b..616e0a16ee88 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -280,7 +280,7 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
 	for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
 		address = gfn_to_hva_memslot(memslot, cur_gfn);
 
-		if (gmap_test_and_clear_dirty(address, gmap))
+		if (pgste_test_and_clear_dirty(gmap->mm, address))
 			mark_page_dirty(kvm, cur_gfn);
 	}
 	up_read(&gmap->mm->mmap_sem);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6acd7174fe75..30033aad17da 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -772,7 +772,7 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
 EXPORT_SYMBOL_GPL(gmap_ipte_notify);
 
 /**
- * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
+ * ptep_ipte_notify - call all invalidation callbacks for a specific pte.
  * @mm: pointer to the process mm_struct
  * @addr: virtual address in the process address space
  * @pte: pointer to the page table entry
@@ -780,7 +780,7 @@ EXPORT_SYMBOL_GPL(gmap_ipte_notify);
  * This function is assumed to be called with the page table lock held
  * for the pte to notify.
  */
-void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
 {
 	unsigned long offset, gaddr;
 	unsigned long *table;
@@ -801,7 +801,7 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
 	}
 	spin_unlock(&gmap_notifier_lock);
 }
-EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);
+EXPORT_SYMBOL_GPL(ptep_ipte_notify);
 
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 			  unsigned long key, bool nq)
@@ -1158,6 +1158,266 @@ static inline void thp_split_mm(struct mm_struct *mm)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline pte_t ptep_flush_direct(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
+{
+	int active, count;
+	pte_t old;
+
+	old = *ptep;
+	if (unlikely(pte_val(old) & _PAGE_INVALID))
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+		__ptep_ipte_local(addr, ptep);
+	else
+		__ptep_ipte(addr, ptep);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
+}
+
+static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
+				    unsigned long addr, pte_t *ptep)
+{
+	int active, count;
+	pte_t old;
+
+	old = *ptep;
+	if (unlikely(pte_val(old) & _PAGE_INVALID))
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if ((count & 0xffff) <= active) {
+		pte_val(*ptep) |= _PAGE_INVALID;
+		mm->context.flush_mm = 1;
+	} else
+		__ptep_ipte(addr, ptep);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
+}
+
+static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
+				       struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+	unsigned long address, bits, skey;
+
+	if (!mm_use_skey(mm) || pte_val(pte) & _PAGE_INVALID)
+		return pgste;
+	address = pte_val(pte) & PAGE_MASK;
+	skey = (unsigned long) page_get_storage_key(address);
+	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+	/* Transfer page changed & referenced bit to guest bits in pgste */
+	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
+	/* Copy page access key and fetch protection bit to pgste */
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+#endif
+	return pgste;
+
+}
+
+static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
+				 struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+	unsigned long address;
+	unsigned long nkey;
+
+	if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID)
+		return;
+	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
+	address = pte_val(entry) & PAGE_MASK;
+	/*
+	 * Set page access key and fetch protection bit from pgste.
+	 * The guest C/R information is still in the PGSTE, set real
+	 * key C/R to 0.
+	 */
+	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
+	page_set_storage_key(address, nkey, 0);
+#endif
+}
+
+static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
+{
+#ifdef CONFIG_PGSTE
+	if ((pte_val(entry) & _PAGE_PRESENT) &&
+	    (pte_val(entry) & _PAGE_WRITE) &&
+	    !(pte_val(entry) & _PAGE_INVALID)) {
+		if (!MACHINE_HAS_ESOP) {
+			/*
+			 * Without enhanced suppression-on-protection force
+			 * the dirty bit on for all writable ptes.
+			 */
+			pte_val(entry) |= _PAGE_DIRTY;
+			pte_val(entry) &= ~_PAGE_PROTECT;
+		}
+		if (!(pte_val(entry) & _PAGE_PROTECT))
+			/* This pte allows write access, set user-dirty */
+			pgste_val(pgste) |= PGSTE_UC_BIT;
+	}
+#endif
+	*ptep = entry;
+	return pgste;
+}
+
+static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
+					unsigned long addr,
+					pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+	if (pgste_val(pgste) & PGSTE_IN_BIT) {
+		pgste_val(pgste) &= ~PGSTE_IN_BIT;
+		ptep_ipte_notify(mm, addr, ptep);
+	}
+#endif
+	return pgste;
+}
+
+#ifdef CONFIG_PGSTE
+/*
+ * Test and reset if a guest page is dirty
+ */
+bool pgste_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+	pte_t pte;
+	bool dirty;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep))
+		return false;
+
+	pgste = pgste_get_lock(ptep);
+	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+	pgste_val(pgste) &= ~PGSTE_UC_BIT;
+	pte = *ptep;
+	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
+		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+		__ptep_ipte(addr, ptep);
+		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+			pte_val(pte) |= _PAGE_PROTECT;
+		else
+			pte_val(pte) |= _PAGE_INVALID;
+		*ptep = pte;
+	}
+	pgste_set_unlock(ptep, pgste);
+
+	spin_unlock(ptl);
+	return dirty;
+}
+EXPORT_SYMBOL_GPL(pgste_test_and_clear_dirty);
+
+void set_pte_pgste_at(struct mm_struct *mm, unsigned long addr,
+		      pte_t *ptep, pte_t entry)
+{
+	pgste_t pgste;
+
+	/* the mm_has_pgste() check is done in set_pte_at() */
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
+	pgste_set_key(ptep, pgste, entry, mm);
+	pgste = pgste_set_pte(ptep, pgste, entry);
+	pgste_set_unlock(ptep, pgste);
+}
+EXPORT_SYMBOL(set_pte_pgste_at);
+#endif
+
+static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
+{
+	pgste_t pgste = __pgste(0);
+
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_get_lock(ptep);
+		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+	}
+	return pgste;
+}
+
+static inline void ptep_xchg_commit(struct mm_struct *mm,
+				    unsigned long addr, pte_t *ptep,
+				    pgste_t pgste, pte_t old, pte_t new)
+{
+	if (mm_has_pgste(mm)) {
+		if (pte_val(old) & _PAGE_INVALID)
+			pgste_set_key(ptep, pgste, new, mm);
+		if (pte_val(new) & _PAGE_INVALID) {
+			pgste = pgste_update_all(old, pgste, mm);
+			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
+			    _PGSTE_GPS_USAGE_UNUSED)
+				pte_val(old) |= _PAGE_UNUSED;
+		}
+		pgste = pgste_set_pte(ptep, pgste, new);
+		pgste_set_unlock(ptep, pgste);
+	} else {
+		*ptep = new;
+	}
+}
+
+pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
+		       pte_t *ptep, pte_t new)
+{
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_direct(mm, addr, ptep);
+	ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	return old;
+}
+EXPORT_SYMBOL(ptep_xchg_direct);
+
+pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t new)
+{
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_lazy(mm, addr, ptep);
+	ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	return old;
+}
+EXPORT_SYMBOL(ptep_xchg_lazy);
+
+pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+			     pte_t *ptep)
+{
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_lazy(mm, addr, ptep);
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_update_all(old, pgste, mm);
+		pgste_set(ptep, pgste);
+	}
+	return old;
+}
+EXPORT_SYMBOL(ptep_modify_prot_start);
+
+void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+			     pte_t *ptep, pte_t pte)
+{
+	pgste_t pgste;
+
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_get(ptep);
+		pgste_set_key(ptep, pgste, pte, mm);
+		pgste = pgste_set_pte(ptep, pgste, pte);
+		pgste_set_unlock(ptep, pgste);
+	} else {
+		*ptep = pte;
+	}
+}
+EXPORT_SYMBOL(ptep_modify_prot_commit);
+
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
@@ -1190,17 +1450,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long addr,
 	unsigned long ptev;
 	pgste_t pgste;
 
-	pgste = pgste_get_lock(pte);
 	/*
 	 * Remove all zero page mappings,
 	 * after establishing a policy to forbid zero page mappings
 	 * following faults for that page will get fresh anonymous pages
 	 */
-	if (is_zero_pfn(pte_pfn(*pte))) {
-		ptep_flush_direct(walk->mm, addr, pte);
-		pte_val(*pte) = _PAGE_INVALID;
-	}
+	if (is_zero_pfn(pte_pfn(*pte)))
+		ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
 	/* Clear storage key */
+	pgste = pgste_get_lock(pte);
 	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
 			      PGSTE_GR_BIT | PGSTE_GC_BIT);
 	ptev = pte_val(*pte);
@@ -1266,27 +1524,6 @@ void s390_reset_cmma(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(s390_reset_cmma);
 
-/*
- * Test and reset if a guest page is dirty
- */
-bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	bool dirty = false;
-
-	pte = get_locked_pte(gmap->mm, address, &ptl);
-	if (unlikely(!pte))
-		return false;
-
-	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
-		dirty = true;
-
-	spin_unlock(ptl);
-	return dirty;
-}
-EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmdp)
-- 
cgit v1.2.3


From 227be799c39a28bf5d68187a4ea1b43190d96515 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:09:25 +0100
Subject: s390/mm: uninline pmdp_xxx functions from pgtable.h

The pmdp_xxx function are smaller than their ptep_xxx counterparts
but to keep things symmetrical unline them as well.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h | 136 ++++++++++++++++------------------------
 arch/s390/mm/hugetlbpage.c      |   7 +--
 arch/s390/mm/pgtable.c          |  93 +++++++++++++++++++--------
 3 files changed, 124 insertions(+), 112 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index d102c4e23f91..572001c8913d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -520,15 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
 	return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 }
 
-#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
-extern int pmdp_set_access_flags(struct vm_area_struct *vma,
-				 unsigned long address, pmd_t *pmdp,
-				 pmd_t entry, int dirty);
-
-#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
-				  unsigned long address, pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
@@ -1203,54 +1194,51 @@ static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
 		: "cc" );
 }
 
-static inline void pmdp_flush_direct(struct mm_struct *mm,
-				     unsigned long address, pmd_t *pmdp)
-{
-	int active, count;
+pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
+pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
 
-	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
-		return;
-	if (!MACHINE_HAS_IDTE) {
-		__pmdp_csp(pmdp);
-		return;
-	}
-	active = (mm == current->active_mm) ? 1 : 0;
-	count = atomic_add_return(0x10000, &mm->context.attach_count);
-	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
-	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
-		__pmdp_idte_local(address, pmdp);
-	else
-		__pmdp_idte(address, pmdp);
-	atomic_sub(0x10000, &mm->context.attach_count);
-}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable);
+
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
-static inline void pmdp_flush_lazy(struct mm_struct *mm,
-				   unsigned long address, pmd_t *pmdp)
+#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
+					unsigned long addr, pmd_t *pmdp,
+					pmd_t entry, int dirty)
 {
-	int active, count;
+	VM_BUG_ON(addr & ~HPAGE_MASK);
 
-	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
-		return;
-	active = (mm == current->active_mm) ? 1 : 0;
-	count = atomic_add_return(0x10000, &mm->context.attach_count);
-	if ((count & 0xffff) <= active) {
-		pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
-		mm->context.flush_mm = 1;
-	} else if (MACHINE_HAS_IDTE)
-		__pmdp_idte(address, pmdp);
-	else
-		__pmdp_csp(pmdp);
-	atomic_sub(0x10000, &mm->context.attach_count);
+	entry = pmd_mkyoung(entry);
+	if (dirty)
+		entry = pmd_mkdirty(entry);
+	if (pmd_val(*pmdp) == pmd_val(entry))
+		return 0;
+	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, entry);
+	return 1;
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd = *pmdp;
 
-#define __HAVE_ARCH_PGTABLE_DEPOSIT
-extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-				       pgtable_t pgtable);
+	pmd = pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd_mkold(pmd));
+	return pmd_young(pmd);
+}
 
-#define __HAVE_ARCH_PGTABLE_WITHDRAW
-extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
+					 unsigned long addr, pmd_t *pmdp)
+{
+	VM_BUG_ON(addr & ~HPAGE_MASK);
+	return pmdp_test_and_clear_young(vma, addr, pmdp);
+}
 
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			      pmd_t *pmdp, pmd_t entry)
@@ -1266,66 +1254,48 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
-					    unsigned long address, pmd_t *pmdp)
-{
-	pmd_t pmd;
-
-	pmd = *pmdp;
-	pmdp_flush_direct(vma->vm_mm, address, pmdp);
-	*pmdp = pmd_mkold(pmd);
-	return pmd_young(pmd);
-}
-
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
-					    unsigned long address, pmd_t *pmdp)
+					    unsigned long addr, pmd_t *pmdp)
 {
-	pmd_t pmd = *pmdp;
-
-	pmdp_flush_direct(mm, address, pmdp);
-	pmd_clear(pmdp);
-	return pmd;
+	return pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
-						 unsigned long address,
+						 unsigned long addr,
 						 pmd_t *pmdp, int full)
 {
-	pmd_t pmd = *pmdp;
-
-	if (!full)
-		pmdp_flush_lazy(mm, address, pmdp);
-	pmd_clear(pmdp);
-	return pmd;
+	if (full) {
+		pmd_t pmd = *pmdp;
+		*pmdp = __pmd(_SEGMENT_ENTRY_INVALID);
+		return pmd;
+	}
+	return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
 static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
-					  unsigned long address, pmd_t *pmdp)
+					  unsigned long addr, pmd_t *pmdp)
 {
-	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+	return pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
 static inline void pmdp_invalidate(struct vm_area_struct *vma,
-				   unsigned long address, pmd_t *pmdp)
+				   unsigned long addr, pmd_t *pmdp)
 {
-	pmdp_flush_direct(vma->vm_mm, address, pmdp);
+	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm,
-				      unsigned long address, pmd_t *pmdp)
+				      unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 
-	if (pmd_write(pmd)) {
-		pmdp_flush_direct(mm, address, pmdp);
-		set_pmd_at(mm, address, pmdp, pmd_wrprotect(pmd));
-	}
+	if (pmd_write(pmd))
+		pmd = pmdp_xchg_lazy(mm, addr, pmdp, pmd_wrprotect(pmd));
 }
 
 static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index f81096b6940d..1b5e8983f4f3 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -105,11 +105,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 			      unsigned long addr, pte_t *ptep)
 {
 	pmd_t *pmdp = (pmd_t *) ptep;
-	pte_t pte = huge_ptep_get(ptep);
+	pmd_t old;
 
-	pmdp_flush_direct(mm, addr, pmdp);
-	pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
-	return pte;
+	old = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+	return __pmd_to_pte(old);
 }
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 30033aad17da..e24126208614 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1418,6 +1418,74 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(ptep_modify_prot_commit);
 
+static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
+				      unsigned long addr, pmd_t *pmdp)
+{
+	int active, count;
+	pmd_t old;
+
+	old = *pmdp;
+	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+		return old;
+	if (!MACHINE_HAS_IDTE) {
+		__pmdp_csp(pmdp);
+		return old;
+	}
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+		__pmdp_idte_local(addr, pmdp);
+	else
+		__pmdp_idte(addr, pmdp);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
+}
+
+static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
+				    unsigned long addr, pmd_t *pmdp)
+{
+	int active, count;
+	pmd_t old;
+
+	old = *pmdp;
+	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if ((count & 0xffff) <= active) {
+		pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+		mm->context.flush_mm = 1;
+	} else if (MACHINE_HAS_IDTE)
+		__pmdp_idte(addr, pmdp);
+	else
+		__pmdp_csp(pmdp);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
+}
+
+pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+		       pmd_t *pmdp, pmd_t new)
+{
+	pmd_t old;
+
+	old = pmdp_flush_direct(mm, addr, pmdp);
+	*pmdp = new;
+	return old;
+}
+EXPORT_SYMBOL(pmdp_xchg_direct);
+
+pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+		     pmd_t *pmdp, pmd_t new)
+{
+	pmd_t old;
+
+	old = pmdp_flush_lazy(mm, addr, pmdp);
+	*pmdp = new;
+	return old;
+}
+EXPORT_SYMBOL(pmdp_xchg_lazy);
+
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
@@ -1525,31 +1593,6 @@ void s390_reset_cmma(struct mm_struct *mm)
 EXPORT_SYMBOL_GPL(s390_reset_cmma);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
-			   pmd_t *pmdp)
-{
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	/* No need to flush TLB
-	 * On s390 reference bits are in storage key and never in TLB */
-	return pmdp_test_and_clear_young(vma, address, pmdp);
-}
-
-int pmdp_set_access_flags(struct vm_area_struct *vma,
-			  unsigned long address, pmd_t *pmdp,
-			  pmd_t entry, int dirty)
-{
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-	entry = pmd_mkyoung(entry);
-	if (dirty)
-		entry = pmd_mkdirty(entry);
-	if (pmd_same(*pmdp, entry))
-		return 0;
-	pmdp_invalidate(vma, address, pmdp);
-	set_pmd_at(vma->vm_mm, address, pmdp, entry);
-	return 1;
-}
-
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				pgtable_t pgtable)
 {
-- 
cgit v1.2.3


From 1e133ab296f3ff8d9e58a5e758291ed39ba72ad7 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 8 Mar 2016 11:49:57 +0100
Subject: s390/mm: split arch/s390/mm/pgtable.c

The pgtable.c file is quite big, before it grows any larger split it
into pgtable.c, pgalloc.c and gmap.c. In addition move the gmap related
header definitions into the new gmap.h header and all of the pgste
helpers from pgtable.h to pgtable.c.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/gmap.h    |   64 ++
 arch/s390/include/asm/pgalloc.h |    4 -
 arch/s390/include/asm/pgtable.h |  123 +---
 arch/s390/kernel/asm-offsets.c  |    1 +
 arch/s390/kvm/diag.c            |    1 +
 arch/s390/kvm/interrupt.c       |    1 +
 arch/s390/kvm/kvm-s390.c        |    3 +-
 arch/s390/kvm/priv.c            |    1 +
 arch/s390/mm/Makefile           |    4 +-
 arch/s390/mm/fault.c            |    1 +
 arch/s390/mm/gmap.c             |  774 ++++++++++++++++++++
 arch/s390/mm/pgalloc.c          |  360 +++++++++
 arch/s390/mm/pgtable.c          | 1532 ++++++---------------------------------
 13 files changed, 1464 insertions(+), 1405 deletions(-)
 create mode 100644 arch/s390/include/asm/gmap.h
 create mode 100644 arch/s390/mm/gmap.c
 create mode 100644 arch/s390/mm/pgalloc.c

(limited to 'arch')

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
new file mode 100644
index 000000000000..d054c1b07a3c
--- /dev/null
+++ b/arch/s390/include/asm/gmap.h
@@ -0,0 +1,64 @@
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2016
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#ifndef _ASM_S390_GMAP_H
+#define _ASM_S390_GMAP_H
+
+/**
+ * struct gmap_struct - guest address space
+ * @crst_list: list of all crst tables used in the guest address space
+ * @mm: pointer to the parent mm_struct
+ * @guest_to_host: radix tree with guest to host address translation
+ * @host_to_guest: radix tree with pointer to segment table entries
+ * @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @table: pointer to the page directory
+ * @asce: address space control element for gmap page table
+ * @pfault_enabled: defines if pfaults are applicable for the guest
+ */
+struct gmap {
+	struct list_head list;
+	struct list_head crst_list;
+	struct mm_struct *mm;
+	struct radix_tree_root guest_to_host;
+	struct radix_tree_root host_to_guest;
+	spinlock_t guest_table_lock;
+	unsigned long *table;
+	unsigned long asce;
+	unsigned long asce_end;
+	void *private;
+	bool pfault_enabled;
+};
+
+/**
+ * struct gmap_notifier - notify function block for page invalidation
+ * @notifier_call: address of callback function
+ */
+struct gmap_notifier {
+	struct list_head list;
+	void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+};
+
+struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
+void gmap_free(struct gmap *gmap);
+void gmap_enable(struct gmap *gmap);
+void gmap_disable(struct gmap *gmap);
+int gmap_map_segment(struct gmap *gmap, unsigned long from,
+		     unsigned long to, unsigned long len);
+int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
+unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
+unsigned long gmap_translate(struct gmap *, unsigned long gaddr);
+int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
+int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
+void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
+void __gmap_zap(struct gmap *, unsigned long gaddr);
+void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
+
+void gmap_register_ipte_notifier(struct gmap_notifier *);
+void gmap_unregister_ipte_notifier(struct gmap_notifier *);
+int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+
+#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 7b7858f158b4..92487193706c 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -23,10 +23,6 @@ void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
 extern int page_table_allocate_pgste;
 
-int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned long key, bool nq);
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
-
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
 {
 	typedef struct { char _[n]; } addrtype;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 572001c8913d..2f66645587a2 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -622,111 +622,6 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 	return pmd;
 }
 
-static inline pgste_t pgste_get_lock(pte_t *ptep)
-{
-	unsigned long new = 0;
-#ifdef CONFIG_PGSTE
-	unsigned long old;
-
-	preempt_disable();
-	asm(
-		"	lg	%0,%2\n"
-		"0:	lgr	%1,%0\n"
-		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
-		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
-		"	csg	%0,%1,%2\n"
-		"	jl	0b\n"
-		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
-		: "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
-#endif
-	return __pgste(new);
-}
-
-static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	asm(
-		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
-		"	stg	%1,%0\n"
-		: "=Q" (ptep[PTRS_PER_PTE])
-		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
-		: "cc", "memory");
-	preempt_enable();
-#endif
-}
-
-static inline pgste_t pgste_get(pte_t *ptep)
-{
-	unsigned long pgste = 0;
-#ifdef CONFIG_PGSTE
-	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
-#endif
-	return __pgste(pgste);
-}
-
-static inline void pgste_set(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
-#endif
-}
-
-bool pgste_test_and_clear_dirty(struct mm_struct *, unsigned long address);
-void ptep_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *);
-
-/**
- * struct gmap_struct - guest address space
- * @crst_list: list of all crst tables used in the guest address space
- * @mm: pointer to the parent mm_struct
- * @guest_to_host: radix tree with guest to host address translation
- * @host_to_guest: radix tree with pointer to segment table entries
- * @guest_table_lock: spinlock to protect all entries in the guest page table
- * @table: pointer to the page directory
- * @asce: address space control element for gmap page table
- * @pfault_enabled: defines if pfaults are applicable for the guest
- */
-struct gmap {
-	struct list_head list;
-	struct list_head crst_list;
-	struct mm_struct *mm;
-	struct radix_tree_root guest_to_host;
-	struct radix_tree_root host_to_guest;
-	spinlock_t guest_table_lock;
-	unsigned long *table;
-	unsigned long asce;
-	unsigned long asce_end;
-	void *private;
-	bool pfault_enabled;
-};
-
-/**
- * struct gmap_notifier - notify function block for page invalidation
- * @notifier_call: address of callback function
- */
-struct gmap_notifier {
-	struct list_head list;
-	void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
-};
-
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
-void gmap_enable(struct gmap *gmap);
-void gmap_disable(struct gmap *gmap);
-int gmap_map_segment(struct gmap *gmap, unsigned long from,
-		     unsigned long to, unsigned long len);
-int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
-unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
-unsigned long gmap_translate(struct gmap *, unsigned long gaddr);
-int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
-int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
-void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
-void __gmap_zap(struct gmap *, unsigned long gaddr);
-
-
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
-
 /*
  * query functions pte_write/pte_dirty/pte_young only work if
  * pte_present() is true. Undefined behaviour if not..
@@ -984,7 +879,21 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 	return 1;
 }
 
-void set_pte_pgste_at(struct mm_struct *, unsigned long, pte_t *, pte_t);
+/*
+ * Additional functions to handle KVM guest page tables
+ */
+void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry);
+void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep , int reset);
+void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+
+bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
+int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			  unsigned char key, bool nq);
+unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
 
 /*
  * Certain architectures need to do special things when PTEs
@@ -995,7 +904,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t entry)
 {
 	if (mm_has_pgste(mm))
-		set_pte_pgste_at(mm, addr, ptep, entry);
+		ptep_set_pte_at(mm, addr, ptep, entry);
 	else
 		*ptep = entry;
 }
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 53bbc9e8b281..1f95cc1faeb7 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -12,6 +12,7 @@
 #include <asm/idle.h>
 #include <asm/vdso.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 
 /*
  * Make sure that the compiler is new enough. We want a compiler that
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 05f7de9869a9..1ea4095b67d7 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -14,6 +14,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/pgalloc.h>
+#include <asm/gmap.h>
 #include <asm/virtio-ccw.h>
 #include "kvm-s390.h"
 #include "trace.h"
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f88ca72c3a77..e5e8739dcde3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -23,6 +23,7 @@
 #include <asm/uaccess.h>
 #include <asm/sclp.h>
 #include <asm/isc.h>
+#include <asm/gmap.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "trace-s390.h"
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 616e0a16ee88..be1f0288443e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -30,6 +30,7 @@
 #include <asm/lowcore.h>
 #include <asm/etr.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include <asm/nmi.h>
 #include <asm/switch_to.h>
 #include <asm/isc.h>
@@ -280,7 +281,7 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
 	for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
 		address = gfn_to_hva_memslot(memslot, cur_gfn);
 
-		if (pgste_test_and_clear_dirty(gmap->mm, address))
+		if (test_and_clear_guest_dirty(gmap->mm, address))
 			mark_page_dirty(kvm, cur_gfn);
 	}
 	up_read(&gmap->mm->mmap_sem);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index ed74e86d9b9e..b632d8dda9ad 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -23,6 +23,7 @@
 #include <asm/sysinfo.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/gmap.h>
 #include <asm/io.h>
 #include <asm/ptrace.h>
 #include <asm/compat.h>
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 839592ca265c..2ae54cad2b6a 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -2,9 +2,11 @@
 # Makefile for the linux s390-specific parts of the memory manager.
 #
 
-obj-y		:= init.o fault.o extmem.o mmap.o vmem.o pgtable.o maccess.o
+obj-y		:= init.o fault.o extmem.o mmap.o vmem.o maccess.o
 obj-y		+= page-states.o gup.o extable.o pageattr.o mem_detect.o
+obj-y		+= pgtable.o pgalloc.o
 
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_S390_PTDUMP)	+= dump_pagetables.o
+obj-$(CONFIG_PGSTE)		+= gmap.o
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 64b3ad1d6575..cce577feab1e 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -32,6 +32,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/diag.h>
 #include <asm/pgtable.h>
+#include <asm/gmap.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/facility.h>
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
new file mode 100644
index 000000000000..69247b4dcc43
--- /dev/null
+++ b/arch/s390/mm/gmap.c
@@ -0,0 +1,774 @@
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2016
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/tlb.h>
+
+/**
+ * gmap_alloc - allocate a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+{
+	struct gmap *gmap;
+	struct page *page;
+	unsigned long *table;
+	unsigned long etype, atype;
+
+	if (limit < (1UL << 31)) {
+		limit = (1UL << 31) - 1;
+		atype = _ASCE_TYPE_SEGMENT;
+		etype = _SEGMENT_ENTRY_EMPTY;
+	} else if (limit < (1UL << 42)) {
+		limit = (1UL << 42) - 1;
+		atype = _ASCE_TYPE_REGION3;
+		etype = _REGION3_ENTRY_EMPTY;
+	} else if (limit < (1UL << 53)) {
+		limit = (1UL << 53) - 1;
+		atype = _ASCE_TYPE_REGION2;
+		etype = _REGION2_ENTRY_EMPTY;
+	} else {
+		limit = -1UL;
+		atype = _ASCE_TYPE_REGION1;
+		etype = _REGION1_ENTRY_EMPTY;
+	}
+	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+	if (!gmap)
+		goto out;
+	INIT_LIST_HEAD(&gmap->crst_list);
+	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
+	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+	spin_lock_init(&gmap->guest_table_lock);
+	gmap->mm = mm;
+	page = alloc_pages(GFP_KERNEL, 2);
+	if (!page)
+		goto out_free;
+	page->index = 0;
+	list_add(&page->lru, &gmap->crst_list);
+	table = (unsigned long *) page_to_phys(page);
+	crst_table_init(table, etype);
+	gmap->table = table;
+	gmap->asce = atype | _ASCE_TABLE_LENGTH |
+		_ASCE_USER_BITS | __pa(table);
+	gmap->asce_end = limit;
+	down_write(&mm->mmap_sem);
+	list_add(&gmap->list, &mm->context.gmap_list);
+	up_write(&mm->mmap_sem);
+	return gmap;
+
+out_free:
+	kfree(gmap);
+out:
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(gmap_alloc);
+
+static void gmap_flush_tlb(struct gmap *gmap)
+{
+	if (MACHINE_HAS_IDTE)
+		__tlb_flush_asce(gmap->mm, gmap->asce);
+	else
+		__tlb_flush_global();
+}
+
+static void gmap_radix_tree_free(struct radix_tree_root *root)
+{
+	struct radix_tree_iter iter;
+	unsigned long indices[16];
+	unsigned long index;
+	void **slot;
+	int i, nr;
+
+	/* A radix tree is freed by deleting all of its entries */
+	index = 0;
+	do {
+		nr = 0;
+		radix_tree_for_each_slot(slot, root, &iter, index) {
+			indices[nr] = iter.index;
+			if (++nr == 16)
+				break;
+		}
+		for (i = 0; i < nr; i++) {
+			index = indices[i];
+			radix_tree_delete(root, index);
+		}
+	} while (nr > 0);
+}
+
+/**
+ * gmap_free - free a guest address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_free(struct gmap *gmap)
+{
+	struct page *page, *next;
+
+	/* Flush tlb. */
+	if (MACHINE_HAS_IDTE)
+		__tlb_flush_asce(gmap->mm, gmap->asce);
+	else
+		__tlb_flush_global();
+
+	/* Free all segment & region tables. */
+	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
+		__free_pages(page, 2);
+	gmap_radix_tree_free(&gmap->guest_to_host);
+	gmap_radix_tree_free(&gmap->host_to_guest);
+	down_write(&gmap->mm->mmap_sem);
+	list_del(&gmap->list);
+	up_write(&gmap->mm->mmap_sem);
+	kfree(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_enable - switch primary space to the guest address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_enable(struct gmap *gmap)
+{
+	S390_lowcore.gmap = (unsigned long) gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_enable);
+
+/**
+ * gmap_disable - switch back to the standard primary address space
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_disable(struct gmap *gmap)
+{
+	S390_lowcore.gmap = 0UL;
+}
+EXPORT_SYMBOL_GPL(gmap_disable);
+
+/*
+ * gmap_alloc_table is assumed to be called with mmap_sem held
+ */
+static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
+			    unsigned long init, unsigned long gaddr)
+{
+	struct page *page;
+	unsigned long *new;
+
+	/* since we dont free the gmap table until gmap_free we can unlock */
+	page = alloc_pages(GFP_KERNEL, 2);
+	if (!page)
+		return -ENOMEM;
+	new = (unsigned long *) page_to_phys(page);
+	crst_table_init(new, init);
+	spin_lock(&gmap->mm->page_table_lock);
+	if (*table & _REGION_ENTRY_INVALID) {
+		list_add(&page->lru, &gmap->crst_list);
+		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
+			(*table & _REGION_ENTRY_TYPE_MASK);
+		page->index = gaddr;
+		page = NULL;
+	}
+	spin_unlock(&gmap->mm->page_table_lock);
+	if (page)
+		__free_pages(page, 2);
+	return 0;
+}
+
+/**
+ * __gmap_segment_gaddr - find virtual address from segment pointer
+ * @entry: pointer to a segment table entry in the guest address space
+ *
+ * Returns the virtual address in the guest address space for the segment
+ */
+static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+{
+	struct page *page;
+	unsigned long offset, mask;
+
+	offset = (unsigned long) entry / sizeof(unsigned long);
+	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
+	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+	page = virt_to_page((void *)((unsigned long) entry & mask));
+	return page->index + offset;
+}
+
+/**
+ * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
+ * @gmap: pointer to the guest address space structure
+ * @vmaddr: address in the host process address space
+ *
+ * Returns 1 if a TLB flush is required
+ */
+static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
+{
+	unsigned long *entry;
+	int flush = 0;
+
+	spin_lock(&gmap->guest_table_lock);
+	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+	if (entry) {
+		flush = (*entry != _SEGMENT_ENTRY_INVALID);
+		*entry = _SEGMENT_ENTRY_INVALID;
+	}
+	spin_unlock(&gmap->guest_table_lock);
+	return flush;
+}
+
+/**
+ * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
+ * @gmap: pointer to the guest address space structure
+ * @gaddr: address in the guest address space
+ *
+ * Returns 1 if a TLB flush is required
+ */
+static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
+{
+	unsigned long vmaddr;
+
+	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
+						   gaddr >> PMD_SHIFT);
+	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
+}
+
+/**
+ * gmap_unmap_segment - unmap segment from the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @to: address in the guest address space
+ * @len: length of the memory area to unmap
+ *
+ * Returns 0 if the unmap succeeded, -EINVAL if not.
+ */
+int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
+{
+	unsigned long off;
+	int flush;
+
+	if ((to | len) & (PMD_SIZE - 1))
+		return -EINVAL;
+	if (len == 0 || to + len < to)
+		return -EINVAL;
+
+	flush = 0;
+	down_write(&gmap->mm->mmap_sem);
+	for (off = 0; off < len; off += PMD_SIZE)
+		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
+	up_write(&gmap->mm->mmap_sem);
+	if (flush)
+		gmap_flush_tlb(gmap);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_unmap_segment);
+
+/**
+ * gmap_map_segment - map a segment to the guest address space
+ * @gmap: pointer to the guest address space structure
+ * @from: source address in the parent address space
+ * @to: target address in the guest address space
+ * @len: length of the memory area to map
+ *
+ * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
+ */
+int gmap_map_segment(struct gmap *gmap, unsigned long from,
+		     unsigned long to, unsigned long len)
+{
+	unsigned long off;
+	int flush;
+
+	if ((from | to | len) & (PMD_SIZE - 1))
+		return -EINVAL;
+	if (len == 0 || from + len < from || to + len < to ||
+	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
+		return -EINVAL;
+
+	flush = 0;
+	down_write(&gmap->mm->mmap_sem);
+	for (off = 0; off < len; off += PMD_SIZE) {
+		/* Remove old translation */
+		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
+		/* Store new translation */
+		if (radix_tree_insert(&gmap->guest_to_host,
+				      (to + off) >> PMD_SHIFT,
+				      (void *) from + off))
+			break;
+	}
+	up_write(&gmap->mm->mmap_sem);
+	if (flush)
+		gmap_flush_tlb(gmap);
+	if (off >= len)
+		return 0;
+	gmap_unmap_segment(gmap, to, len);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(gmap_map_segment);
+
+/**
+ * __gmap_translate - translate a guest address to a user space address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ *
+ * Returns user space address which corresponds to the guest address or
+ * -EFAULT if no such mapping exists.
+ * This function does not establish potentially missing page table entries.
+ * The mmap_sem of the mm that belongs to the address space must be held
+ * when this function gets called.
+ */
+unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
+{
+	unsigned long vmaddr;
+
+	vmaddr = (unsigned long)
+		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
+}
+EXPORT_SYMBOL_GPL(__gmap_translate);
+
+/**
+ * gmap_translate - translate a guest address to a user space address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ *
+ * Returns user space address which corresponds to the guest address or
+ * -EFAULT if no such mapping exists.
+ * This function does not establish potentially missing page table entries.
+ */
+unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
+{
+	unsigned long rc;
+
+	down_read(&gmap->mm->mmap_sem);
+	rc = __gmap_translate(gmap, gaddr);
+	up_read(&gmap->mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_translate);
+
+/**
+ * gmap_unlink - disconnect a page table from the gmap shadow tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @table: pointer to the host page table
+ * @vmaddr: vm address associated with the host page table
+ */
+void gmap_unlink(struct mm_struct *mm, unsigned long *table,
+		 unsigned long vmaddr)
+{
+	struct gmap *gmap;
+	int flush;
+
+	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
+		if (flush)
+			gmap_flush_tlb(gmap);
+	}
+}
+
+/**
+ * gmap_link - set up shadow page tables to connect a host to a guest address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ * @vmaddr: vm address
+ *
+ * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
+ * if the vm address is already mapped to a different guest segment.
+ * The mmap_sem of the mm that belongs to the address space must be held
+ * when this function gets called.
+ */
+int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	unsigned long *table;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	int rc;
+
+	/* Create higher level tables in the gmap page table */
+	table = gmap->table;
+	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
+		table += (gaddr >> 53) & 0x7ff;
+		if ((*table & _REGION_ENTRY_INVALID) &&
+		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
+				     gaddr & 0xffe0000000000000UL))
+			return -ENOMEM;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+	}
+	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
+		table += (gaddr >> 42) & 0x7ff;
+		if ((*table & _REGION_ENTRY_INVALID) &&
+		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
+				     gaddr & 0xfffffc0000000000UL))
+			return -ENOMEM;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+	}
+	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
+		table += (gaddr >> 31) & 0x7ff;
+		if ((*table & _REGION_ENTRY_INVALID) &&
+		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
+				     gaddr & 0xffffffff80000000UL))
+			return -ENOMEM;
+		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+	}
+	table += (gaddr >> 20) & 0x7ff;
+	/* Walk the parent mm page table */
+	mm = gmap->mm;
+	pgd = pgd_offset(mm, vmaddr);
+	VM_BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd, vmaddr);
+	VM_BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, vmaddr);
+	VM_BUG_ON(pmd_none(*pmd));
+	/* large pmds cannot yet be handled */
+	if (pmd_large(*pmd))
+		return -EFAULT;
+	/* Link gmap segment table entry location to page table. */
+	rc = radix_tree_preload(GFP_KERNEL);
+	if (rc)
+		return rc;
+	ptl = pmd_lock(mm, pmd);
+	spin_lock(&gmap->guest_table_lock);
+	if (*table == _SEGMENT_ENTRY_INVALID) {
+		rc = radix_tree_insert(&gmap->host_to_guest,
+				       vmaddr >> PMD_SHIFT, table);
+		if (!rc)
+			*table = pmd_val(*pmd);
+	} else
+		rc = 0;
+	spin_unlock(&gmap->guest_table_lock);
+	spin_unlock(ptl);
+	radix_tree_preload_end();
+	return rc;
+}
+
+/**
+ * gmap_fault - resolve a fault on a guest address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ * @fault_flags: flags to pass down to handle_mm_fault()
+ *
+ * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
+ * if the vm address is already mapped to a different guest segment.
+ */
+int gmap_fault(struct gmap *gmap, unsigned long gaddr,
+	       unsigned int fault_flags)
+{
+	unsigned long vmaddr;
+	int rc;
+	bool unlocked;
+
+	down_read(&gmap->mm->mmap_sem);
+
+retry:
+	unlocked = false;
+	vmaddr = __gmap_translate(gmap, gaddr);
+	if (IS_ERR_VALUE(vmaddr)) {
+		rc = vmaddr;
+		goto out_up;
+	}
+	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+			     &unlocked)) {
+		rc = -EFAULT;
+		goto out_up;
+	}
+	/*
+	 * In the case that fixup_user_fault unlocked the mmap_sem during
+	 * faultin redo __gmap_translate to not race with a map/unmap_segment.
+	 */
+	if (unlocked)
+		goto retry;
+
+	rc = __gmap_link(gmap, gaddr, vmaddr);
+out_up:
+	up_read(&gmap->mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_fault);
+
+/*
+ * this function is assumed to be called with mmap_sem held
+ */
+void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
+{
+	unsigned long vmaddr;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	/* Find the vm address for the guest address */
+	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
+						   gaddr >> PMD_SHIFT);
+	if (vmaddr) {
+		vmaddr |= gaddr & ~PMD_MASK;
+		/* Get pointer to the page table entry */
+		ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
+		if (likely(ptep))
+			ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
+		pte_unmap_unlock(ptep, ptl);
+	}
+}
+EXPORT_SYMBOL_GPL(__gmap_zap);
+
+void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
+{
+	unsigned long gaddr, vmaddr, size;
+	struct vm_area_struct *vma;
+
+	down_read(&gmap->mm->mmap_sem);
+	for (gaddr = from; gaddr < to;
+	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
+		/* Find the vm address for the guest address */
+		vmaddr = (unsigned long)
+			radix_tree_lookup(&gmap->guest_to_host,
+					  gaddr >> PMD_SHIFT);
+		if (!vmaddr)
+			continue;
+		vmaddr |= gaddr & ~PMD_MASK;
+		/* Find vma in the parent mm */
+		vma = find_vma(gmap->mm, vmaddr);
+		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
+		zap_page_range(vma, vmaddr, size, NULL);
+	}
+	up_read(&gmap->mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(gmap_discard);
+
+static LIST_HEAD(gmap_notifier_list);
+static DEFINE_SPINLOCK(gmap_notifier_lock);
+
+/**
+ * gmap_register_ipte_notifier - register a pte invalidation callback
+ * @nb: pointer to the gmap notifier block
+ */
+void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+{
+	spin_lock(&gmap_notifier_lock);
+	list_add(&nb->list, &gmap_notifier_list);
+	spin_unlock(&gmap_notifier_lock);
+}
+EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+
+/**
+ * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * @nb: pointer to the gmap notifier block
+ */
+void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+{
+	spin_lock(&gmap_notifier_lock);
+	list_del_init(&nb->list);
+	spin_unlock(&gmap_notifier_lock);
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
+
+/**
+ * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists and
+ * the invalidation notification could be set. If the gmap mapping is missing
+ * for one or more pages -EFAULT is returned. If no memory could be allocated
+ * -ENOMEM is returned. This function establishes missing page table entries.
+ */
+int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+{
+	unsigned long addr;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	bool unlocked;
+	int rc = 0;
+
+	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+		return -EINVAL;
+	down_read(&gmap->mm->mmap_sem);
+	while (len) {
+		unlocked = false;
+		/* Convert gmap address and connect the page tables */
+		addr = __gmap_translate(gmap, gaddr);
+		if (IS_ERR_VALUE(addr)) {
+			rc = addr;
+			break;
+		}
+		/* Get the page mapped */
+		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
+				     &unlocked)) {
+			rc = -EFAULT;
+			break;
+		}
+		/* While trying to map mmap_sem got unlocked. Let us retry */
+		if (unlocked)
+			continue;
+		rc = __gmap_link(gmap, gaddr, addr);
+		if (rc)
+			break;
+		/* Walk the process page table, lock and get pte pointer */
+		ptep = get_locked_pte(gmap->mm, addr, &ptl);
+		VM_BUG_ON(!ptep);
+		/* Set notification bit in the pgste of the pte */
+		if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
+			ptep_set_notify(gmap->mm, addr, ptep);
+			gaddr += PAGE_SIZE;
+			len -= PAGE_SIZE;
+		}
+		pte_unmap_unlock(ptep, ptl);
+	}
+	up_read(&gmap->mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+
+/**
+ * ptep_notify - call all invalidation callbacks for a specific pte.
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the process address space
+ * @pte: pointer to the page table entry
+ *
+ * This function is assumed to be called with the page table lock held
+ * for the pte to notify.
+ */
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+{
+	unsigned long offset, gaddr;
+	unsigned long *table;
+	struct gmap_notifier *nb;
+	struct gmap *gmap;
+
+	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
+	offset = offset * (4096 / sizeof(pte_t));
+	spin_lock(&gmap_notifier_lock);
+	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+		table = radix_tree_lookup(&gmap->host_to_guest,
+					  vmaddr >> PMD_SHIFT);
+		if (!table)
+			continue;
+		gaddr = __gmap_segment_gaddr(table) + offset;
+		list_for_each_entry(nb, &gmap_notifier_list, list)
+			nb->notifier_call(gmap, gaddr);
+	}
+	spin_unlock(&gmap_notifier_lock);
+}
+EXPORT_SYMBOL_GPL(ptep_notify);
+
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	struct vm_area_struct *vma;
+	unsigned long addr;
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		for (addr = vma->vm_start;
+		     addr < vma->vm_end;
+		     addr += PAGE_SIZE)
+			follow_page(vma, addr, FOLL_SPLIT);
+		vma->vm_flags &= ~VM_HUGEPAGE;
+		vma->vm_flags |= VM_NOHUGEPAGE;
+	}
+	mm->def_flags |= VM_NOHUGEPAGE;
+#endif
+}
+
+/*
+ * switch on pgstes for its userspace process (for kvm)
+ */
+int s390_enable_sie(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	/* Do we have pgstes? if yes, we are done */
+	if (mm_has_pgste(mm))
+		return 0;
+	/* Fail if the page tables are 2K */
+	if (!mm_alloc_pgste(mm))
+		return -EINVAL;
+	down_write(&mm->mmap_sem);
+	mm->context.has_pgste = 1;
+	/* split thp mappings and disable thp for future mappings */
+	thp_split_mm(mm);
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(s390_enable_sie);
+
+/*
+ * Enable storage key handling from now on and initialize the storage
+ * keys with the default key.
+ */
+static int __s390_enable_skey(pte_t *pte, unsigned long addr,
+			      unsigned long next, struct mm_walk *walk)
+{
+	/*
+	 * Remove all zero page mappings,
+	 * after establishing a policy to forbid zero page mappings
+	 * following faults for that page will get fresh anonymous pages
+	 */
+	if (is_zero_pfn(pte_pfn(*pte)))
+		ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
+	/* Clear storage key */
+	ptep_zap_key(walk->mm, addr, pte);
+	return 0;
+}
+
+int s390_enable_skey(void)
+{
+	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc = 0;
+
+	down_write(&mm->mmap_sem);
+	if (mm_use_skey(mm))
+		goto out_up;
+
+	mm->context.use_skey = 1;
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				MADV_UNMERGEABLE, &vma->vm_flags)) {
+			mm->context.use_skey = 0;
+			rc = -ENOMEM;
+			goto out_up;
+		}
+	}
+	mm->def_flags &= ~VM_MERGEABLE;
+
+	walk.mm = mm;
+	walk_page_range(0, TASK_SIZE, &walk);
+
+out_up:
+	up_write(&mm->mmap_sem);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(s390_enable_skey);
+
+/*
+ * Reset CMMA state, make all pages stable again.
+ */
+static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
+			     unsigned long next, struct mm_walk *walk)
+{
+	ptep_zap_unused(walk->mm, addr, pte, 1);
+	return 0;
+}
+
+void s390_reset_cmma(struct mm_struct *mm)
+{
+	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
+
+	down_write(&mm->mmap_sem);
+	walk.mm = mm;
+	walk_page_range(0, TASK_SIZE, &walk);
+	up_write(&mm->mmap_sem);
+}
+EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
new file mode 100644
index 000000000000..f6c3de26cda8
--- /dev/null
+++ b/arch/s390/mm/pgalloc.c
@@ -0,0 +1,360 @@
+/*
+ *  Page table allocation functions
+ *
+ *    Copyright IBM Corp. 2016
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_PGSTE
+
+static int page_table_allocate_pgste_min = 0;
+static int page_table_allocate_pgste_max = 1;
+int page_table_allocate_pgste = 0;
+EXPORT_SYMBOL(page_table_allocate_pgste);
+
+static struct ctl_table page_table_sysctl[] = {
+	{
+		.procname	= "allocate_pgste",
+		.data		= &page_table_allocate_pgste,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO | S_IWUSR,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &page_table_allocate_pgste_min,
+		.extra2		= &page_table_allocate_pgste_max,
+	},
+	{ }
+};
+
+static struct ctl_table page_table_sysctl_dir[] = {
+	{
+		.procname	= "vm",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= page_table_sysctl,
+	},
+	{ }
+};
+
+static int __init page_table_register_sysctl(void)
+{
+	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
+}
+__initcall(page_table_register_sysctl);
+
+#endif /* CONFIG_PGSTE */
+
+unsigned long *crst_table_alloc(struct mm_struct *mm)
+{
+	struct page *page = alloc_pages(GFP_KERNEL, 2);
+
+	if (!page)
+		return NULL;
+	return (unsigned long *) page_to_phys(page);
+}
+
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	free_pages((unsigned long) table, 2);
+}
+
+static void __crst_table_upgrade(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		clear_user_asce();
+		set_user_asce(mm);
+	}
+	__tlb_flush_local();
+}
+
+int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
+{
+	unsigned long *table, *pgd;
+	unsigned long entry;
+	int flush;
+
+	BUG_ON(limit > TASK_MAX_SIZE);
+	flush = 0;
+repeat:
+	table = crst_table_alloc(mm);
+	if (!table)
+		return -ENOMEM;
+	spin_lock_bh(&mm->page_table_lock);
+	if (mm->context.asce_limit < limit) {
+		pgd = (unsigned long *) mm->pgd;
+		if (mm->context.asce_limit <= (1UL << 31)) {
+			entry = _REGION3_ENTRY_EMPTY;
+			mm->context.asce_limit = 1UL << 42;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION3;
+		} else {
+			entry = _REGION2_ENTRY_EMPTY;
+			mm->context.asce_limit = 1UL << 53;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION2;
+		}
+		crst_table_init(table, entry);
+		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
+		mm->pgd = (pgd_t *) table;
+		mm->task_size = mm->context.asce_limit;
+		table = NULL;
+		flush = 1;
+	}
+	spin_unlock_bh(&mm->page_table_lock);
+	if (table)
+		crst_table_free(mm, table);
+	if (mm->context.asce_limit < limit)
+		goto repeat;
+	if (flush)
+		on_each_cpu(__crst_table_upgrade, mm, 0);
+	return 0;
+}
+
+void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
+{
+	pgd_t *pgd;
+
+	if (current->active_mm == mm) {
+		clear_user_asce();
+		__tlb_flush_mm(mm);
+	}
+	while (mm->context.asce_limit > limit) {
+		pgd = mm->pgd;
+		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
+		case _REGION_ENTRY_TYPE_R2:
+			mm->context.asce_limit = 1UL << 42;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION3;
+			break;
+		case _REGION_ENTRY_TYPE_R3:
+			mm->context.asce_limit = 1UL << 31;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_SEGMENT;
+			break;
+		default:
+			BUG();
+		}
+		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
+		mm->task_size = mm->context.asce_limit;
+		crst_table_free(mm, (unsigned long *) pgd);
+	}
+	if (current->active_mm == mm)
+		set_user_asce(mm);
+}
+
+static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
+{
+	unsigned int old, new;
+
+	do {
+		old = atomic_read(v);
+		new = old ^ bits;
+	} while (atomic_cmpxchg(v, old, new) != old);
+	return new;
+}
+
+/*
+ * page table entry allocation/free routines.
+ */
+unsigned long *page_table_alloc(struct mm_struct *mm)
+{
+	unsigned long *table;
+	struct page *page;
+	unsigned int mask, bit;
+
+	/* Try to get a fragment of a 4K page as a 2K page table */
+	if (!mm_alloc_pgste(mm)) {
+		table = NULL;
+		spin_lock_bh(&mm->context.list_lock);
+		if (!list_empty(&mm->context.pgtable_list)) {
+			page = list_first_entry(&mm->context.pgtable_list,
+						struct page, lru);
+			mask = atomic_read(&page->_mapcount);
+			mask = (mask | (mask >> 4)) & 3;
+			if (mask != 3) {
+				table = (unsigned long *) page_to_phys(page);
+				bit = mask & 1;		/* =1 -> second 2K */
+				if (bit)
+					table += PTRS_PER_PTE;
+				atomic_xor_bits(&page->_mapcount, 1U << bit);
+				list_del(&page->lru);
+			}
+		}
+		spin_unlock_bh(&mm->context.list_lock);
+		if (table)
+			return table;
+	}
+	/* Allocate a fresh page */
+	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+	if (!page)
+		return NULL;
+	if (!pgtable_page_ctor(page)) {
+		__free_page(page);
+		return NULL;
+	}
+	/* Initialize page table */
+	table = (unsigned long *) page_to_phys(page);
+	if (mm_alloc_pgste(mm)) {
+		/* Return 4K page table with PGSTEs */
+		atomic_set(&page->_mapcount, 3);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+	} else {
+		/* Return the first 2K fragment of the page */
+		atomic_set(&page->_mapcount, 1);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
+		spin_lock_bh(&mm->context.list_lock);
+		list_add(&page->lru, &mm->context.pgtable_list);
+		spin_unlock_bh(&mm->context.list_lock);
+	}
+	return table;
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page;
+	unsigned int bit, mask;
+
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (!mm_alloc_pgste(mm)) {
+		/* Free 2K page table fragment of a 4K page */
+		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
+		spin_lock_bh(&mm->context.list_lock);
+		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
+		if (mask & 3)
+			list_add(&page->lru, &mm->context.pgtable_list);
+		else
+			list_del(&page->lru);
+		spin_unlock_bh(&mm->context.list_lock);
+		if (mask != 0)
+			return;
+	}
+
+	pgtable_page_dtor(page);
+	atomic_set(&page->_mapcount, -1);
+	__free_page(page);
+}
+
+void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+			 unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	struct page *page;
+	unsigned int bit, mask;
+
+	mm = tlb->mm;
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (mm_alloc_pgste(mm)) {
+		gmap_unlink(mm, table, vmaddr);
+		table = (unsigned long *) (__pa(table) | 3);
+		tlb_remove_table(tlb, table);
+		return;
+	}
+	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
+	spin_lock_bh(&mm->context.list_lock);
+	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
+	if (mask & 3)
+		list_add_tail(&page->lru, &mm->context.pgtable_list);
+	else
+		list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	table = (unsigned long *) (__pa(table) | (1U << bit));
+	tlb_remove_table(tlb, table);
+}
+
+static void __tlb_remove_table(void *_table)
+{
+	unsigned int mask = (unsigned long) _table & 3;
+	void *table = (void *)((unsigned long) _table ^ mask);
+	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+
+	switch (mask) {
+	case 0:		/* pmd or pud */
+		free_pages((unsigned long) table, 2);
+		break;
+	case 1:		/* lower 2K of a 4K page table */
+	case 2:		/* higher 2K of a 4K page table */
+		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
+			break;
+		/* fallthrough */
+	case 3:		/* 4K page table with pgstes */
+		pgtable_page_dtor(page);
+		atomic_set(&page->_mapcount, -1);
+		__free_page(page);
+		break;
+	}
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+	/* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+	/*
+	 * This isn't an RCU grace period and hence the page-tables cannot be
+	 * assumed to be actually RCU-freed.
+	 *
+	 * It is however sufficient for software page-table walkers that rely
+	 * on IRQ disabling. See the comment near struct mmu_table_batch.
+	 */
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	__tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+	struct mmu_table_batch *batch;
+	int i;
+
+	batch = container_of(head, struct mmu_table_batch, rcu);
+
+	for (i = 0; i < batch->nr; i++)
+		__tlb_remove_table(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+	struct mmu_table_batch **batch = &tlb->batch;
+
+	if (*batch) {
+		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+		*batch = NULL;
+	}
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+	struct mmu_table_batch **batch = &tlb->batch;
+
+	tlb->mm->context.flush_mm = 1;
+	if (*batch == NULL) {
+		*batch = (struct mmu_table_batch *)
+			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+		if (*batch == NULL) {
+			__tlb_flush_mm_lazy(tlb->mm);
+			tlb_remove_table_one(table);
+			return;
+		}
+		(*batch)->nr = 0;
+	}
+	(*batch)->tables[(*batch)->nr++] = table;
+	if ((*batch)->nr == MAX_TABLE_BATCH)
+		tlb_flush_mmu(tlb);
+}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index e24126208614..4324b87f9398 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -24,1140 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
-unsigned long *crst_table_alloc(struct mm_struct *mm)
-{
-	struct page *page = alloc_pages(GFP_KERNEL, 2);
-
-	if (!page)
-		return NULL;
-	return (unsigned long *) page_to_phys(page);
-}
-
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	free_pages((unsigned long) table, 2);
-}
-
-static void __crst_table_upgrade(void *arg)
-{
-	struct mm_struct *mm = arg;
-
-	if (current->active_mm == mm) {
-		clear_user_asce();
-		set_user_asce(mm);
-	}
-	__tlb_flush_local();
-}
-
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
-{
-	unsigned long *table, *pgd;
-	unsigned long entry;
-	int flush;
-
-	BUG_ON(limit > TASK_MAX_SIZE);
-	flush = 0;
-repeat:
-	table = crst_table_alloc(mm);
-	if (!table)
-		return -ENOMEM;
-	spin_lock_bh(&mm->page_table_lock);
-	if (mm->context.asce_limit < limit) {
-		pgd = (unsigned long *) mm->pgd;
-		if (mm->context.asce_limit <= (1UL << 31)) {
-			entry = _REGION3_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-		} else {
-			entry = _REGION2_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 53;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION2;
-		}
-		crst_table_init(table, entry);
-		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
-		mm->pgd = (pgd_t *) table;
-		mm->task_size = mm->context.asce_limit;
-		table = NULL;
-		flush = 1;
-	}
-	spin_unlock_bh(&mm->page_table_lock);
-	if (table)
-		crst_table_free(mm, table);
-	if (mm->context.asce_limit < limit)
-		goto repeat;
-	if (flush)
-		on_each_cpu(__crst_table_upgrade, mm, 0);
-	return 0;
-}
-
-void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
-{
-	pgd_t *pgd;
-
-	if (current->active_mm == mm) {
-		clear_user_asce();
-		__tlb_flush_mm(mm);
-	}
-	while (mm->context.asce_limit > limit) {
-		pgd = mm->pgd;
-		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
-		case _REGION_ENTRY_TYPE_R2:
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-			break;
-		case _REGION_ENTRY_TYPE_R3:
-			mm->context.asce_limit = 1UL << 31;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_SEGMENT;
-			break;
-		default:
-			BUG();
-		}
-		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
-		mm->task_size = mm->context.asce_limit;
-		crst_table_free(mm, (unsigned long *) pgd);
-	}
-	if (current->active_mm == mm)
-		set_user_asce(mm);
-}
-
-#ifdef CONFIG_PGSTE
-
-/**
- * gmap_alloc - allocate a guest address space
- * @mm: pointer to the parent mm_struct
- * @limit: maximum address of the gmap address space
- *
- * Returns a guest address space structure.
- */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
-{
-	struct gmap *gmap;
-	struct page *page;
-	unsigned long *table;
-	unsigned long etype, atype;
-
-	if (limit < (1UL << 31)) {
-		limit = (1UL << 31) - 1;
-		atype = _ASCE_TYPE_SEGMENT;
-		etype = _SEGMENT_ENTRY_EMPTY;
-	} else if (limit < (1UL << 42)) {
-		limit = (1UL << 42) - 1;
-		atype = _ASCE_TYPE_REGION3;
-		etype = _REGION3_ENTRY_EMPTY;
-	} else if (limit < (1UL << 53)) {
-		limit = (1UL << 53) - 1;
-		atype = _ASCE_TYPE_REGION2;
-		etype = _REGION2_ENTRY_EMPTY;
-	} else {
-		limit = -1UL;
-		atype = _ASCE_TYPE_REGION1;
-		etype = _REGION1_ENTRY_EMPTY;
-	}
-	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
-	if (!gmap)
-		goto out;
-	INIT_LIST_HEAD(&gmap->crst_list);
-	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
-	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
-	spin_lock_init(&gmap->guest_table_lock);
-	gmap->mm = mm;
-	page = alloc_pages(GFP_KERNEL, 2);
-	if (!page)
-		goto out_free;
-	page->index = 0;
-	list_add(&page->lru, &gmap->crst_list);
-	table = (unsigned long *) page_to_phys(page);
-	crst_table_init(table, etype);
-	gmap->table = table;
-	gmap->asce = atype | _ASCE_TABLE_LENGTH |
-		_ASCE_USER_BITS | __pa(table);
-	gmap->asce_end = limit;
-	down_write(&mm->mmap_sem);
-	list_add(&gmap->list, &mm->context.gmap_list);
-	up_write(&mm->mmap_sem);
-	return gmap;
-
-out_free:
-	kfree(gmap);
-out:
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(gmap_alloc);
-
-static void gmap_flush_tlb(struct gmap *gmap)
-{
-	if (MACHINE_HAS_IDTE)
-		__tlb_flush_asce(gmap->mm, gmap->asce);
-	else
-		__tlb_flush_global();
-}
-
-static void gmap_radix_tree_free(struct radix_tree_root *root)
-{
-	struct radix_tree_iter iter;
-	unsigned long indices[16];
-	unsigned long index;
-	void **slot;
-	int i, nr;
-
-	/* A radix tree is freed by deleting all of its entries */
-	index = 0;
-	do {
-		nr = 0;
-		radix_tree_for_each_slot(slot, root, &iter, index) {
-			indices[nr] = iter.index;
-			if (++nr == 16)
-				break;
-		}
-		for (i = 0; i < nr; i++) {
-			index = indices[i];
-			radix_tree_delete(root, index);
-		}
-	} while (nr > 0);
-}
-
-/**
- * gmap_free - free a guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_free(struct gmap *gmap)
-{
-	struct page *page, *next;
-
-	/* Flush tlb. */
-	if (MACHINE_HAS_IDTE)
-		__tlb_flush_asce(gmap->mm, gmap->asce);
-	else
-		__tlb_flush_global();
-
-	/* Free all segment & region tables. */
-	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
-		__free_pages(page, 2);
-	gmap_radix_tree_free(&gmap->guest_to_host);
-	gmap_radix_tree_free(&gmap->host_to_guest);
-	down_write(&gmap->mm->mmap_sem);
-	list_del(&gmap->list);
-	up_write(&gmap->mm->mmap_sem);
-	kfree(gmap);
-}
-EXPORT_SYMBOL_GPL(gmap_free);
-
-/**
- * gmap_enable - switch primary space to the guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_enable(struct gmap *gmap)
-{
-	S390_lowcore.gmap = (unsigned long) gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_enable);
-
-/**
- * gmap_disable - switch back to the standard primary address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_disable(struct gmap *gmap)
-{
-	S390_lowcore.gmap = 0UL;
-}
-EXPORT_SYMBOL_GPL(gmap_disable);
-
-/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
- */
-static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
-			    unsigned long init, unsigned long gaddr)
-{
-	struct page *page;
-	unsigned long *new;
-
-	/* since we dont free the gmap table until gmap_free we can unlock */
-	page = alloc_pages(GFP_KERNEL, 2);
-	if (!page)
-		return -ENOMEM;
-	new = (unsigned long *) page_to_phys(page);
-	crst_table_init(new, init);
-	spin_lock(&gmap->mm->page_table_lock);
-	if (*table & _REGION_ENTRY_INVALID) {
-		list_add(&page->lru, &gmap->crst_list);
-		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
-			(*table & _REGION_ENTRY_TYPE_MASK);
-		page->index = gaddr;
-		page = NULL;
-	}
-	spin_unlock(&gmap->mm->page_table_lock);
-	if (page)
-		__free_pages(page, 2);
-	return 0;
-}
-
-/**
- * __gmap_segment_gaddr - find virtual address from segment pointer
- * @entry: pointer to a segment table entry in the guest address space
- *
- * Returns the virtual address in the guest address space for the segment
- */
-static unsigned long __gmap_segment_gaddr(unsigned long *entry)
-{
-	struct page *page;
-	unsigned long offset, mask;
-
-	offset = (unsigned long) entry / sizeof(unsigned long);
-	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
-	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
-	page = virt_to_page((void *)((unsigned long) entry & mask));
-	return page->index + offset;
-}
-
-/**
- * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
- * @gmap: pointer to the guest address space structure
- * @vmaddr: address in the host process address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
-{
-	unsigned long *entry;
-	int flush = 0;
-
-	spin_lock(&gmap->guest_table_lock);
-	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
-	if (entry) {
-		flush = (*entry != _SEGMENT_ENTRY_INVALID);
-		*entry = _SEGMENT_ENTRY_INVALID;
-	}
-	spin_unlock(&gmap->guest_table_lock);
-	return flush;
-}
-
-/**
- * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
- * @gmap: pointer to the guest address space structure
- * @gaddr: address in the guest address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr;
-
-	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
-}
-
-/**
- * gmap_unmap_segment - unmap segment from the guest address space
- * @gmap: pointer to the guest address space structure
- * @to: address in the guest address space
- * @len: length of the memory area to unmap
- *
- * Returns 0 if the unmap succeeded, -EINVAL if not.
- */
-int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
-{
-	unsigned long off;
-	int flush;
-
-	if ((to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || to + len < to)
-		return -EINVAL;
-
-	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
-	for (off = 0; off < len; off += PMD_SIZE)
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-	up_write(&gmap->mm->mmap_sem);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(gmap_unmap_segment);
-
-/**
- * gmap_mmap_segment - map a segment to the guest address space
- * @gmap: pointer to the guest address space structure
- * @from: source address in the parent address space
- * @to: target address in the guest address space
- * @len: length of the memory area to map
- *
- * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
- */
-int gmap_map_segment(struct gmap *gmap, unsigned long from,
-		     unsigned long to, unsigned long len)
-{
-	unsigned long off;
-	int flush;
-
-	if ((from | to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || from + len < from || to + len < to ||
-	    from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
-		return -EINVAL;
-
-	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
-	for (off = 0; off < len; off += PMD_SIZE) {
-		/* Remove old translation */
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-		/* Store new translation */
-		if (radix_tree_insert(&gmap->guest_to_host,
-				      (to + off) >> PMD_SHIFT,
-				      (void *) from + off))
-			break;
-	}
-	up_write(&gmap->mm->mmap_sem);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	if (off >= len)
-		return 0;
-	gmap_unmap_segment(gmap, to, len);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(gmap_map_segment);
-
-/**
- * __gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
- * when this function gets called.
- */
-unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr;
-
-	vmaddr = (unsigned long)
-		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
-	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
-}
-EXPORT_SYMBOL_GPL(__gmap_translate);
-
-/**
- * gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- */
-unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long rc;
-
-	down_read(&gmap->mm->mmap_sem);
-	rc = __gmap_translate(gmap, gaddr);
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_translate);
-
-/**
- * gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
- * @table: pointer to the host page table
- * @vmaddr: vm address associated with the host page table
- */
-static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
-			unsigned long vmaddr)
-{
-	struct gmap *gmap;
-	int flush;
-
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
-		if (flush)
-			gmap_flush_tlb(gmap);
-	}
-}
-
-/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @vmaddr: vm address
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
- * when this function gets called.
- */
-int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	unsigned long *table;
-	spinlock_t *ptl;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	int rc;
-
-	/* Create higher level tables in the gmap page table */
-	table = gmap->table;
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
-		table += (gaddr >> 53) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
-				     gaddr & 0xffe0000000000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
-		table += (gaddr >> 42) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
-				     gaddr & 0xfffffc0000000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
-		table += (gaddr >> 31) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
-				     gaddr & 0xffffffff80000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	table += (gaddr >> 20) & 0x7ff;
-	/* Walk the parent mm page table */
-	mm = gmap->mm;
-	pgd = pgd_offset(mm, vmaddr);
-	VM_BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd, vmaddr);
-	VM_BUG_ON(pud_none(*pud));
-	pmd = pmd_offset(pud, vmaddr);
-	VM_BUG_ON(pmd_none(*pmd));
-	/* large pmds cannot yet be handled */
-	if (pmd_large(*pmd))
-		return -EFAULT;
-	/* Link gmap segment table entry location to page table. */
-	rc = radix_tree_preload(GFP_KERNEL);
-	if (rc)
-		return rc;
-	ptl = pmd_lock(mm, pmd);
-	spin_lock(&gmap->guest_table_lock);
-	if (*table == _SEGMENT_ENTRY_INVALID) {
-		rc = radix_tree_insert(&gmap->host_to_guest,
-				       vmaddr >> PMD_SHIFT, table);
-		if (!rc)
-			*table = pmd_val(*pmd);
-	} else
-		rc = 0;
-	spin_unlock(&gmap->guest_table_lock);
-	spin_unlock(ptl);
-	radix_tree_preload_end();
-	return rc;
-}
-
-/**
- * gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- */
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
-	       unsigned int fault_flags)
-{
-	unsigned long vmaddr;
-	int rc;
-	bool unlocked;
-
-	down_read(&gmap->mm->mmap_sem);
-
-retry:
-	unlocked = false;
-	vmaddr = __gmap_translate(gmap, gaddr);
-	if (IS_ERR_VALUE(vmaddr)) {
-		rc = vmaddr;
-		goto out_up;
-	}
-	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
-			     &unlocked)) {
-		rc = -EFAULT;
-		goto out_up;
-	}
-	/*
-	 * In the case that fixup_user_fault unlocked the mmap_sem during
-	 * faultin redo __gmap_translate to not race with a map/unmap_segment.
-	 */
-	if (unlocked)
-		goto retry;
-
-	rc = __gmap_link(gmap, gaddr, vmaddr);
-out_up:
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_fault);
-
-static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
-{
-	if (!non_swap_entry(entry))
-		dec_mm_counter(mm, MM_SWAPENTS);
-	else if (is_migration_entry(entry)) {
-		struct page *page = migration_entry_to_page(entry);
-
-		dec_mm_counter(mm, mm_counter(page));
-	}
-	free_swap_and_cache(entry);
-}
-
-/*
- * this function is assumed to be called with mmap_sem held
- */
-void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr, ptev, pgstev;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-	pgste_t pgste;
-
-	/* Find the vm address for the guest address */
-	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	if (!vmaddr)
-		return;
-	vmaddr |= gaddr & ~PMD_MASK;
-	/* Get pointer to the page table entry */
-	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
-	if (unlikely(!ptep))
-		return;
-	pte = *ptep;
-	if (!pte_swap(pte))
-		goto out_pte;
-	/* Zap unused and logically-zero pages */
-	pgste = pgste_get_lock(ptep);
-	pgstev = pgste_val(pgste);
-	ptev = pte_val(pte);
-	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
-	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
-		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
-		pte_clear(gmap->mm, vmaddr, ptep);
-	}
-	pgste_set_unlock(ptep, pgste);
-out_pte:
-	pte_unmap_unlock(ptep, ptl);
-}
-EXPORT_SYMBOL_GPL(__gmap_zap);
-
-void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
-{
-	unsigned long gaddr, vmaddr, size;
-	struct vm_area_struct *vma;
-
-	down_read(&gmap->mm->mmap_sem);
-	for (gaddr = from; gaddr < to;
-	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
-		/* Find the vm address for the guest address */
-		vmaddr = (unsigned long)
-			radix_tree_lookup(&gmap->guest_to_host,
-					  gaddr >> PMD_SHIFT);
-		if (!vmaddr)
-			continue;
-		vmaddr |= gaddr & ~PMD_MASK;
-		/* Find vma in the parent mm */
-		vma = find_vma(gmap->mm, vmaddr);
-		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
-		zap_page_range(vma, vmaddr, size, NULL);
-	}
-	up_read(&gmap->mm->mmap_sem);
-}
-EXPORT_SYMBOL_GPL(gmap_discard);
-
-static LIST_HEAD(gmap_notifier_list);
-static DEFINE_SPINLOCK(gmap_notifier_lock);
-
-/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_add(&nb->list, &gmap_notifier_list);
-	spin_unlock(&gmap_notifier_lock);
-}
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
-
-/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_del_init(&nb->list);
-	spin_unlock(&gmap_notifier_lock);
-}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
-
-/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
- */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
-{
-	unsigned long addr;
-	spinlock_t *ptl;
-	pte_t *ptep, entry;
-	pgste_t pgste;
-	bool unlocked;
-	int rc = 0;
-
-	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
-		return -EINVAL;
-	down_read(&gmap->mm->mmap_sem);
-	while (len) {
-		unlocked = false;
-		/* Convert gmap address and connect the page tables */
-		addr = __gmap_translate(gmap, gaddr);
-		if (IS_ERR_VALUE(addr)) {
-			rc = addr;
-			break;
-		}
-		/* Get the page mapped */
-		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-				     &unlocked)) {
-			rc = -EFAULT;
-			break;
-		}
-		/* While trying to map mmap_sem got unlocked. Let us retry */
-		if (unlocked)
-			continue;
-		rc = __gmap_link(gmap, gaddr, addr);
-		if (rc)
-			break;
-		/* Walk the process page table, lock and get pte pointer */
-		ptep = get_locked_pte(gmap->mm, addr, &ptl);
-		VM_BUG_ON(!ptep);
-		/* Set notification bit in the pgste of the pte */
-		entry = *ptep;
-		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-			pgste = pgste_get_lock(ptep);
-			pgste_val(pgste) |= PGSTE_IN_BIT;
-			pgste_set_unlock(ptep, pgste);
-			gaddr += PAGE_SIZE;
-			len -= PAGE_SIZE;
-		}
-		pte_unmap_unlock(ptep, ptl);
-	}
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
-
-/**
- * ptep_ipte_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
- * @pte: pointer to the page table entry
- *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
- */
-void ptep_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
-{
-	unsigned long offset, gaddr;
-	unsigned long *table;
-	struct gmap_notifier *nb;
-	struct gmap *gmap;
-
-	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-	offset = offset * (4096 / sizeof(pte_t));
-	spin_lock(&gmap_notifier_lock);
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-		table = radix_tree_lookup(&gmap->host_to_guest,
-					  vmaddr >> PMD_SHIFT);
-		if (!table)
-			continue;
-		gaddr = __gmap_segment_gaddr(table) + offset;
-		list_for_each_entry(nb, &gmap_notifier_list, list)
-			nb->notifier_call(gmap, gaddr);
-	}
-	spin_unlock(&gmap_notifier_lock);
-}
-EXPORT_SYMBOL_GPL(ptep_ipte_notify);
-
-int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned long key, bool nq)
-{
-	spinlock_t *ptl;
-	pgste_t old, new;
-	pte_t *ptep;
-
-	down_read(&mm->mmap_sem);
-	ptep = get_locked_pte(mm, addr, &ptl);
-	if (unlikely(!ptep)) {
-		up_read(&mm->mmap_sem);
-		return -EFAULT;
-	}
-
-	new = old = pgste_get_lock(ptep);
-	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
-			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
-	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
-	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-		unsigned long address, bits, skey;
-
-		address = pte_val(*ptep) & PAGE_MASK;
-		skey = (unsigned long) page_get_storage_key(address);
-		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
-		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
-		/* Set storage key ACC and FP */
-		page_set_storage_key(address, skey, !nq);
-		/* Merge host changed & referenced into pgste  */
-		pgste_val(new) |= bits << 52;
-	}
-	/* changing the guest storage key is considered a change of the page */
-	if ((pgste_val(new) ^ pgste_val(old)) &
-	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
-		pgste_val(new) |= PGSTE_UC_BIT;
-
-	pgste_set_unlock(ptep, new);
-	pte_unmap_unlock(ptep, ptl);
-	up_read(&mm->mmap_sem);
-	return 0;
-}
-EXPORT_SYMBOL(set_guest_storage_key);
-
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
-{
-	spinlock_t *ptl;
-	pgste_t pgste;
-	pte_t *ptep;
-	uint64_t physaddr;
-	unsigned long key = 0;
-
-	down_read(&mm->mmap_sem);
-	ptep = get_locked_pte(mm, addr, &ptl);
-	if (unlikely(!ptep)) {
-		up_read(&mm->mmap_sem);
-		return -EFAULT;
-	}
-	pgste = pgste_get_lock(ptep);
-
-	if (pte_val(*ptep) & _PAGE_INVALID) {
-		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
-		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
-		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
-		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
-	} else {
-		physaddr = pte_val(*ptep) & PAGE_MASK;
-		key = page_get_storage_key(physaddr);
-
-		/* Reflect guest's logical view, not physical */
-		if (pgste_val(pgste) & PGSTE_GR_BIT)
-			key |= _PAGE_REFERENCED;
-		if (pgste_val(pgste) & PGSTE_GC_BIT)
-			key |= _PAGE_CHANGED;
-	}
-
-	pgste_set_unlock(ptep, pgste);
-	pte_unmap_unlock(ptep, ptl);
-	up_read(&mm->mmap_sem);
-	return key;
-}
-EXPORT_SYMBOL(get_guest_storage_key);
-
-static int page_table_allocate_pgste_min = 0;
-static int page_table_allocate_pgste_max = 1;
-int page_table_allocate_pgste = 0;
-EXPORT_SYMBOL(page_table_allocate_pgste);
-
-static struct ctl_table page_table_sysctl[] = {
-	{
-		.procname	= "allocate_pgste",
-		.data		= &page_table_allocate_pgste,
-		.maxlen		= sizeof(int),
-		.mode		= S_IRUGO | S_IWUSR,
-		.proc_handler	= proc_dointvec,
-		.extra1		= &page_table_allocate_pgste_min,
-		.extra2		= &page_table_allocate_pgste_max,
-	},
-	{ }
-};
-
-static struct ctl_table page_table_sysctl_dir[] = {
-	{
-		.procname	= "vm",
-		.maxlen		= 0,
-		.mode		= 0555,
-		.child		= page_table_sysctl,
-	},
-	{ }
-};
-
-static int __init page_table_register_sysctl(void)
-{
-	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
-}
-__initcall(page_table_register_sysctl);
-
-#else /* CONFIG_PGSTE */
-
-static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
-			unsigned long vmaddr)
-{
-}
-
-#endif /* CONFIG_PGSTE */
-
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
-	unsigned int old, new;
-
-	do {
-		old = atomic_read(v);
-		new = old ^ bits;
-	} while (atomic_cmpxchg(v, old, new) != old);
-	return new;
-}
-
-/*
- * page table entry allocation/free routines.
- */
-unsigned long *page_table_alloc(struct mm_struct *mm)
-{
-	unsigned long *table;
-	struct page *page;
-	unsigned int mask, bit;
-
-	/* Try to get a fragment of a 4K page as a 2K page table */
-	if (!mm_alloc_pgste(mm)) {
-		table = NULL;
-		spin_lock_bh(&mm->context.list_lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			page = list_first_entry(&mm->context.pgtable_list,
-						struct page, lru);
-			mask = atomic_read(&page->_mapcount);
-			mask = (mask | (mask >> 4)) & 3;
-			if (mask != 3) {
-				table = (unsigned long *) page_to_phys(page);
-				bit = mask & 1;		/* =1 -> second 2K */
-				if (bit)
-					table += PTRS_PER_PTE;
-				atomic_xor_bits(&page->_mapcount, 1U << bit);
-				list_del(&page->lru);
-			}
-		}
-		spin_unlock_bh(&mm->context.list_lock);
-		if (table)
-			return table;
-	}
-	/* Allocate a fresh page */
-	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!page)
-		return NULL;
-	if (!pgtable_page_ctor(page)) {
-		__free_page(page);
-		return NULL;
-	}
-	/* Initialize page table */
-	table = (unsigned long *) page_to_phys(page);
-	if (mm_alloc_pgste(mm)) {
-		/* Return 4K page table with PGSTEs */
-		atomic_set(&page->_mapcount, 3);
-		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
-		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
-	} else {
-		/* Return the first 2K fragment of the page */
-		atomic_set(&page->_mapcount, 1);
-		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-		spin_lock_bh(&mm->context.list_lock);
-		list_add(&page->lru, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.list_lock);
-	}
-	return table;
-}
-
-void page_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	struct page *page;
-	unsigned int bit, mask;
-
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (!mm_alloc_pgste(mm)) {
-		/* Free 2K page table fragment of a 4K page */
-		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-		spin_lock_bh(&mm->context.list_lock);
-		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
-		if (mask & 3)
-			list_add(&page->lru, &mm->context.pgtable_list);
-		else
-			list_del(&page->lru);
-		spin_unlock_bh(&mm->context.list_lock);
-		if (mask != 0)
-			return;
-	}
-
-	pgtable_page_dtor(page);
-	atomic_set(&page->_mapcount, -1);
-	__free_page(page);
-}
-
-void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
-			 unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	struct page *page;
-	unsigned int bit, mask;
-
-	mm = tlb->mm;
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (mm_alloc_pgste(mm)) {
-		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) (__pa(table) | 3);
-		tlb_remove_table(tlb, table);
-		return;
-	}
-	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-	spin_lock_bh(&mm->context.list_lock);
-	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
-	if (mask & 3)
-		list_add_tail(&page->lru, &mm->context.pgtable_list);
-	else
-		list_del(&page->lru);
-	spin_unlock_bh(&mm->context.list_lock);
-	table = (unsigned long *) (__pa(table) | (1U << bit));
-	tlb_remove_table(tlb, table);
-}
-
-static void __tlb_remove_table(void *_table)
-{
-	unsigned int mask = (unsigned long) _table & 3;
-	void *table = (void *)((unsigned long) _table ^ mask);
-	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-
-	switch (mask) {
-	case 0:		/* pmd or pud */
-		free_pages((unsigned long) table, 2);
-		break;
-	case 1:		/* lower 2K of a 4K page table */
-	case 2:		/* higher 2K of a 4K page table */
-		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
-			break;
-		/* fallthrough */
-	case 3:		/* 4K page table with pgstes */
-		pgtable_page_dtor(page);
-		atomic_set(&page->_mapcount, -1);
-		__free_page(page);
-		break;
-	}
-}
-
-static void tlb_remove_table_smp_sync(void *arg)
-{
-	/* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
-	/*
-	 * This isn't an RCU grace period and hence the page-tables cannot be
-	 * assumed to be actually RCU-freed.
-	 *
-	 * It is however sufficient for software page-table walkers that rely
-	 * on IRQ disabling. See the comment near struct mmu_table_batch.
-	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
-	__tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
-	struct mmu_table_batch *batch;
-	int i;
-
-	batch = container_of(head, struct mmu_table_batch, rcu);
-
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	if (*batch) {
-		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
-		*batch = NULL;
-	}
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	tlb->mm->context.flush_mm = 1;
-	if (*batch == NULL) {
-		*batch = (struct mmu_table_batch *)
-			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-		if (*batch == NULL) {
-			__tlb_flush_mm_lazy(tlb->mm);
-			tlb_remove_table_one(table);
-			return;
-		}
-		(*batch)->nr = 0;
-	}
-	(*batch)->tables[(*batch)->nr++] = table;
-	if ((*batch)->nr == MAX_TABLE_BATCH)
-		tlb_flush_mmu(tlb);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline void thp_split_vma(struct vm_area_struct *vma)
-{
-	unsigned long addr;
-
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
-		follow_page(vma, addr, FOLL_SPLIT);
-}
-
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-	struct vm_area_struct *vma;
-
-	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
-		thp_split_vma(vma);
-		vma->vm_flags &= ~VM_HUGEPAGE;
-		vma->vm_flags |= VM_NOHUGEPAGE;
-	}
-	mm->def_flags |= VM_NOHUGEPAGE;
-}
-#else
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
 static inline pte_t ptep_flush_direct(struct mm_struct *mm,
 				      unsigned long addr, pte_t *ptep)
 {
@@ -1198,6 +64,55 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
 	return old;
 }
 
+static inline pgste_t pgste_get_lock(pte_t *ptep)
+{
+	unsigned long new = 0;
+#ifdef CONFIG_PGSTE
+	unsigned long old;
+
+	preempt_disable();
+	asm(
+		"	lg	%0,%2\n"
+		"0:	lgr	%1,%0\n"
+		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
+		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
+		"	csg	%0,%1,%2\n"
+		"	jl	0b\n"
+		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
+		: "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
+#endif
+	return __pgste(new);
+}
+
+static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+	asm(
+		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
+		"	stg	%1,%0\n"
+		: "=Q" (ptep[PTRS_PER_PTE])
+		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
+		: "cc", "memory");
+	preempt_enable();
+#endif
+}
+
+static inline pgste_t pgste_get(pte_t *ptep)
+{
+	unsigned long pgste = 0;
+#ifdef CONFIG_PGSTE
+	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
+#endif
+	return __pgste(pgste);
+}
+
+static inline void pgste_set(pte_t *ptep, pgste_t pgste)
+{
+#ifdef CONFIG_PGSTE
+	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
+#endif
+}
+
 static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
 				       struct mm_struct *mm)
 {
@@ -1271,63 +186,12 @@ static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
 #ifdef CONFIG_PGSTE
 	if (pgste_val(pgste) & PGSTE_IN_BIT) {
 		pgste_val(pgste) &= ~PGSTE_IN_BIT;
-		ptep_ipte_notify(mm, addr, ptep);
+		ptep_notify(mm, addr, ptep);
 	}
 #endif
 	return pgste;
 }
 
-#ifdef CONFIG_PGSTE
-/*
- * Test and reset if a guest page is dirty
- */
-bool pgste_test_and_clear_dirty(struct mm_struct *mm, unsigned long addr)
-{
-	spinlock_t *ptl;
-	pgste_t pgste;
-	pte_t *ptep;
-	pte_t pte;
-	bool dirty;
-
-	ptep = get_locked_pte(mm, addr, &ptl);
-	if (unlikely(!ptep))
-		return false;
-
-	pgste = pgste_get_lock(ptep);
-	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
-	pgste_val(pgste) &= ~PGSTE_UC_BIT;
-	pte = *ptep;
-	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
-		__ptep_ipte(addr, ptep);
-		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
-			pte_val(pte) |= _PAGE_PROTECT;
-		else
-			pte_val(pte) |= _PAGE_INVALID;
-		*ptep = pte;
-	}
-	pgste_set_unlock(ptep, pgste);
-
-	spin_unlock(ptl);
-	return dirty;
-}
-EXPORT_SYMBOL_GPL(pgste_test_and_clear_dirty);
-
-void set_pte_pgste_at(struct mm_struct *mm, unsigned long addr,
-		      pte_t *ptep, pte_t entry)
-{
-	pgste_t pgste;
-
-	/* the mm_has_pgste() check is done in set_pte_at() */
-	pgste = pgste_get_lock(ptep);
-	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
-	pgste_set_key(ptep, pgste, entry, mm);
-	pgste = pgste_set_pte(ptep, pgste, entry);
-	pgste_set_unlock(ptep, pgste);
-}
-EXPORT_SYMBOL(set_pte_pgste_at);
-#endif
-
 static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
 				      unsigned long addr, pte_t *ptep)
 {
@@ -1486,112 +350,6 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(pmdp_xchg_lazy);
 
-/*
- * switch on pgstes for its userspace process (for kvm)
- */
-int s390_enable_sie(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	/* Do we have pgstes? if yes, we are done */
-	if (mm_has_pgste(mm))
-		return 0;
-	/* Fail if the page tables are 2K */
-	if (!mm_alloc_pgste(mm))
-		return -EINVAL;
-	down_write(&mm->mmap_sem);
-	mm->context.has_pgste = 1;
-	/* split thp mappings and disable thp for future mappings */
-	thp_split_mm(mm);
-	up_write(&mm->mmap_sem);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_enable_sie);
-
-/*
- * Enable storage key handling from now on and initialize the storage
- * keys with the default key.
- */
-static int __s390_enable_skey(pte_t *pte, unsigned long addr,
-			      unsigned long next, struct mm_walk *walk)
-{
-	unsigned long ptev;
-	pgste_t pgste;
-
-	/*
-	 * Remove all zero page mappings,
-	 * after establishing a policy to forbid zero page mappings
-	 * following faults for that page will get fresh anonymous pages
-	 */
-	if (is_zero_pfn(pte_pfn(*pte)))
-		ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
-	/* Clear storage key */
-	pgste = pgste_get_lock(pte);
-	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
-			      PGSTE_GR_BIT | PGSTE_GC_BIT);
-	ptev = pte_val(*pte);
-	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
-		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
-	pgste_set_unlock(pte, pgste);
-	return 0;
-}
-
-int s390_enable_skey(void)
-{
-	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	int rc = 0;
-
-	down_write(&mm->mmap_sem);
-	if (mm_use_skey(mm))
-		goto out_up;
-
-	mm->context.use_skey = 1;
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
-				MADV_UNMERGEABLE, &vma->vm_flags)) {
-			mm->context.use_skey = 0;
-			rc = -ENOMEM;
-			goto out_up;
-		}
-	}
-	mm->def_flags &= ~VM_MERGEABLE;
-
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-
-out_up:
-	up_write(&mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(s390_enable_skey);
-
-/*
- * Reset CMMA state, make all pages stable again.
- */
-static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
-			     unsigned long next, struct mm_walk *walk)
-{
-	pgste_t pgste;
-
-	pgste = pgste_get_lock(pte);
-	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
-	pgste_set_unlock(pte, pgste);
-	return 0;
-}
-
-void s390_reset_cmma(struct mm_struct *mm)
-{
-	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
-	down_write(&mm->mmap_sem);
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-	up_write(&mm->mmap_sem);
-}
-EXPORT_SYMBOL_GPL(s390_reset_cmma);
-
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 				pgtable_t pgtable)
@@ -1632,3 +390,193 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 	return pgtable;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_PGSTE
+void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
+{
+	pgste_t pgste;
+
+	/* the mm_has_pgste() check is done in set_pte_at() */
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
+	pgste_set_key(ptep, pgste, entry, mm);
+	pgste = pgste_set_pte(ptep, pgste, entry);
+	pgste_set_unlock(ptep, pgste);
+}
+
+void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pgste_t pgste;
+
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) |= PGSTE_IN_BIT;
+	pgste_set_unlock(ptep, pgste);
+}
+
+static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+{
+	if (!non_swap_entry(entry))
+		dec_mm_counter(mm, MM_SWAPENTS);
+	else if (is_migration_entry(entry)) {
+		struct page *page = migration_entry_to_page(entry);
+
+		dec_mm_counter(mm, mm_counter(page));
+	}
+	free_swap_and_cache(entry);
+}
+
+void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, int reset)
+{
+	unsigned long pgstev;
+	pgste_t pgste;
+	pte_t pte;
+
+	/* Zap unused and logically-zero pages */
+	pgste = pgste_get_lock(ptep);
+	pgstev = pgste_val(pgste);
+	pte = *ptep;
+	if (pte_swap(pte) &&
+	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
+	     (pgstev & _PGSTE_GPS_ZERO))) {
+		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
+		pte_clear(mm, addr, ptep);
+	}
+	if (reset)
+		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
+	pgste_set_unlock(ptep, pgste);
+}
+
+void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	unsigned long ptev;
+	pgste_t pgste;
+
+	/* Clear storage key */
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
+			      PGSTE_GR_BIT | PGSTE_GC_BIT);
+	ptev = pte_val(*ptep);
+	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
+		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+	pgste_set_unlock(ptep, pgste);
+}
+
+/*
+ * Test and reset if a guest page is dirty
+ */
+bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+	pte_t pte;
+	bool dirty;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep))
+		return false;
+
+	pgste = pgste_get_lock(ptep);
+	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+	pgste_val(pgste) &= ~PGSTE_UC_BIT;
+	pte = *ptep;
+	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
+		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+		__ptep_ipte(addr, ptep);
+		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+			pte_val(pte) |= _PAGE_PROTECT;
+		else
+			pte_val(pte) |= _PAGE_INVALID;
+		*ptep = pte;
+	}
+	pgste_set_unlock(ptep, pgste);
+
+	spin_unlock(ptl);
+	return dirty;
+}
+EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty);
+
+int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+			  unsigned char key, bool nq)
+{
+	unsigned long keyul;
+	spinlock_t *ptl;
+	pgste_t old, new;
+	pte_t *ptep;
+
+	down_read(&mm->mmap_sem);
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep)) {
+		up_read(&mm->mmap_sem);
+		return -EFAULT;
+	}
+
+	new = old = pgste_get_lock(ptep);
+	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
+			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
+	keyul = (unsigned long) key;
+	pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
+	pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+		unsigned long address, bits, skey;
+
+		address = pte_val(*ptep) & PAGE_MASK;
+		skey = (unsigned long) page_get_storage_key(address);
+		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
+		/* Set storage key ACC and FP */
+		page_set_storage_key(address, skey, !nq);
+		/* Merge host changed & referenced into pgste  */
+		pgste_val(new) |= bits << 52;
+	}
+	/* changing the guest storage key is considered a change of the page */
+	if ((pgste_val(new) ^ pgste_val(old)) &
+	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
+		pgste_val(new) |= PGSTE_UC_BIT;
+
+	pgste_set_unlock(ptep, new);
+	pte_unmap_unlock(ptep, ptl);
+	up_read(&mm->mmap_sem);
+	return 0;
+}
+EXPORT_SYMBOL(set_guest_storage_key);
+
+unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned char key;
+	spinlock_t *ptl;
+	pgste_t pgste;
+	pte_t *ptep;
+
+	down_read(&mm->mmap_sem);
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep)) {
+		up_read(&mm->mmap_sem);
+		return -EFAULT;
+	}
+	pgste = pgste_get_lock(ptep);
+
+	if (pte_val(*ptep) & _PAGE_INVALID) {
+		key  = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
+		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
+		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
+		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
+	} else {
+		key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+
+		/* Reflect guest's logical view, not physical */
+		if (pgste_val(pgste) & PGSTE_GR_BIT)
+			key |= _PAGE_REFERENCED;
+		if (pgste_val(pgste) & PGSTE_GC_BIT)
+			key |= _PAGE_CHANGED;
+	}
+
+	pgste_set_unlock(ptep, pgste);
+	pte_unmap_unlock(ptep, ptl);
+	up_read(&mm->mmap_sem);
+	return key;
+}
+EXPORT_SYMBOL(get_guest_storage_key);
+#endif
-- 
cgit v1.2.3


From 7eb792bf7c253cb63202aae72885f5f7abdd8668 Mon Sep 17 00:00:00 2001
From: Adam Buchbinder <adam.buchbinder@gmail.com>
Date: Fri, 4 Mar 2016 11:20:04 -0800
Subject: s390: Fix misspellings in comments

Signed-off-by: Adam Buchbinder <adam.buchbinder@gmail.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/perf_event.h | 2 +-
 arch/s390/include/asm/rwsem.h      | 2 +-
 arch/s390/kernel/perf_cpum_cf.c    | 2 +-
 arch/s390/kernel/perf_event.c      | 2 +-
 arch/s390/kvm/guestdbg.c           | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index f897ec73dc8c..1f7ff85c5e4c 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -21,7 +21,7 @@
 #define PMU_F_ERR_LSDA			0x0200
 #define PMU_F_ERR_MASK			(PMU_F_ERR_IBE|PMU_F_ERR_LSDA)
 
-/* Perf defintions for PMU event attributes in sysfs */
+/* Perf definitions for PMU event attributes in sysfs */
 extern __init const struct attribute_group **cpumf_cf_event_group(void);
 extern ssize_t cpumf_events_sysfs_show(struct device *dev,
 				       struct device_attribute *attr,
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index 4b43ee7e6776..fead491dfc28 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -31,7 +31,7 @@
  * This should be totally fair - if anything is waiting, a process that wants a
  * lock will go to the back of the queue. When the currently active lock is
  * released, if there's a writer at the front of the queue, then that and only
- * that will be woken up; if there's a bunch of consequtive readers at the
+ * that will be woken up; if there's a bunch of consecutive readers at the
  * front, then they'll all be woken up, but no other readers will be.
  */
 
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 929c147e07b4..58bf4572d457 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -383,7 +383,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	/* Validate the counter that is assigned to this event.
 	 * Because the counter facility can use numerous counters at the
-	 * same time without constraints, it is not necessary to explicity
+	 * same time without constraints, it is not necessary to explicitly
 	 * validate event groups (event->group_leader != event).
 	 */
 	err = validate_event(hwc);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index a1b708643a2c..c3e4099b60a5 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -238,7 +238,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]);
 }
 
-/* Perf defintions for PMU event attributes in sysfs */
+/* Perf definitions for PMU event attributes in sysfs */
 ssize_t cpumf_events_sysfs_show(struct device *dev,
 				struct device_attribute *attr, char *page)
 {
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index d697312ce9ee..e8c6843b9600 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -17,7 +17,7 @@
 /*
  * Extends the address range given by *start and *stop to include the address
  * range starting with estart and the length len. Takes care of overflowing
- * intervals and tries to minimize the overall intervall size.
+ * intervals and tries to minimize the overall interval size.
  */
 static void extend_address_range(u64 *start, u64 *stop, u64 estart, int len)
 {
@@ -72,7 +72,7 @@ static void enable_all_hw_bp(struct kvm_vcpu *vcpu)
 		return;
 
 	/*
-	 * If the guest is not interrested in branching events, we can savely
+	 * If the guest is not interested in branching events, we can safely
 	 * limit them to the PER address range.
 	 */
 	if (!(*cr9 & PER_EVENT_BRANCH))
-- 
cgit v1.2.3


From 8f100bb1ff27873dd71f636da670e503b9ade3c6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 10 Mar 2016 10:32:21 +0100
Subject: s390/cpumf: add missing lpp magic initialization

Add the missing lpp magic initialization for cpu 0. Without this all
samples on cpu 0 do not have the most significant bit set in the
program parameter field, which we use to distinguish between guest and
host samples if the pid is also 0.

We did initialize the lpp magic in the absolute zero lowcore but
forgot that when switching to the allocated lowcore on cpu 0 only.

Reported-by: Shu Juan Zhang <zhshuj@cn.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: stable@vger.kernel.org # v4.4+
Fixes: e22cf8ca6f75 ("s390/cpumf: rework program parameter setting to detect guest samples")
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/setup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b1fbcc07c871..cc46767e902e 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -327,6 +327,7 @@ static void __init setup_lowcore(void)
 		+ PAGE_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
 	lc->current_task = (unsigned long) init_thread_union.thread_info.task;
 	lc->thread_info = (unsigned long) &init_thread_union;
+	lc->lpp = LPP_MAGIC;
 	lc->machine_flags = S390_lowcore.machine_flags;
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
-- 
cgit v1.2.3


From e370e4769463a65dcf8806fa26d2874e0542ac41 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 10 Mar 2016 09:52:55 +0100
Subject: s390: fix floating pointer register corruption (again)

There is a tricky interaction between the machine check handler
and the critical sections of load_fpu_regs and save_fpu_regs
functions. If the machine check interrupts one of the two
functions the critical section cleanup will complete the function
before the machine check handler s390_do_machine_check is called.
Trouble is that the machine check handler needs to validate the
floating point registers *before* and not *after* the completion
of load_fpu_regs/save_fpu_regs.

The simplest solution is to rewind the PSW to the start of the
load_fpu_regs/save_fpu_regs and retry the function after the
return from the machine check handler.

Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: <stable@vger.kernel.org> # 4.3+
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S | 106 +----------------------------------------------
 1 file changed, 2 insertions(+), 104 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 4ba688c9d5a6..2d47f9cfcb36 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -1200,114 +1200,12 @@ cleanup_critical:
 	.quad	.Lpsw_idle_lpsw
 
 .Lcleanup_save_fpu_regs:
-	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
-	bor	%r14
-	clg	%r9,BASED(.Lcleanup_save_fpu_regs_done)
-	jhe	5f
-	clg	%r9,BASED(.Lcleanup_save_fpu_regs_fp)
-	jhe	4f
-	clg	%r9,BASED(.Lcleanup_save_fpu_regs_vx_high)
-	jhe	3f
-	clg	%r9,BASED(.Lcleanup_save_fpu_regs_vx_low)
-	jhe	2f
-	clg	%r9,BASED(.Lcleanup_save_fpu_fpc_end)
-	jhe	1f
-	lg	%r2,__LC_CURRENT
-	aghi	%r2,__TASK_thread
-0:	# Store floating-point controls
-	stfpc	__THREAD_FPU_fpc(%r2)
-1:	# Load register save area and check if VX is active
-	lg	%r3,__THREAD_FPU_regs(%r2)
-	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
-	jz	4f			  # no VX -> store FP regs
-2:	# Store vector registers (V0-V15)
-	VSTM	%v0,%v15,0,%r3		  # vstm 0,15,0(3)
-3:	# Store vector registers (V16-V31)
-	VSTM	%v16,%v31,256,%r3	  # vstm 16,31,256(3)
-	j	5f			  # -> done, set CIF_FPU flag
-4:	# Store floating-point registers
-	std	0,0(%r3)
-	std	1,8(%r3)
-	std	2,16(%r3)
-	std	3,24(%r3)
-	std	4,32(%r3)
-	std	5,40(%r3)
-	std	6,48(%r3)
-	std	7,56(%r3)
-	std	8,64(%r3)
-	std	9,72(%r3)
-	std	10,80(%r3)
-	std	11,88(%r3)
-	std	12,96(%r3)
-	std	13,104(%r3)
-	std	14,112(%r3)
-	std	15,120(%r3)
-5:	# Set CIF_FPU flag
-	oi	__LC_CPU_FLAGS+7,_CIF_FPU
-	lg	%r9,48(%r11)		# return from save_fpu_regs
+	larl	%r9,save_fpu_regs
 	br	%r14
-.Lcleanup_save_fpu_fpc_end:
-	.quad	.Lsave_fpu_regs_fpc_end
-.Lcleanup_save_fpu_regs_vx_low:
-	.quad	.Lsave_fpu_regs_vx_low
-.Lcleanup_save_fpu_regs_vx_high:
-	.quad	.Lsave_fpu_regs_vx_high
-.Lcleanup_save_fpu_regs_fp:
-	.quad	.Lsave_fpu_regs_fp
-.Lcleanup_save_fpu_regs_done:
-	.quad	.Lsave_fpu_regs_done
 
 .Lcleanup_load_fpu_regs:
-	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
-	bnor	%r14
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_done)
-	jhe	1f
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_fp)
-	jhe	2f
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_vx_high)
-	jhe	3f
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_vx)
-	jhe	4f
-	lg	%r4,__LC_CURRENT
-	aghi	%r4,__TASK_thread
-	lfpc	__THREAD_FPU_fpc(%r4)
-	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
-	lg	%r4,__THREAD_FPU_regs(%r4)	# %r4 <- reg save area
-	jz	2f				# -> no VX, load FP regs
-4:	# Load V0 ..V15 registers
-	VLM	%v0,%v15,0,%r4
-3:	# Load V16..V31 registers
-	VLM	%v16,%v31,256,%r4
-	j	1f
-2:	# Load floating-point registers
-	ld	0,0(%r4)
-	ld	1,8(%r4)
-	ld	2,16(%r4)
-	ld	3,24(%r4)
-	ld	4,32(%r4)
-	ld	5,40(%r4)
-	ld	6,48(%r4)
-	ld	7,56(%r4)
-	ld	8,64(%r4)
-	ld	9,72(%r4)
-	ld	10,80(%r4)
-	ld	11,88(%r4)
-	ld	12,96(%r4)
-	ld	13,104(%r4)
-	ld	14,112(%r4)
-	ld	15,120(%r4)
-1:	# Clear CIF_FPU bit
-	ni	__LC_CPU_FLAGS+7,255-_CIF_FPU
-	lg	%r9,48(%r11)		# return from load_fpu_regs
+	larl	%r9,load_fpu_regs
 	br	%r14
-.Lcleanup_load_fpu_regs_vx:
-	.quad	.Lload_fpu_regs_vx
-.Lcleanup_load_fpu_regs_vx_high:
-	.quad	.Lload_fpu_regs_vx_high
-.Lcleanup_load_fpu_regs_fp:
-	.quad	.Lload_fpu_regs_fp
-.Lcleanup_load_fpu_regs_done:
-	.quad	.Lload_fpu_regs_done
 
 /*
  * Integer constants
-- 
cgit v1.2.3


From 80c544ded25ac14d7cc3e555abb8ed2c2da99b84 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 14 Mar 2016 15:47:23 +0100
Subject: s390/pci: enforce fmb page boundary rule

The function measurement block must not cross a page boundary. Ensure
that by raising the alignment requirement to the smallest power of 2
larger than the size of the fmb.

Fixes: d0b088531 ("s390/pci: performance statistics and debug infrastructure")
Cc: stable@vger.kernel.org # v3.8+
Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/pci.h | 2 +-
 arch/s390/pci/pci.c         | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index dc763eae7c49..f833082d9167 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -45,7 +45,7 @@ struct zpci_fmb {
 	u64 rpcit_ops;
 	u64 dma_rbytes;
 	u64 dma_wbytes;
-} __packed __aligned(16);
+} __packed __aligned(64);
 
 enum zpci_state {
 	ZPCI_FN_STATE_RESERVED,
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index f76d01e69fb8..9fd59a7cfcd3 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -860,8 +860,11 @@ static inline int barsize(u8 size)
 
 static int zpci_mem_init(void)
 {
+	BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) ||
+		     __alignof__(struct zpci_fmb) < sizeof(struct zpci_fmb));
+
 	zdev_fmb_cache = kmem_cache_create("PCI_FMB_cache", sizeof(struct zpci_fmb),
-				16, 0, NULL);
+					   __alignof__(struct zpci_fmb), 0, NULL);
 	if (!zdev_fmb_cache)
 		goto error_fmb;
 
-- 
cgit v1.2.3