author	Hugh Dickins <hughd@google.com>	2017-12-11 17:59:50 -0800
committer	Ben Hutchings <ben@decadent.org.uk>	2018-01-07 01:46:49 +0000
commit	a4f588df14fb393b1c8f37c997dbab95afc2eb54 (patch)
tree	e4b11b8099abef5e9eb46eedfad86d8580a6a20f
parent	add19752eb782379610a01ea0d8cfce83cf071b0 (diff)
KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique to close hardware side channels on kernel address information.
More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Richard Fellner <richard.fellner@student.tugraz.at>
Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Clémentine Maurice <clementine.maurice@iaik.tugraz.at>
Moritz Lipp <moritz.lipp@iaik.tugraz.at>
That original was then developed further by
Dave Hansen <dave.hansen@intel.com>
Hugh Dickins <hughd@google.com>
then others after this snapshot.
This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes and cleanups and
enhancements to three large patches. About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
mm/pageattr.c changed a lot between 3.2 and 3.18, and the Kaiser
mods there never seemed necessary anyway.
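For orientation, that reimplementation is just a walk of the shadow
page tables, clearing each PTE in the range and never freeing the
page-table pages themselves. A minimal sketch, using the names from
arch/x86/mm/kaiser.c in the diff below:

	/*
	 * Sketch of the reimplemented kaiser_remove_mapping(): clear the
	 * shadow PTEs covering [start, start + size); the pages backing
	 * the shadow page tables are left in place.
	 */
	void kaiser_remove_mapping(unsigned long start, unsigned long size)
	{
		unsigned long addr;

		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			pte_t *pte = kaiser_pagetable_walk(addr);

			if (pte)
				set_pte(pte, __pte(0));
		}
	}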
KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()
Signed-off-by: Hugh Dickins <hughd@google.com>
[bwh:
- Fixed the #undef in arch/x86/boot/compressed/misc.h
- Added missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
33 files changed, 1049 insertions, 67 deletions
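As a reading aid before the diff itself: the heart of the patch is the
CR3 switch performed on every transition between kernel and user mode.
A rough C-level paraphrase of the _SWITCH_TO_KERNEL_CR3 and
_SWITCH_TO_USER_CR3 assembly macros from asm/kaiser.h follows (names
taken from the patch; the helper functions are illustrative only, and
the flush-on-first-return-to-user handling is omitted):

	/*
	 * The shadow (user) pgd sits one page above the kernel pgd, so
	 * KAISER_SHADOW_PGD_OFFSET (0x1000) selects between the two
	 * copies; the PCID/NOFLUSH bits come from x86_cr3_pcid_noflush
	 * and the per-cpu x86_cr3_pcid_user.
	 */
	static unsigned long kaiser_kernel_cr3(unsigned long cr3)
	{
		cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
		return cr3 | x86_cr3_pcid_noflush;
	}

	static unsigned long kaiser_user_cr3(unsigned long cr3)
	{
		/* x86_cr3_pcid_user = KAISER_SHADOW_PGD_OFFSET | user PCID */
		return cr3 | this_cpu_read(x86_cr3_pcid_user);
	}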
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3f19c81a6203..2fa2635ee539 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -7,6 +7,7 @@ * we just keep it from happening */ #undef CONFIG_PARAVIRT +#undef CONFIG_KAISER #ifdef CONFIG_X86_32 #define _ASM_X86_DESC_H 1 #endif diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2b5527726ae1..7eb0d4792800 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -12,6 +12,8 @@ #include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> +#include <asm/pgtable_types.h> +#include <asm/kaiser.h> #include <asm/irqflags.h> #include <linux/linkage.h> @@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(kernel_stack), %rsp addq $(KERNEL_STACK_OFFSET),%rsp /* @@ -183,6 +186,7 @@ sysexit_from_sys_call: popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON + SWITCH_USER_CR3 ENABLE_INTERRUPTS_SYSEXIT32 #ifdef CONFIG_AUDITSYSCALL @@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(kernel_stack),%rsp @@ -337,6 +342,7 @@ sysretl_from_sys_call: xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON + SWITCH_USER_CR3 movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp USERGS_SYSRET32 @@ -409,6 +415,7 @@ ENTRY(ia32_syscall) CFI_REL_OFFSET rip,RIP-RIP PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS + SWITCH_KERNEL_CR3_NO_STACK /* * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6f254f2fcd40..736272670870 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 382ce8a9fd62..7f1ead938ec1 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -40,7 +40,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6ed2be7..3354a390cc71 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 
000000000000..6f4c8ef46881 --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,126 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +#include <asm/processor-flags.h> /* For PCID constants */ + +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. + */ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +orq x86_cr3_pcid_noflush, \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. + */ +movq %cr3, \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg +js 9f +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) +9: +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax %al +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg regb +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_KAISER +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +extern unsigned long x86_cr3_pcid_noflush; +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + +/** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. 
+ */ +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + +/** + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * kaiser_init - Initialize the shadow mapping + * + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and never unmapped. + */ +extern void kaiser_init(void); + +#endif /* CONFIG_KAISER */ + +#endif /* __ASSEMBLY */ + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6be990922d4b..b1c8b8d3b02a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (IS_ENABLED(CONFIG_KAISER)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); +#endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..a3bf3de9893b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +{ + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); +} +#else +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ + return NULL; +} +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +{ + return pgdp; +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pgd; + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2c..6e1315068a62 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,7 +39,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else #define _PAGE_GLOBAL 
(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -62,7 +66,7 @@ #endif #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) @@ -74,6 +78,33 @@ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) #define _PAGE_CACHE_WC (_PAGE_PWT) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a9e14a52385f..360e80d0d217 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -43,6 +43,8 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..048249e983ca 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -266,7 +266,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss); /* * Save the original ist values for checking stack pointers during debugging diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e04cbc550424..288195901c8a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. 
+ */ +#ifdef CONFIG_KAISER +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all_nonglobals(); + return; + } + /* * If current->mm == NULL then we borrow a mm which may change during a * task switch and therefore we must not be preempted while we write CR3 * back: */ preempt_disable(); + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); } static inline void __native_flush_tlb_global(void) { +#ifdef CONFIG_KAISER + /* Globals are not used at all */ + __native_flush_tlb(); +#else unsigned long flags; unsigned long cr4; - if (static_cpu_has(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void) native_write_cr4(cr4); raw_local_irq_restore(flags); +#endif } static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invlpg in the case we are called early. + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 895e4b88469c..b567c89fc628 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = { static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) * SDM says that it can't be enabled in 32-bit mode. */ set_in_cr4(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. 
+ * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above set_in_cr4(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't @@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PCID); } } + kaiser_setup_pcid(); } /* @@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 2d4e76ba2b5c..fb933cdca184 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -2,10 +2,14 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/kaiser.h> #include <asm/perf_event.h> #include "perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_KAISER + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_KAISER + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu) if (!ds || !x86_pmu.pebs) return; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE); ds->pebs_buffer_base = 0; } @@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, 
sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f6daf3cdb878..3a4356a2f156 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,7 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/pgtable_types.h> +#include <asm/kaiser.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64) testl $3, CS(%rdi) je 1f SWAPGS + SWITCH_KERNEL_CR3 /* * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is @@ -362,6 +364,12 @@ END(save_rest) /* save complete stack frame */ .pushsection .kprobes.text, "ax" +/* + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit + */ ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -387,7 +395,25 @@ ENTRY(save_paranoid) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx,%ebx -1: ret +1: +#ifdef CONFIG_KAISER + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ + movq %cr3, %rax + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + orq x86_cr3_pcid_noflush, %rax + movq %rax, %cr3 +2: +#endif + ret CFI_ENDPROC END(save_paranoid) .popsection @@ -464,6 +490,7 @@ ENTRY(system_call) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -515,6 +542,14 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 1,-ARG_SKIP,0 /*CFI_REGISTER rflags,r11*/ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ + SWITCH_USER_CR3 movq PER_CPU_VAR(old_rsp), %rsp USERGS_SYSRET64 @@ -851,6 +886,14 @@ retint_swapgs: /* return to user-space */ */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. 
+ */ + SWITCH_USER_CR3 SWAPGS jmp restore_args @@ -891,6 +934,7 @@ native_irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ movq (2*8)(%rsp),%rax /* RIP */ @@ -906,6 +950,7 @@ native_irq_return_ldt: andl $0xffff0000,%eax popq_cfi %rdi orq PER_CPU_VAR(espfix_stack),%rax + SWITCH_USER_CR3 SWAPGS movq %rax,%rsp popq_cfi %rax @@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) * is fundamentally NMI-unsafe. (we cannot change the soft and * hard flags at once, atomically) */ - - /* ebx: no swapgs flag */ +/* + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs + */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: + movq %rbx, %r12 /* paranoid_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz paranoid_userspace +paranoid_kernel: + movq %r12, %rbx /* restore after paranoid_userspace */ TRACE_IRQS_IRETQ 0 +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +paranoid_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs SWAPGS_UNSAFE_STACK +paranoid_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return -paranoid_restore: - TRACE_IRQS_IRETQ 0 - RESTORE_ALL 8 - jmp irq_return + jmp irq_return + paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs + jz paranoid_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ @@ -1438,6 +1493,13 @@ ENTRY(error_entry) movq_cfi r13, R13+8 movq_cfi r14, R14+8 movq_cfi r15, R15+8 + /* + * error_entry() always returns with a kernel gsbase and + * CR3. We must also have a kernel CR3/gsbase before + * calling TRACE_IRQS_*. Just unconditionally switch to + * the kernel CR3 here. + */ + SWITCH_KERNEL_CR3 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1527,22 +1589,31 @@ ENTRY(nmi) call do_nmi #ifdef CONFIG_TRACE_IRQFLAGS /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ + /* ebx: no-swapgs and kaiser-switch-cr3 flag */ DISABLE_INTERRUPTS(CLBR_NONE) - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace -nmi_swapgs: + movq %rbx, %r12 /* nmi_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz nmi_userspace +nmi_kernel: + movq %r12, %rbx /* restore after nmi_userspace */ +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz nmi_exit_no_switch + SWITCH_USER_CR3 +nmi_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? 
*/ + jnz nmi_exit_no_swapgs SWAPGS_UNSAFE_STACK -nmi_restore: +nmi_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return + jmp irq_return + nmi_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs + jz nmi_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb1033..14cd73b0e634 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> +#include <asm/kaiser.h> /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -129,6 +130,14 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (IS_ENABLED(CONFIG_KAISER)) + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0f8ebf78253a..6e697ac3fb54 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -338,6 +338,27 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ ENTRY(name) +#ifdef CONFIG_KAISER +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -353,13 +374,14 @@ ENTRY(name) * 0xffffffff80000000 to physical address 0x000000. (always using * 2Mbyte large pages provided by PAE mode) */ -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level2_kernel_pgt) /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 43e9ccf44947..f00e6e734fbd 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
*/ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index e328f691eeef..990f743e21b8 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -85,7 +85,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = -1, }; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1dd32307a494..836a4c2d5ceb 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/kaiser.h> #include <asm/system.h> #include <asm/ldt.h> @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); new_ldt->size = size; + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } return new_ldt; } @@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - kfree(ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 557eb3757edb..d2ce2a33d15b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,7 +57,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index cf2a84031dfd..c9a00a5e0b87 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_MEMTEST) += memtest.o +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..79b0222ffa74 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,382 @@ +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +extern struct mm_struct init_mm; + +#include <asm/kaiser.h> +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> + +#ifdef CONFIG_KAISER 
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. + * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. + */ +unsigned long x86_cr3_pcid_noflush __read_mostly; +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +/* + * Returns -1 on error. + */ +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); +} + +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. 
+ */ +static pte_t *kaiser_pagetable_walk(unsigned long address) +{ + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } + + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } + + return pte_offset_kernel(pmd, address); +} + +int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address); + if (!pte) { + ret = -ENOMEM; + break; + } + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); +} + +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. + */ +static void __init kaiser_init_all_pgds(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. 
+ */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); + } +} + +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ +void __init kaiser_init(void) +{ + int cpu; + + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); + } + + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. + */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + kaiser_add_user_map_early(&x86_cr3_pcid_noflush, + sizeof(x86_cr3_pcid_noflush), + __PAGE_KERNEL); +} + +/* Add a mapping to the shadow mapping, and synchronize the mappings */ +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + return kaiser_add_user_map((const void *)addr, size, flags); +} + +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + unsigned long end = start + size; + unsigned long addr; + pte_t *pte; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + pte = kaiser_pagetable_walk(addr); + if (pte) + set_pte(pte, __pte(0)); + } +} + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). 
+ */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} + +void kaiser_setup_pcid(void) +{ + unsigned long kern_cr3 = 0; + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) { + kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + } + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + x86_cr3_pcid_noflush = kern_cr3; + this_cpu_write(x86_cr3_pcid_user, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: + * if cpu does not, then the NOFLUSH bit will never have been set. + */ +void kaiser_flush_tlb_on_return_to_user(void) +{ + this_cpu_write(x86_cr3_pcid_user, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); +} +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d0..73285602c93f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -5,7 +5,7 @@ #include <asm/tlb.h> #include <asm/fixmap.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_KAISER +/* + * Instead of one pmd, we aquire two pmds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + +static inline pgd_t *_pgd_alloc(void) +{ + /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ + return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, + PGD_ALLOCATION_ORDER); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + pgd = _pgd_alloc(); if (pgd == NULL) goto out; @@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(pmds); out_free_pgd: - free_page((unsigned long)pgd); + _pgd_free(pgd); out: return NULL; } @@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - free_page((unsigned long)pgd); + _pgd_free(pgd); } int ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4f5ca8f04c0d..4078e3092b16 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,10 +12,43 @@ #include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> +#include <asm/kaiser.h> DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; +static void load_new_mm_cr3(pgd_t *pgdir) +{ + unsigned long new_mm_cr3 = __pa(pgdir); + +#ifdef CONFIG_KAISER + if (this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. + * Flush KERN below, flush USER when returning to userspace in + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. 
+ * + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + * + * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; + * but keep that line in there in case something changes. + */ + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +#endif /* CONFIG_KAISER */ + + /* + * Caution: many callers of this function expect + * that load_new_mm_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -65,7 +98,7 @@ void leave_mm(int cpu) BUG(); cpumask_clear_cpu(cpu, mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * from next->pgd. TLB fills are special and can happen * due to instruction fetches or for no reason at all, * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * + * Fortunately, load_new_mm_cr3() is serializing + * and gives the ordering guarantee we need. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); /* stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); @@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. + * As above, load_new_mm_cr3() is serializing and orders + * TLB fills with respect to the mm_cpumask write. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); load_mm_ldt(next); } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b5e2e4c6b017..01c8155dd613 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -692,7 +692,14 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h new file mode 100644 index 000000000000..4a4d6d911a14 --- /dev/null +++ b/include/linux/kaiser.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KAISER_H +#define _LINUX_KAISER_H + +#ifdef CONFIG_KAISER +#include <asm/kaiser.h> + +static inline int kaiser_map_thread_stack(void *stack) +{ + /* + * Map that page of kernel stack on which we enter from user context. + */ + return kaiser_add_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); +} + +static inline void kaiser_unmap_thread_stack(void *stack) +{ + /* + * Note: may be called even when kaiser_map_thread_stack() failed. 
+ */ + kaiser_remove_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); +} +#else + +/* + * These stubs are used whenever CONFIG_KAISER is off, which + * includes architectures that support KAISER, but have it disabled. + */ + +static inline void kaiser_init(void) +{ +} +static inline int kaiser_add_mapping(unsigned long addr, + unsigned long size, unsigned long flags) +{ + return 0; +} +static inline void kaiser_remove_mapping(unsigned long start, + unsigned long size) +{ +} +static inline int kaiser_map_thread_stack(void *stack) +{ + return 0; +} +static inline void kaiser_unmap_thread_stack(void *stack) +{ +} + +#endif /* !CONFIG_KAISER */ +#endif /* _LINUX_KAISER_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 25842b6e72e1..a0b4422a116a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -95,8 +95,9 @@ enum zone_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK, /* Second 128 byte cacheline */ + NR_KERNEL_STACK, + NR_KAISERTABLE, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 27ef6b190ea6..56f5eeb78d1d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -28,6 +28,12 @@ (void)__vpp_verify; \ } while (0) +#ifdef CONFIG_KAISER +#define USER_MAPPED_SECTION "..user_mapped" +#else +#define USER_MAPPED_SECTION "" +#endif + /* * s390 and alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external @@ -90,6 +96,12 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + /* * Declaration/definition used for per-CPU variables that must come first in * the set of variables. @@ -119,6 +131,14 @@ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + #define DECLARE_PER_CPU_ALIGNED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ ____cacheline_aligned @@ -137,11 +157,21 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ __aligned(PAGE_SIZE) +/* + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) /* * Declaration/definition used for per-CPU variables that must be read mostly. 
*/ -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ DECLARE_PER_CPU_SECTION(type, name, "..readmostly") #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ diff --git a/init/main.c b/init/main.c index e937d9bda0f8..558a9fdd566d 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/random.h> +#include <linux/kaiser.h> #include <asm/io.h> #include <asm/bugs.h> @@ -463,6 +464,7 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); + kaiser_init(); } asmlinkage void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index 29b460431c12..511131a15a75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> +#include <linux/kaiser.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> @@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { + kaiser_unmap_thread_stack(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } #endif @@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; + err = kaiser_map_thread_stack(tsk->stack); + if (err) + goto out; + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); diff --git a/mm/vmstat.c b/mm/vmstat.c index ff9060919c4b..eaf3db038652 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -699,6 +699,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", + "nr_overhead", "nr_unstable", "nr_bounce", "nr_vmscan_write", diff --git a/security/Kconfig b/security/Kconfig index 51bd5a0b69ae..19f83193e7ab 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -96,6 +96,16 @@ config SECURITY If you are unsure how to answer this question, answer N. +config KAISER + bool "Remove the kernel mapping in user mode" + default y + depends on X86_64 && SMP && !PARAVIRT + help + This enforces a strict kernel and user space isolation, in order + to close hardware side channels on kernel address information. + + If you are unsure how to answer this question, answer Y. + config SECURITYFS bool "Enable the securityfs filesystem" help |
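A usage note on the new interface: runtime allocations that must be
visible on the user CR3 follow the pattern the patch itself uses in
arch/x86/kernel/cpu/perf_event_intel_ds.c - allocate, map with
kaiser_add_mapping(), and unmap with kaiser_remove_mapping() before
freeing. A condensed sketch (the helper names here are made up; this
assumes CONFIG_KAISER=y):

	static void *user_mapped_alloc(size_t size, gfp_t flags, int node)
	{
		unsigned int order = get_order(size);
		struct page *page;
		unsigned long addr;

		page = alloc_pages_node(node, flags | __GFP_ZERO, order);
		if (!page)
			return NULL;
		addr = (unsigned long)page_address(page);
		/* make the buffer visible in the shadow (user) page tables */
		if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
			__free_pages(page, order);
			return NULL;
		}
		return (void *)addr;
	}

	static void user_mapped_free(const void *buffer, size_t size)
	{
		if (!buffer)
			return;
		/* drop the shadow mapping before returning the pages */
		kaiser_remove_mapping((unsigned long)buffer, size);
		free_pages((unsigned long)buffer, get_order(size));
	}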