author	Hugh Dickins <hughd@google.com>	2017-12-11 17:59:50 -0800
committer	Ben Hutchings <ben@decadent.org.uk>	2018-01-07 01:46:49 +0000
commit	a4f588df14fb393b1c8f37c997dbab95afc2eb54 (patch)
tree	e4b11b8099abef5e9eb46eedfad86d8580a6a20f
parent	add19752eb782379610a01ea0d8cfce83cf071b0 (diff)
KAISER: Kernel Address Isolation
This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel isolation
technique to close hardware side channels on kernel address information.
More information about the original patch can be found at:
https://github.com/IAIK/KAISER
http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Daniel Gruss <daniel.gruss@iaik.tugraz.at>
Richard Fellner <richard.fellner@student.tugraz.at>
Michael Schwarz <michael.schwarz@iaik.tugraz.at>
Clémentine Maurice <clementine.maurice@iaik.tugraz.at>
Moritz Lipp <moritz.lipp@iaik.tugraz.at>
That original was then developed further by
Dave Hansen <dave.hansen@intel.com>
Hugh Dickins <hughd@google.com>
then others after this snapshot.
This combined patch for 3.2.96 was derived from hughd's patches below
for 3.18.72, in 2017-12-04's kaiser-3.18.72.tar; except for the last,
which was sent in 2017-12-09's nokaiser-3.18.72.tar. They have been
combined in order to minimize the effort of rebasing: most of the
patches in the 3.18.72 series were small fixes and cleanups and
enhancements to three large patches. About the only new work in this
backport is a simple reimplementation of kaiser_remove_mapping():
mm/pageattr.c changed a lot between 3.2 and 3.18, and the Kaiser
mods there never seemed necessary anyway.
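For orientation, that reimplementation is just a walk of the shadow
page tables, clearing each PTE in the range and never freeing the
page-table pages themselves. A minimal sketch, using the names from
arch/x86/mm/kaiser.c in the diff below:

	/*
	 * Sketch of the reimplemented kaiser_remove_mapping(): clear the
	 * shadow PTEs covering [start, start + size); the pages backing
	 * the shadow page tables are left in place.
	 */
	void kaiser_remove_mapping(unsigned long start, unsigned long size)
	{
		unsigned long addr;

		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			pte_t *pte = kaiser_pagetable_walk(addr);

			if (pte)
				set_pte(pte, __pte(0));
		}
	}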
KAISER: Kernel Address Isolation
kaiser: merged update
kaiser: do not set _PAGE_NX on pgd_none
kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE
kaiser: fix build and FIXME in alloc_ldt_struct()
kaiser: KAISER depends on SMP
kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER
kaiser: fix perf crashes
kaiser: ENOMEM if kaiser_pagetable_walk() NULL
kaiser: tidied up asm/kaiser.h somewhat
kaiser: tidied up kaiser_add/remove_mapping slightly
kaiser: kaiser_remove_mapping() move along the pgd
kaiser: align addition to x86/mm/Makefile
kaiser: cleanups while trying for gold link
kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET
kaiser: delete KAISER_REAL_SWITCH option
kaiser: vmstat show NR_KAISERTABLE as nr_overhead
kaiser: enhanced by kernel and user PCIDs
kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user
kaiser: PCID 0 for kernel and 128 for user
kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user
kaiser: paranoid_entry pass cr3 need to paranoid_exit
kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls
kaiser: fix unlikely error in alloc_ldt_struct()
kaiser: drop is_atomic arg to kaiser_pagetable_walk()
Signed-off-by: Hugh Dickins <hughd@google.com>
[bwh:
- Fixed the #undef in arch/x86/boot/compressed/misc.h
- Added missing #include in arch/x86/mm/kaiser.c]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
33 files changed, 1049 insertions, 67 deletions
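As a reading aid before the diff itself: the heart of the patch is the
CR3 switch performed on every transition between kernel and user mode.
A rough C-level paraphrase of the _SWITCH_TO_KERNEL_CR3 and
_SWITCH_TO_USER_CR3 assembly macros from asm/kaiser.h follows (names
taken from the patch; the helper functions are illustrative only, and
the flush-on-first-return-to-user handling is omitted):

	/*
	 * The shadow (user) pgd sits one page above the kernel pgd, so
	 * KAISER_SHADOW_PGD_OFFSET (0x1000) selects between the two
	 * copies; the PCID/NOFLUSH bits come from x86_cr3_pcid_noflush
	 * and the per-cpu x86_cr3_pcid_user.
	 */
	static unsigned long kaiser_kernel_cr3(unsigned long cr3)
	{
		cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
		return cr3 | x86_cr3_pcid_noflush;
	}

	static unsigned long kaiser_user_cr3(unsigned long cr3)
	{
		/* x86_cr3_pcid_user = KAISER_SHADOW_PGD_OFFSET | user PCID */
		return cr3 | this_cpu_read(x86_cr3_pcid_user);
	}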
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3f19c81a6203..2fa2635ee539 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -7,6 +7,7 @@ * we just keep it from happening */ #undef CONFIG_PARAVIRT +#undef CONFIG_KAISER #ifdef CONFIG_X86_32 #define _ASM_X86_DESC_H 1 #endif diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 2b5527726ae1..7eb0d4792800 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -12,6 +12,8 @@ #include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> +#include <asm/pgtable_types.h> +#include <asm/kaiser.h> #include <asm/irqflags.h> #include <linux/linkage.h> @@ -120,6 +122,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(kernel_stack), %rsp addq $(KERNEL_STACK_OFFSET),%rsp /* @@ -183,6 +186,7 @@ sysexit_from_sys_call: popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON + SWITCH_USER_CR3 ENABLE_INTERRUPTS_SYSEXIT32 #ifdef CONFIG_AUDITSYSCALL @@ -281,6 +285,7 @@ ENTRY(ia32_cstar_target) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(kernel_stack),%rsp @@ -337,6 +342,7 @@ sysretl_from_sys_call: xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON + SWITCH_USER_CR3 movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp USERGS_SYSRET32 @@ -409,6 +415,7 @@ ENTRY(ia32_syscall) CFI_REL_OFFSET rip,RIP-RIP PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS + SWITCH_KERNEL_CR3_NO_STACK /* * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6f254f2fcd40..736272670870 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 382ce8a9fd62..7f1ead938ec1 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -40,7 +40,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6ed2be7..3354a390cc71 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -164,7 +164,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 
000000000000..6f4c8ef46881 --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,126 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +#include <asm/processor-flags.h> /* For PCID constants */ + +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. + */ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +orq x86_cr3_pcid_noflush, \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. + */ +movq %cr3, \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg +js 9f +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) +9: +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax %al +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg regb +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_KAISER +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +extern unsigned long x86_cr3_pcid_noflush; +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + +/** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. 
+ */ +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + +/** + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * kaiser_init - Initialize the shadow mapping + * + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and never unmapped. + */ +extern void kaiser_init(void); + +#endif /* CONFIG_KAISER */ + +#endif /* __ASSEMBLY */ + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6be990922d4b..b1c8b8d3b02a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -570,7 +570,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (IS_ENABLED(CONFIG_KAISER)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -771,6 +781,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); +#endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..a3bf3de9893b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -105,9 +105,36 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +{ + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); +} +#else +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +{ + return NULL; +} +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +{ + return pgdp; +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pgd; + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2c..6e1315068a62 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,7 +39,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else #define _PAGE_GLOBAL 
(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -62,7 +66,7 @@ #endif #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) @@ -74,6 +78,33 @@ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) #define _PAGE_CACHE_WC (_PAGE_PWT) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a9e14a52385f..360e80d0d217 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -43,6 +43,8 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..048249e983ca 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -266,7 +266,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss); /* * Save the original ist values for checking stack pointers during debugging diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e04cbc550424..288195901c8a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -64,27 +64,59 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. 
+ */ +#ifdef CONFIG_KAISER +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all_nonglobals(); + return; + } + /* * If current->mm == NULL then we borrow a mm which may change during a * task switch and therefore we must not be preempted while we write CR3 * back: */ preempt_disable(); + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); } static inline void __native_flush_tlb_global(void) { +#ifdef CONFIG_KAISER + /* Globals are not used at all */ + __native_flush_tlb(); +#else unsigned long flags; unsigned long cr4; - if (static_cpu_has(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -104,11 +136,39 @@ static inline void __native_flush_tlb_global(void) native_write_cr4(cr4); raw_local_irq_restore(flags); +#endif } static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invlpg in the case we are called early. + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 895e4b88469c..b567c89fc628 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -84,7 +84,7 @@ static const struct cpu_dev __cpuinitconst default_cpu = { static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -319,6 +319,19 @@ static void setup_pcid(struct cpuinfo_x86 *c) * SDM says that it can't be enabled in 32-bit mode. */ set_in_cr4(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. 
+ * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above set_in_cr4(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't @@ -331,6 +344,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PCID); } } + kaiser_setup_pcid(); } /* @@ -1115,7 +1129,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 2d4e76ba2b5c..fb933cdca184 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -2,10 +2,14 @@ #include <linux/types.h> #include <linux/slab.h> +#include <asm/kaiser.h> #include <asm/perf_event.h> #include "perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -60,6 +64,39 @@ void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_KAISER + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_KAISER + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -70,7 +107,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -94,7 +131,7 @@ static void release_pebs_buffer(int cpu) if (!ds || !x86_pmu.pebs) return; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE); ds->pebs_buffer_base = 0; } @@ -108,7 +145,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -132,19 +169,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, 
sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ -158,7 +191,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f6daf3cdb878..3a4356a2f156 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,7 @@ #include <asm/ftrace.h> #include <asm/percpu.h> #include <asm/pgtable_types.h> +#include <asm/kaiser.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -323,6 +324,7 @@ ENDPROC(native_usergs_sysret64) testl $3, CS(%rdi) je 1f SWAPGS + SWITCH_KERNEL_CR3 /* * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is @@ -362,6 +364,12 @@ END(save_rest) /* save complete stack frame */ .pushsection .kprobes.text, "ax" +/* + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit + */ ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld @@ -387,7 +395,25 @@ ENTRY(save_paranoid) js 1f /* negative -> in kernel */ SWAPGS xorl %ebx,%ebx -1: ret +1: +#ifdef CONFIG_KAISER + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ + movq %cr3, %rax + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + orq x86_cr3_pcid_noflush, %rax + movq %rax, %cr3 +2: +#endif + ret CFI_ENDPROC END(save_paranoid) .popsection @@ -464,6 +490,7 @@ ENTRY(system_call) CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -515,6 +542,14 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 1,-ARG_SKIP,0 /*CFI_REGISTER rflags,r11*/ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ + SWITCH_USER_CR3 movq PER_CPU_VAR(old_rsp), %rsp USERGS_SYSRET64 @@ -851,6 +886,14 @@ retint_swapgs: /* return to user-space */ */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. 
+ */ + SWITCH_USER_CR3 SWAPGS jmp restore_args @@ -891,6 +934,7 @@ native_irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ movq (2*8)(%rsp),%rax /* RIP */ @@ -906,6 +950,7 @@ native_irq_return_ldt: andl $0xffff0000,%eax popq_cfi %rdi orq PER_CPU_VAR(espfix_stack),%rax + SWITCH_USER_CR3 SWAPGS movq %rax,%rsp popq_cfi %rax @@ -1366,30 +1411,40 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) * is fundamentally NMI-unsafe. (we cannot change the soft and * hard flags at once, atomically) */ - - /* ebx: no swapgs flag */ +/* + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs + */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: + movq %rbx, %r12 /* paranoid_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz paranoid_userspace +paranoid_kernel: + movq %r12, %rbx /* restore after paranoid_userspace */ TRACE_IRQS_IRETQ 0 +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +paranoid_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs SWAPGS_UNSAFE_STACK +paranoid_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return -paranoid_restore: - TRACE_IRQS_IRETQ 0 - RESTORE_ALL 8 - jmp irq_return + jmp irq_return + paranoid_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs + jz paranoid_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ @@ -1438,6 +1493,13 @@ ENTRY(error_entry) movq_cfi r13, R13+8 movq_cfi r14, R14+8 movq_cfi r15, R15+8 + /* + * error_entry() always returns with a kernel gsbase and + * CR3. We must also have a kernel CR3/gsbase before + * calling TRACE_IRQS_*. Just unconditionally switch to + * the kernel CR3 here. + */ + SWITCH_KERNEL_CR3 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1527,22 +1589,31 @@ ENTRY(nmi) call do_nmi #ifdef CONFIG_TRACE_IRQFLAGS /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ + /* ebx: no-swapgs and kaiser-switch-cr3 flag */ DISABLE_INTERRUPTS(CLBR_NONE) - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace -nmi_swapgs: + movq %rbx, %r12 /* nmi_userspace uses %ebx */ + testl $3, CS(%rsp) + jnz nmi_userspace +nmi_kernel: + movq %r12, %rbx /* restore after nmi_userspace */ +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz nmi_exit_no_switch + SWITCH_USER_CR3 +nmi_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? 
*/ + jnz nmi_exit_no_swapgs SWAPGS_UNSAFE_STACK -nmi_restore: +nmi_exit_no_swapgs: RESTORE_ALL 8 - jmp irq_return + jmp irq_return + nmi_userspace: GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%ebx andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs + jz nmi_kernel movq %rsp,%rdi /* &pt_regs */ call sync_regs movq %rax,%rsp /* switch stack for scheduling */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 94d857fb1033..14cd73b0e634 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> +#include <asm/kaiser.h> /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -129,6 +130,14 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (IS_ENABLED(CONFIG_KAISER)) + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0f8ebf78253a..6e697ac3fb54 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -338,6 +338,27 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ ENTRY(name) +#ifdef CONFIG_KAISER +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -353,13 +374,14 @@ ENTRY(name) * 0xffffffff80000000 to physical address 0x000000. (always using * 2Mbyte large pages provided by PAE mode) */ -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -385,6 +407,7 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level2_kernel_pgt) /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 43e9ccf44947..f00e6e734fbd 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -38,5 +38,5 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
*/ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index e328f691eeef..990f743e21b8 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -85,7 +85,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = -1, }; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1dd32307a494..836a4c2d5ceb 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/kaiser.h> #include <asm/system.h> #include <asm/ldt.h> @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); new_ldt->size = size; + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } return new_ldt; } @@ -97,12 +114,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - kfree(ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 557eb3757edb..d2ce2a33d15b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,7 +57,7 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(unsigned long, old_rsp); +DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index cf2a84031dfd..c9a00a5e0b87 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_MEMTEST) += memtest.o +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..79b0222ffa74 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,382 @@ +#include <linux/bug.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +extern struct mm_struct init_mm; + +#include <asm/kaiser.h> +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> + +#ifdef CONFIG_KAISER 
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. + * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. + */ +unsigned long x86_cr3_pcid_noflush __read_mostly; +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +/* + * Returns -1 on error. + */ +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; + } + + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); +} + +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. 
+ */ +static pte_t *kaiser_pagetable_walk(unsigned long address) +{ + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } + + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } + + return pte_offset_kernel(pmd, address); +} + +int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address); + if (!pte) { + ret = -ENOMEM; + break; + } + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); +} + +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. + */ +static void __init kaiser_init_all_pgds(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. 
+ */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); + } +} + +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ +void __init kaiser_init(void) +{ + int cpu; + + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); + } + + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. + */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + kaiser_add_user_map_early(&x86_cr3_pcid_noflush, + sizeof(x86_cr3_pcid_noflush), + __PAGE_KERNEL); +} + +/* Add a mapping to the shadow mapping, and synchronize the mappings */ +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + return kaiser_add_user_map((const void *)addr, size, flags); +} + +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + unsigned long end = start + size; + unsigned long addr; + pte_t *pte; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + pte = kaiser_pagetable_walk(addr); + if (pte) + set_pte(pte, __pte(0)); + } +} + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). 
+ */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} + +void kaiser_setup_pcid(void) +{ + unsigned long kern_cr3 = 0; + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) { + kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + } + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + x86_cr3_pcid_noflush = kern_cr3; + this_cpu_write(x86_cr3_pcid_user, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: + * if cpu does not, then the NOFLUSH bit will never have been set. + */ +void kaiser_flush_tlb_on_return_to_user(void) +{ + this_cpu_write(x86_cr3_pcid_user, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); +} +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83a63d0..73285602c93f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -5,7 +5,7 @@ #include <asm/tlb.h> #include <asm/fixmap.h> -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -253,12 +253,35 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) } } +#ifdef CONFIG_KAISER +/* + * Instead of one pmd, we aquire two pmds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + +static inline pgd_t *_pgd_alloc(void) +{ + /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ + return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, + PGD_ALLOCATION_ORDER); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + pgd = _pgd_alloc(); if (pgd == NULL) goto out; @@ -288,7 +311,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) out_free_pmds: free_pmds(pmds); out_free_pgd: - free_page((unsigned long)pgd); + _pgd_free(pgd); out: return NULL; } @@ -298,7 +321,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); - free_page((unsigned long)pgd); + _pgd_free(pgd); } int ptep_set_access_flags(struct vm_area_struct *vma, diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4f5ca8f04c0d..4078e3092b16 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -12,10 +12,43 @@ #include <asm/cache.h> #include <asm/apic.h> #include <asm/uv/uv.h> +#include <asm/kaiser.h> DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; +static void load_new_mm_cr3(pgd_t *pgdir) +{ + unsigned long new_mm_cr3 = __pa(pgdir); + +#ifdef CONFIG_KAISER + if (this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. + * Flush KERN below, flush USER when returning to userspace in + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. 
+ * + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + * + * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; + * but keep that line in there in case something changes. + */ + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +#endif /* CONFIG_KAISER */ + + /* + * Caution: many callers of this function expect + * that load_new_mm_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -65,7 +98,7 @@ void leave_mm(int cpu) BUG(); cpumask_clear_cpu(cpu, mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -113,11 +146,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * from next->pgd. TLB fills are special and can happen * due to instruction fetches or for no reason at all, * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * + * Fortunately, load_new_mm_cr3() is serializing + * and gives the ordering guarantee we need. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); /* stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); @@ -136,10 +168,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. + * As above, load_new_mm_cr3() is serializing and orders + * TLB fills with respect to the mm_cpumask write. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); load_mm_ldt(next); } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b5e2e4c6b017..01c8155dd613 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -692,7 +692,14 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h new file mode 100644 index 000000000000..4a4d6d911a14 --- /dev/null +++ b/include/linux/kaiser.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KAISER_H +#define _LINUX_KAISER_H + +#ifdef CONFIG_KAISER +#include <asm/kaiser.h> + +static inline int kaiser_map_thread_stack(void *stack) +{ + /* + * Map that page of kernel stack on which we enter from user context. + */ + return kaiser_add_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); +} + +static inline void kaiser_unmap_thread_stack(void *stack) +{ + /* + * Note: may be called even when kaiser_map_thread_stack() failed. 
+ */ + kaiser_remove_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); +} +#else + +/* + * These stubs are used whenever CONFIG_KAISER is off, which + * includes architectures that support KAISER, but have it disabled. + */ + +static inline void kaiser_init(void) +{ +} +static inline int kaiser_add_mapping(unsigned long addr, + unsigned long size, unsigned long flags) +{ + return 0; +} +static inline void kaiser_remove_mapping(unsigned long start, + unsigned long size) +{ +} +static inline int kaiser_map_thread_stack(void *stack) +{ + return 0; +} +static inline void kaiser_unmap_thread_stack(void *stack) +{ +} + +#endif /* !CONFIG_KAISER */ +#endif /* _LINUX_KAISER_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 25842b6e72e1..a0b4422a116a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -95,8 +95,9 @@ enum zone_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ - NR_KERNEL_STACK, /* Second 128 byte cacheline */ + NR_KERNEL_STACK, + NR_KAISERTABLE, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 27ef6b190ea6..56f5eeb78d1d 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -28,6 +28,12 @@ (void)__vpp_verify; \ } while (0) +#ifdef CONFIG_KAISER +#define USER_MAPPED_SECTION "..user_mapped" +#else +#define USER_MAPPED_SECTION "" +#endif + /* * s390 and alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external @@ -90,6 +96,12 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + /* * Declaration/definition used for per-CPU variables that must come first in * the set of variables. @@ -119,6 +131,14 @@ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + #define DECLARE_PER_CPU_ALIGNED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ ____cacheline_aligned @@ -137,11 +157,21 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ __aligned(PAGE_SIZE) +/* + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) /* * Declaration/definition used for per-CPU variables that must be read mostly. 
*/ -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ DECLARE_PER_CPU_SECTION(type, name, "..readmostly") #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ diff --git a/init/main.c b/init/main.c index e937d9bda0f8..558a9fdd566d 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/random.h> +#include <linux/kaiser.h> #include <asm/io.h> #include <asm/bugs.h> @@ -463,6 +464,7 @@ static void __init mm_init(void) percpu_init_late(); pgtable_cache_init(); vmalloc_init(); + kaiser_init(); } asmlinkage void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index 29b460431c12..511131a15a75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> +#include <linux/kaiser.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> @@ -133,6 +134,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { + kaiser_unmap_thread_stack(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } #endif @@ -275,6 +277,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; + err = kaiser_map_thread_stack(tsk->stack); + if (err) + goto out; + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); diff --git a/mm/vmstat.c b/mm/vmstat.c index ff9060919c4b..eaf3db038652 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -699,6 +699,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", + "nr_overhead", "nr_unstable", "nr_bounce", "nr_vmscan_write", diff --git a/security/Kconfig b/security/Kconfig index 51bd5a0b69ae..19f83193e7ab 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -96,6 +96,16 @@ config SECURITY If you are unsure how to answer this question, answer N. +config KAISER + bool "Remove the kernel mapping in user mode" + default y + depends on X86_64 && SMP && !PARAVIRT + help + This enforces a strict kernel and user space isolation, in order + to close hardware side channels on kernel address information. + + If you are unsure how to answer this question, answer Y. + config SECURITYFS bool "Enable the securityfs filesystem" help |
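A usage note on the new interface: runtime allocations that must be
visible on the user CR3 follow the pattern the patch itself uses in
arch/x86/kernel/cpu/perf_event_intel_ds.c - allocate, map with
kaiser_add_mapping(), and unmap with kaiser_remove_mapping() before
freeing. A condensed sketch (the helper names here are made up; this
assumes CONFIG_KAISER=y):

	static void *user_mapped_alloc(size_t size, gfp_t flags, int node)
	{
		unsigned int order = get_order(size);
		struct page *page;
		unsigned long addr;

		page = alloc_pages_node(node, flags | __GFP_ZERO, order);
		if (!page)
			return NULL;
		addr = (unsigned long)page_address(page);
		/* make the buffer visible in the shadow (user) page tables */
		if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
			__free_pages(page, order);
			return NULL;
		}
		return (void *)addr;
	}

	static void user_mapped_free(const void *buffer, size_t size)
	{
		if (!buffer)
			return;
		/* drop the shadow mapping before returning the pages */
		kaiser_remove_mapping((unsigned long)buffer, size);
		free_pages((unsigned long)buffer, get_order(size));
	}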