From 5aac644a9944bea93b4f05ced1883a902a2535f6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 3 Jun 2015 10:39:46 +0300 Subject: x86/tsc: Let high latency PIT fail fast in quick_pit_calibrate() If it takes longer than 12us to read the PIT counter lsb/msb, then the error margin will never fall below 500ppm within 50ms, and Fast TSC calibration will always fail. This patch detects when that will happen and fails fast. Note the failure message is not printed in that case because: 1. it will always happen on that class of hardware 2. the absence of the message is more informative than its presence Signed-off-by: Adrian Hunter Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Len Brown Cc: Andi Kleen Link: http://lkml.kernel.org/r/556EB717.9070607@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 505449700e0c..7437b41f6a47 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void) if (!pit_expect_msb(0xff-i, &delta, &d2)) break; + delta -= tsc; + + /* + * Extrapolate the error and fail fast if the error will + * never be below 500 ppm. + */ + if (i == 1 && + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) + return 0; + /* * Iterate until the error is less than 500 ppm */ - delta -= tsc; if (d1+d2 >= delta >> 11) continue; -- cgit v1.2.3 From d0f77d4d04b222a817925d33ba3589b190bfa863 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:33 +0300 Subject: x86/init: Clear 'init_level4_pgt' earlier Currently x86_64_start_kernel() has two KASAN related function calls. The first call maps shadow to early_level4_pgt, the second maps shadow to init_level4_pgt. If we move clear_page(init_level4_pgt) earlier, we could hide KASAN low level detail from generic x86_64 initialization code. The next patch will do it. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-2-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5a4668136e98..65c8985e3eb2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -166,6 +166,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + clear_page(init_level4_pgt); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -177,7 +179,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - clear_page(init_level4_pgt); /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; -- cgit v1.2.3 From 5d5aa3cfca5cf74cd928daf3674642e6004328d1 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Thu, 2 Jul 2015 12:09:34 +0300 Subject: x86/kasan: Fix KASAN shadow region page tables Currently KASAN shadow region page tables created without respect of physical offset (phys_base). This causes kernel halt when phys_base is not zero. So let's initialize KASAN shadow region page tables in kasan_early_init() using __pa_nodebug() which considers phys_base. This patch also separates x86_64_start_kernel() from KASAN low level details by moving kasan_map_early_shadow(init_level4_pgt) into kasan_early_init(). Remove the comment before clear_bss() which stopped bringing much profit to the code readability. Otherwise describing all the new order dependencies would be too verbose. Signed-off-by: Alexander Popov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-3-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kasan.h | 8 ++------ arch/x86/kernel/head64.c | 7 ++----- arch/x86/kernel/head_64.S | 29 ----------------------------- arch/x86/mm/kasan_init_64.c | 36 ++++++++++++++++++++++++++++++++++-- 4 files changed, 38 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 8b22422fbad8..74a2a8dc9908 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -14,15 +14,11 @@ #ifndef __ASSEMBLY__ -extern pte_t kasan_zero_pte[]; -extern pte_t kasan_zero_pmd[]; -extern pte_t kasan_zero_pud[]; - #ifdef CONFIG_KASAN -void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_early_init(void); void __init kasan_init(void); #else -static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_early_init(void) { } static inline void kasan_init(void) { } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 65c8985e3eb2..f129a9af6357 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -161,13 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* Kill off the identity-map trampoline */ reset_early_page_tables(); - kasan_map_early_shadow(early_level4_pgt); - - /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); clear_page(init_level4_pgt); + kasan_early_init(); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -182,8 +181,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; - kasan_map_early_shadow(init_level4_pgt); - x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e5c27f729a38..1d40ca8a73f2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -516,38 +516,9 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 -#ifdef CONFIG_KASAN -#define FILL(VAL, COUNT) \ - .rept (COUNT) ; \ - .quad (VAL) ; \ - .endr - -NEXT_PAGE(kasan_zero_pte) - FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pmd) - FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pud) - FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) - -#undef FILL -#endif - - #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE -#ifdef CONFIG_KASAN -/* - * This page used as early shadow. We don't use empty_zero_page - * at early stages, stack instrumentation could write some garbage - * to this page. - * Latter we reuse it as zero shadow for large ranges of memory - * that allowed to access, but not instrumented by kasan - * (vmalloc/vmemmap ...). - */ -NEXT_PAGE(kasan_zero_page) - .skip PAGE_SIZE -#endif diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4860906c6b9f..0e4a05fa34d7 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,7 +11,19 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; -extern unsigned char kasan_zero_page[PAGE_SIZE]; +static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; static int __init map_range(struct range *range) { @@ -36,7 +48,7 @@ static void __init clear_pgds(unsigned long start, pgd_clear(pgd_offset_k(start)); } -void __init kasan_map_early_shadow(pgd_t *pgd) +static void __init kasan_map_early_shadow(pgd_t *pgd) { int i; unsigned long start = KASAN_SHADOW_START; @@ -166,6 +178,26 @@ static struct notifier_block kasan_die_notifier = { }; #endif +void __init kasan_early_init(void) +{ + int i; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + + for (i = 0; i < PTRS_PER_PTE; i++) + kasan_zero_pte[i] = __pte(pte_val); + + for (i = 0; i < PTRS_PER_PMD; i++) + kasan_zero_pmd[i] = __pmd(pmd_val); + + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + + kasan_map_early_shadow(early_level4_pgt); + kasan_map_early_shadow(init_level4_pgt); +} + void __init kasan_init(void) { int i; -- cgit v1.2.3 From 241d2c54c62fa0939fc9a9512b48ac3434e90a89 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:35 +0300 Subject: x86/kasan: Flush TLBs after switching CR3 load_cr3() doesn't cause tlb_flush if PGE enabled. This may cause tons of false positive reports spamming the kernel to death. To fix this __flush_tlb_all() should be called explicitly after CR3 changed. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-4-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0e4a05fa34d7..5d26642e4e8a 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -208,6 +208,7 @@ void __init kasan_init(void) memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); load_cr3(early_level4_pgt); + __flush_tlb_all(); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -234,5 +235,6 @@ void __init kasan_init(void) memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + __flush_tlb_all(); init_task.kasan_depth = 0; } -- cgit v1.2.3 From d4f86beacc21d538dc41e1fc75a22e084f547edf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:36 +0300 Subject: x86/kasan: Fix boot crash on AMD processors While populating zero shadow wrong bits in upper level page tables used. __PAGE_KERNEL_RO that was used for pgd/pud/pmd has _PAGE_BIT_GLOBAL set. Global bit is present only in the lowest level of the page translation hierarchy (ptes), and it should be zero in upper levels. This bug seems doesn't cause any troubles on Intel cpus, while on AMDs it cause kernel crash on boot. Use _KERNPG_TABLE bits for pgds/puds/pmds to fix this. Reported-by: Borislav Petkov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-5-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 5d26642e4e8a..9a54dbe98064 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -85,7 +85,7 @@ static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { WARN_ON(!pmd_none(*pmd)); set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PMD_SIZE; pmd = pmd_offset(pud, addr); } @@ -111,7 +111,7 @@ static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { WARN_ON(!pud_none(*pud)); set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PUD_SIZE; pud = pud_offset(pgd, addr); } @@ -136,7 +136,7 @@ static int __init zero_pgd_populate(unsigned long addr, unsigned long end) while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { WARN_ON(!pgd_none(*pgd)); set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PGDIR_SIZE; pgd = pgd_offset_k(addr); } -- cgit v1.2.3 From 8515522949951d81fe2d06c0a3292f171f2b8ec4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:37 +0300 Subject: x86/kasan: Add message about KASAN being initialized Print informational message to tell user that kernel runs with KASAN enabled. Add a "kasan: " prefix to all messages in kasan_init_64.c. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-6-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9a54dbe98064..e1840f3db5b5 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define pr_fmt(fmt) "kasan: " fmt #include #include #include @@ -237,4 +238,6 @@ void __init kasan_init(void) load_cr3(init_level4_pgt); __flush_tlb_all(); init_task.kasan_depth = 0; + + pr_info("Kernel address sanitizer initialized\n"); } -- cgit v1.2.3 From d6f2d75a7ae06ffd793bb504c4f0d1665548cffc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:38 +0300 Subject: x86/kasan: Move KASAN_SHADOW_OFFSET to the arch Kconfig KASAN_SHADOW_OFFSET is purely arch specific setting, so it should be in arch's Kconfig file. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Paul Bolle Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-7-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55bced17dc95..0b2929f49531 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -254,6 +254,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0xdffffc0000000000 + config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI -- cgit v1.2.3 From 1db875631f8d5bbf497f67e47f254eece888d51d Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:18 +0800 Subject: x86/espfix: Add 'cpu' parameter to init_espfix_ap() Add a CPU index parameter to init_espfix_ap(), so that the parameter could be propagated to the function for espfix page allocation. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cde3fcf1b3211f3f03feb1a995bce3fee850f0fc.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/espfix.h | 2 +- arch/x86/kernel/espfix_64.c | 7 +++---- arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 99efebb2f69d..ca3ce9ab9385 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); +extern void init_espfix_ap(int cpu); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index f5d0730e7b08..1fb0e0003c94 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -131,12 +131,12 @@ void __init init_espfix_bsp(void) init_espfix_random(); /* The rest is the same as for any other processor */ - init_espfix_ap(); + init_espfix_ap(0); } -void init_espfix_ap(void) +void init_espfix_ap(int cpu) { - unsigned int cpu, page; + unsigned int page; unsigned long addr; pud_t pud, *pud_p; pmd_t pmd, *pmd_p; @@ -149,7 +149,6 @@ void init_espfix_ap(void) if (likely(this_cpu_read(espfix_stack))) return; /* Already initialized */ - cpu = smp_processor_id(); addr = espfix_base_addr(cpu); page = cpu/ESPFIX_STACKS_PER_PAGE; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8add66b22f33..6f5abac6c28b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,7 +242,7 @@ static void notrace start_secondary(void *unused) * Enable the espfix hack for this CPU */ #ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(); + init_espfix_ap(smp_processor_id()); #endif /* -- cgit v1.2.3 From 20d5e4a9cd453991e2590a4c25230a99b42dee0c Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:19 +0800 Subject: x86/espfix: Init espfix on the boot CPU side As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called before we enable local irqs, so the lockdep sub-system would (correctly) trigger a warning about the potentially blocking API. So we allocate them on the boot CPU side when the secondary CPU is brought up by the boot CPU, and hand them over to the secondary CPU. And we use alloc_pages_node() with the secondary CPU's node, to make sure the espfix stack is NUMA-local to the CPU that is going to use it. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c97add2670e9abebb90095369f0cfc172373ac94.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/espfix_64.c | 21 +++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 1fb0e0003c94..ce95676abd60 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -141,12 +141,12 @@ void init_espfix_ap(int cpu) pud_t pud, *pud_p; pmd_t pmd, *pmd_p; pte_t pte, *pte_p; - int n; + int n, node; void *stack_page; pteval_t ptemask; /* We only have to do this once... */ - if (likely(this_cpu_read(espfix_stack))) + if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ addr = espfix_base_addr(cpu); @@ -164,12 +164,15 @@ void init_espfix_ap(int cpu) if (stack_page) goto unlock_done; + node = cpu_to_node(cpu); ptemask = __supported_pte_mask; pud_p = &espfix_pud_page[pud_index(addr)]; pud = *pud_p; if (!pud_present(pud)) { - pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -179,7 +182,9 @@ void init_espfix_ap(int cpu) pmd_p = pmd_offset(&pud, addr); pmd = *pmd_p; if (!pmd_present(pmd)) { - pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -187,7 +192,7 @@ void init_espfix_ap(int cpu) } pte_p = pte_offset_kernel(&pmd, addr); - stack_page = (void *)__get_free_page(GFP_KERNEL); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); @@ -198,7 +203,7 @@ void init_espfix_ap(int cpu) unlock_done: mutex_unlock(&espfix_init_mutex); done: - this_cpu_write(espfix_stack, addr); - this_cpu_write(espfix_waddr, (unsigned long)stack_page - + (addr & ~PAGE_MASK)); + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6f5abac6c28b..0bd8c1d507b3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -238,13 +238,6 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); - /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(smp_processor_id()); -#endif - /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding @@ -854,6 +847,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(cpu); +#endif + /* So we see what's up */ announce_cpu(cpu, apicid); -- cgit v1.2.3 From 827a82ff399523a954253dfea401af748640f0f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jul 2015 10:14:34 -0400 Subject: x86/earlyprintk: Allow early_printk() to use console style parameters like '115200n8' When I enable early_printk on a kernel, I cut and paste the console= input and add to earlyprintk parameter. But I notice recently that ktest has not been detecting triple faults. The way it detects it, is by seeing the kernel banner "Linux version .." with a different kernel version pop up. Then I noticed that early printk was no longer working on my console, which was why ktest was not seeing it. I bisected it down and it was added to 4.0 with this commit: ea9e9d802902 ("Specify PCI based UART for earlyprintk") because it converted the simple_strtoul() that converts the baud number into a kstrtoul(). The problem with this is, I had as my baud rate, 115200n8 (acceptable for console=ttyS0), but because of the "n8", the kstrtoul() doesn't parse the baud rate and returns an error, which sets the baud rate to the default 9600. This explains the garbage on my screen. Now, earlyprintk= kernel parameter does not say it accepts that format. Thus, one answer would simply be me changing my kernel parameters to remove the "n8" since it isn't parsed anyway. But I wonder if other people run into this, and it seems strange that the two consoles for serial accepts different input. I could also extend this to have earlyprintk do something with that "n8" or whatever it has and have it match the console parsing (which, BTW, still uses simple_strtoul(), as I guess it has to). This patch just makes my old kernel parameter parsing work like it use to. Although, simple_strtoull() is considered obsolete, it is the only standard string parsing function that parses a number that is attached to text. Ironically, commit ea9e9d802902 also added several calls to simple_strtoul()! Signed-off-by: Steven Rostedt Cc: Alan Cox Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Brian Gerst Cc: David Cohen Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stuart R. Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150706101434.5f6a351b@gandalf.local.home [ Cleaned it up a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 89427d8d4fc5..eec40f595ab9 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -175,7 +175,9 @@ static __init void early_serial_init(char *s) } if (*s) { - if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) baud = DEFAULT_BAUD; } -- cgit v1.2.3 From 5a3f75e3f02836518ce49536e9c460ca8e1fa290 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:32 +0000 Subject: x86/irq: Plug irq vector hotplug race Jin debugged a nasty cpu hotplug race which results in leaking a irq vector on the newly hotplugged cpu. cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs setup_vector_irq arch_teardown_msi_irq __setup_vector_irq native_teardown_msi_irq lock(vector_lock) destroy_irq install vectors unlock(vector_lock) lock(vector_lock) ---> __clear_irq_vector unlock(vector_lock) lock(vector_lock) set_cpu_online unlock(vector_lock) This leaves the irq vector(s) which are torn down on CPU M stale in the vector array of CPU N, because CPU M does not see CPU N online yet. There is a similar issue with concurrent newly setup interrupts. The alloc/free protection of irq descriptors does not prevent the above race, because it merily prevents interrupt descriptors from going away or changing concurrently. Prevent this by moving the call to setup_vector_irq() into the vector_lock held region which protects set_cpu_online(): cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs lock(vector_lock) arch_teardown_msi_irq setup_vector_irq() __setup_vector_irq native_teardown_msi_irq install vectors destroy_irq set_cpu_online unlock(vector_lock) lock(vector_lock) __clear_irq_vector unlock(vector_lock) So cpu M either sees the cpu N online before clearing the vector or cpu N installs the vectors after cpu M has cleared it. Reported-by: xiao jin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.141898931@linutronix.de --- arch/x86/kernel/apic/vector.c | 10 ++-------- arch/x86/kernel/smpboot.c | 13 +++++-------- 2 files changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 28eba2d38b15..f813261d9740 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu) int irq, vector; struct apic_chip_data *data; - /* - * vector_lock will make sure that we don't run into irq vector - * assignments that might be happening on another cpu in parallel, - * while we setup our initial vector to irq mappings. - */ - raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { data = apic_chip_data(irq_get_irq_data(irq)); @@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } - raw_spin_unlock(&vector_lock); } /* - * Setup the vector to irq mappings. + * Setup the vector to irq mappings. Must be called with vector_lock held. */ void setup_vector_irq(int cpu) { int irq; + lockdep_assert_held(&vector_lock); /* * On most of the platforms, legacy PIC delivers the interrupts on the * boot cpu. But there are certain platforms where PIC interrupts are diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0bd8c1d507b3..d3010aa79daf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -170,11 +170,6 @@ static void smp_callin(void) */ apic_ap_setup(); - /* - * Need to setup vector mappings before we enable interrupts. - */ - setup_vector_irq(smp_processor_id()); - /* * Save our processor parameters. Note: this information * is needed for clock calibration. @@ -239,11 +234,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold vector_lock so there the set of online cpus - * does not change while we are assigning vectors to cpus. Holding - * this lock ensures we don't half assign or remove an irq from a cpu. + * Lock vector_lock and initialize the vectors on this cpu + * before setting the cpu online. We must set it online with + * vector_lock held to prevent a concurrent setup/teardown + * from seeing a half valid vector space. */ lock_vector_lock(); + setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); -- cgit v1.2.3 From cbb24dc761d95fe39a7a122bb1b298e9604cae15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:33 +0000 Subject: x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable() It's unsafe to examine fields in the irq descriptor w/o holding the descriptor lock. Add proper locking. While at it add a comment why the vector check can run lock less Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.236544164@linutronix.de --- arch/x86/kernel/irq.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 88b366487b0e..85ca76e6241c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -347,14 +347,22 @@ int check_irq_vectors_for_cpu_disable(void) if (!desc) continue; + /* + * Protect against concurrent action removal, + * affinity changes etc. + */ + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) + if (!irq_has_action(irq) || irqd_is_per_cpu(data)) { + raw_spin_unlock(&desc->lock); continue; + } + raw_spin_unlock(&desc->lock); /* * A single irq may be mapped to multiple * cpu's vector_irq[] (for example IOAPIC cluster @@ -385,6 +393,9 @@ int check_irq_vectors_for_cpu_disable(void) * vector. If the vector is marked in the used vectors * bitmap or an irq is assigned to it, we don't count * it as available. + * + * As this is an inaccurate snapshot anyway, we can do + * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { -- cgit v1.2.3 From 09cf92b784fae6109450c5d64f9908066d605249 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:35 +0000 Subject: x86/irq: Retrieve irq data after locking irq_desc irq_data is protected by irq_desc->lock, so retrieving the irq chip from irq_data outside the lock is racy vs. an concurrent update. Move it into the lock held region. While at it add a comment why the vector walk does not require vector_lock. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.331320612@linutronix.de --- arch/x86/kernel/irq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 85ca76e6241c..c7dfe1be784e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -497,6 +497,11 @@ void fixup_irqs(void) */ mdelay(1); + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; @@ -508,9 +513,9 @@ void fixup_irqs(void) irq = __this_cpu_read(vector_irq[vector]); desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); - raw_spin_lock(&desc->lock); if (chip->irq_retrigger) { chip->irq_retrigger(data); __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); -- cgit v1.2.3 From 69711ca19b06d1b33d8f21213b540b5d1c638dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Hinderer?= Date: Wed, 8 Jul 2015 00:02:01 +0200 Subject: x86/kconfig: Fix typo in the CONFIG_CMDLINE_BOOL help text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sébastien Hinderer Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Samuel Thibault Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0b2929f49531..3dbb7e7909ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2020,7 +2020,7 @@ config CMDLINE_BOOL To compile command line arguments into the kernel, set this option to 'Y', then fill in the - the boot arguments in CONFIG_CMDLINE. + boot arguments in CONFIG_CMDLINE. Systems with fully functional boot loaders (i.e. non-embedded) should leave this option set to 'N'. -- cgit v1.2.3 From 02941007f59ce015233d4c0f7047776960bf0c17 Mon Sep 17 00:00:00 2001 From: "qipeng.zha" Date: Thu, 9 Jul 2015 00:14:15 +0800 Subject: intel_pmc_ipc: Update kerneldoc formatting Update kerneldoc formatting per Documentation/kernel-dec-nano-HOWTO.txt. Signed-off-by: qipeng.zha Signed-off-by: Darren Hart --- arch/x86/include/asm/intel_pmc_ipc.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index 200ec2e7821d..cd0310e186f4 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -25,36 +25,9 @@ #if IS_ENABLED(CONFIG_INTEL_PMC_IPC) -/* - * intel_pmc_ipc_simple_command - * @cmd: command - * @sub: sub type - */ int intel_pmc_ipc_simple_command(int cmd, int sub); - -/* - * intel_pmc_ipc_raw_cmd - * @cmd: command - * @sub: sub type - * @in: input data - * @inlen: input length in bytes - * @out: output data - * @outlen: output length in dwords - * @sptr: data writing to SPTR register - * @dptr: data writing to DPTR register - */ int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen, u32 dptr, u32 sptr); - -/* - * intel_pmc_ipc_command - * @cmd: command - * @sub: sub type - * @in: input data - * @inlen: input length in bytes - * @out: output data - * @outlen: output length in dwords - */ int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, u32 *out, u32 outlen); -- cgit v1.2.3 From a833581e372a4adae2319d8dc379493edbc444e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jul 2015 19:23:38 +0200 Subject: x86, perf: Fix static_key bug in load_mm_cr4() Mikulas reported his K6-3 not booting. This is because the static_key API confusion struck and bit Andy, this wants to be static_key_false(). Reported-by: Mikulas Patocka Tested-by: Mikulas Patocka Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Valdis Kletnieks Cc: Vince Weaver Cc: hillf.zj Fixes: a66734297f78 ("perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks") Link: http://lkml.kernel.org/r/20150709172338.GC19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5e8daee7c5c9..804a3a6030ca 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -23,7 +23,7 @@ extern struct static_key rdpmc_always_available; static inline void load_mm_cr4(struct mm_struct *mm) { - if (static_key_true(&rdpmc_always_available) || + if (static_key_false(&rdpmc_always_available) || atomic_read(&mm->context.perf_rdpmc_allowed)) cr4_set_bits(X86_CR4_PCE); else -- cgit v1.2.3 From 5d75a747596be046546eeb9d6ba39a3af851a1af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jul 2015 12:17:36 +0200 Subject: x86: hyperv: add CPUID bit for crash handlers Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 8fba544e9cc4..f36d56bd7632 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -108,6 +108,8 @@ #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) /* Support for a virtual guest idle state is available */ #define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) +/* Guest crash data handler available */ +#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) /* * Implementation recommendations. Indicates which behaviors the hypervisor -- cgit v1.2.3 From d1fe9219551e914f26219afaca1063b280f25963 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jul 2015 15:03:18 +0200 Subject: KVM: x86: reintroduce kvm_is_mmio_pfn The call to get_mt_mask was really using kvm_is_reserved_pfn to detect an MMIO-backed page. In this case, we want "false" to be returned for the zero page. Reintroduce a separate kvm_is_mmio_pfn predicate for this use only. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f807496b62c2..44171462bd2a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2479,6 +2479,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } +static bool kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); + + return true; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, @@ -2506,7 +2514,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_reserved_pfn(pfn)); + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; -- cgit v1.2.3 From 370777daab3f024f1645177039955088e2e9ae73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 3 Jul 2015 15:49:28 +0200 Subject: KVM: VMX: fix vmwrite to invalid VMCS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fpu_activate is called outside of vcpu_load(), which means it should not touch VMCS, but fpu_activate needs to. Avoid the call by moving it to a point where we know that the guest needs eager FPU and VMCS is loaded. This will get rid of the following trace vmwrite error: reg 6800 value 0 (err 1) [] dump_stack+0x19/0x1b [] vmwrite_error+0x2c/0x2e [kvm_intel] [] vmcs_writel+0x1f/0x30 [kvm_intel] [] vmx_fpu_activate.part.61+0x45/0xb0 [kvm_intel] [] vmx_fpu_activate+0x15/0x20 [kvm_intel] [] kvm_arch_vcpu_create+0x51/0x70 [kvm] [] kvm_vm_ioctl+0x1c1/0x760 [kvm] [] ? handle_mm_fault+0x49a/0xec0 [] do_vfs_ioctl+0x2e5/0x4c0 [] ? file_has_perm+0xae/0xc0 [] SyS_ioctl+0xa1/0xc0 [] system_call_fastpath+0x16/0x1b (Note: we also unconditionally activate FPU in vmx_vcpu_reset(), so the removed code added nothing.) Fixes: c447e76b4cab ("kvm/fpu: Enable eager restore kvm FPU for MPX") Cc: Reported-by: Vlastimil Holer Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/x86.c | 5 ----- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 64dd46793099..2fbea2544f24 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -98,6 +98,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); + if (vcpu->arch.eager_fpu) + kvm_x86_ops->fpu_activate(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbaf44e8f0d3..6bd19c7abc65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7315,11 +7315,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu = kvm_x86_ops->vcpu_create(kvm, id); - /* - * Activate fpu unconditionally in case the guest needs eager FPU. It will be - * deactivated soon if it doesn't. - */ - kvm_x86_ops->fpu_activate(vcpu); return vcpu; } -- cgit v1.2.3 From 5544eb9b81940647b8fad1f251b37cbe2819ce44 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jul 2015 15:41:58 +0200 Subject: KVM: count number of assigned devices If there are no assigned devices, the guest PAT are not providing any useful information and can be overridden to writeback; VMX always does this because it has the "IPAT" bit in its extended page table entries, but SVM does not have anything similar. Hook into VFIO and legacy device assignment so that they provide this information to KVM. Reviewed-by: Alex Williamson Tested-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/iommu.c | 2 ++ arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2a7f5d782c33..49ec9038ec14 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -604,6 +604,8 @@ struct kvm_arch { bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; +#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE + atomic_t assigned_device_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 7dbced309ddb..5c520ebf6343 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) goto out_unmap; } + kvm_arch_start_assignment(kvm); pci_set_dev_assigned(pdev); dev_info(&pdev->dev, "kvm assign device\n"); @@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) iommu_detach_device(domain, &pdev->dev); pci_clear_dev_assigned(pdev); + kvm_arch_end_assignment(kvm); dev_info(&pdev->dev, "kvm deassign device\n"); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6bd19c7abc65..0024968b342d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8213,6 +8213,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_start_assignment(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); + +void kvm_arch_end_assignment(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); + +bool kvm_arch_has_assigned_device(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); + void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { atomic_inc(&kvm->arch.noncoherent_dma_count); -- cgit v1.2.3 From 3c2e7f7de3240216042b61073803b61b9b3cfb22 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jul 2015 14:32:17 +0200 Subject: KVM: SVM: use NPT page attributes Right now, NPT page attributes are not used, and the final page attribute depends solely on gPAT (which however is not synced correctly), the guest MTRRs and the guest page attributes. However, we can do better by mimicking what is done for VMX. In the absence of PCI passthrough, the guest PAT can be ignored and the page attributes can be just WB. If passthrough is being used, instead, keep respecting the guest PAT, and emulate the guest MTRRs through the PAT field of the nested page tables. The only snag is that WP memory cannot be emulated correctly, because Linux's default PAT setting only includes the other types. Tested-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 96 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 602b974a60a6..414ec25b673e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -865,6 +865,64 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +#define MTRR_TYPE_UC_MINUS 7 +#define MTRR2PROTVAL_INVALID 0xff + +static u8 mtrr2protval[8]; + +static u8 fallback_mtrr_type(int mtrr) +{ + /* + * WT and WP aren't always available in the host PAT. Treat + * them as UC and UC- respectively. Everything else should be + * there. + */ + switch (mtrr) + { + case MTRR_TYPE_WRTHROUGH: + return MTRR_TYPE_UNCACHABLE; + case MTRR_TYPE_WRPROT: + return MTRR_TYPE_UC_MINUS; + default: + BUG(); + } +} + +static void build_mtrr2protval(void) +{ + int i; + u64 pat; + + for (i = 0; i < 8; i++) + mtrr2protval[i] = MTRR2PROTVAL_INVALID; + + /* Ignore the invalid MTRR types. */ + mtrr2protval[2] = 0; + mtrr2protval[3] = 0; + + /* + * Use host PAT value to figure out the mapping from guest MTRR + * values to nested page table PAT/PCD/PWT values. We do not + * want to change the host PAT value every time we enter the + * guest. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + for (i = 0; i < 8; i++) { + u8 mtrr = pat >> (8 * i); + + if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID) + mtrr2protval[mtrr] = __cm_idx2pte(i); + } + + for (i = 0; i < 8; i++) { + if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) { + u8 fallback = fallback_mtrr_type(i); + mtrr2protval[i] = mtrr2protval[fallback]; + BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID); + } + } +} + static __init int svm_hardware_setup(void) { int cpu; @@ -931,6 +989,7 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + build_mtrr2protval(); return 0; err: @@ -1085,6 +1144,42 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } +static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + /* Unlike Intel, AMD takes the guest's CR0.CD into account. + * + * AMD does not have IPAT. To emulate it for the case of guests + * with no assigned devices, just set everything to WB. If guests + * have assigned devices, however, we cannot force WB for RAM + * pages only, so use the guest PAT directly. + */ + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + *g_pat = 0x0606060606060606; + else + *g_pat = vcpu->arch.pat; +} + +static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + u8 mtrr; + + /* + * 1. MMIO: always map as UC + * 2. No passthrough: always map as WB, and force guest PAT to WB as well + * 3. Passthrough: can't guarantee the result, try to trust guest. + */ + if (is_mmio) + return _PAGE_NOCACHE; + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; + + mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return mtrr2protval[mtrr]; +} + static void init_vmcb(struct vcpu_svm *svm, bool init_event) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1180,6 +1275,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; + svm_set_guest_pat(svm, &save->g_pat); save->cr3 = 0; save->cr4 = 0; } @@ -4088,11 +4184,6 @@ static bool svm_has_high_real_mode_segbase(void) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_cpuid_update(struct kvm_vcpu *vcpu) { } -- cgit v1.2.3 From e098223b789b4a618dacd79e5e0dad4a9d5018d1 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 20 Apr 2015 19:25:31 +0200 Subject: KVM: SVM: Sync g_pat with guest-written PAT value When hardware supports the g_pat VMCB field, we can use it for emulating the PAT configuration that the guest configures by writing to the corresponding MSR. Signed-off-by: Jan Kiszka Tested-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 414ec25b673e..d36cfaf5a97a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3350,6 +3350,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_IA32_CR_PAT: + if (npt_enabled) { + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vcpu->arch.pat = data; + svm_set_guest_pat(svm, &svm->vmcb->save.g_pat); + mark_dirty(svm->vmcb, VMCB_NPT); + break; + } + /* fall through */ default: return kvm_set_msr_common(vcpu, msr); } -- cgit v1.2.3 From fd717f11015f673487ffc826e59b2bad69d20fe5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jul 2015 14:38:13 +0200 Subject: KVM: x86: apply guest MTRR virtualization on host reserved pages Currently guest MTRR is avoided if kvm_is_reserved_pfn returns true. However, the guest could prefer a different page type than UC for such pages. A good example is that pass-throughed VGA frame buffer is not always UC as host expected. This patch enables full use of virtual guest MTRRs. Suggested-by: Xiao Guangrong Tested-by: Joerg Roedel (on AMD) Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 7 ++----- arch/x86/kvm/vmx.c | 11 +++-------- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d36cfaf5a97a..bbc678a66b18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1166,14 +1166,11 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u8 mtrr; /* - * 1. MMIO: always map as UC + * 1. MMIO: trust guest MTRR, so same as item 3. * 2. No passthrough: always map as WB, and force guest PAT to WB as well * 3. Passthrough: can't guarantee the result, try to trust guest. */ - if (is_mmio) - return _PAGE_NOCACHE; - - if (!kvm_arch_has_assigned_device(vcpu->kvm)) + if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm)) return 0; mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e856dd566f4c..5b4e9384717a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8632,22 +8632,17 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u64 ipat = 0; /* For VT-d and EPT combination - * 1. MMIO: always map as UC + * 1. MMIO: guest may want to apply WC, trust it. * 2. EPT with VT-d: * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. + * result, try to trust guest. So the same as item 1. * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + if (!is_mmio && !kvm_arch_has_noncoherent_dma(vcpu->kvm)) { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; -- cgit v1.2.3 From ee4100da1616d6d151f97862b87e8a4e43d14b4c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 9 Jul 2015 15:44:52 +0800 Subject: kvm: x86: fix load xsave feature warning [ 68.196974] WARNING: CPU: 1 PID: 2140 at arch/x86/kvm/x86.c:3161 kvm_arch_vcpu_ioctl+0xe88/0x1340 [kvm]() [ 68.196975] Modules linked in: snd_hda_codec_hdmi i915 rfcomm bnep bluetooth i2c_algo_bit rfkill nfsd drm_kms_helper nfs_acl nfs drm lockd grace sunrpc fscache snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_dummy snd_seq_oss x86_pkg_temp_thermal snd_seq_midi kvm_intel snd_seq_midi_event snd_rawmidi kvm snd_seq ghash_clmulni_intel fuse snd_timer aesni_intel parport_pc ablk_helper snd_seq_device cryptd ppdev snd lp parport lrw dcdbas gf128mul i2c_core glue_helper lpc_ich video shpchp mfd_core soundcore serio_raw acpi_cpufreq ext4 mbcache jbd2 sd_mod crc32c_intel ahci libahci libata e1000e ptp pps_core [ 68.197005] CPU: 1 PID: 2140 Comm: qemu-system-x86 Not tainted 4.2.0-rc1+ #2 [ 68.197006] Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015 [ 68.197007] ffffffffa03b0657 ffff8800d984bca8 ffffffff815915a2 0000000000000000 [ 68.197009] 0000000000000000 ffff8800d984bce8 ffffffff81057c0a 00007ff6d0001000 [ 68.197010] 0000000000000002 ffff880211c1a000 0000000000000004 ffff8800ce0288c0 [ 68.197012] Call Trace: [ 68.197017] [] dump_stack+0x45/0x57 [ 68.197020] [] warn_slowpath_common+0x8a/0xc0 [ 68.197022] [] warn_slowpath_null+0x1a/0x20 [ 68.197029] [] kvm_arch_vcpu_ioctl+0xe88/0x1340 [kvm] [ 68.197035] [] ? kvm_arch_vcpu_load+0x4e/0x1c0 [kvm] [ 68.197040] [] kvm_vcpu_ioctl+0xc6/0x5c0 [kvm] [ 68.197043] [] ? perf_pmu_enable+0x22/0x30 [ 68.197044] [] ? perf_event_context_sched_in+0x7e/0xb0 [ 68.197048] [] do_vfs_ioctl+0x2c2/0x4a0 [ 68.197050] [] ? finish_task_switch+0x173/0x220 [ 68.197053] [] ? selinux_file_ioctl+0x4f/0xd0 [ 68.197055] [] ? security_file_ioctl+0x43/0x60 [ 68.197057] [] SyS_ioctl+0x79/0x90 [ 68.197060] [] entry_SYSCALL_64_fastpath+0x12/0x6a [ 68.197061] ---[ end trace 558a5ebf9445fc80 ]--- After commit (0c4109bec0 'x86/fpu/xstate: Fix up bad get_xsave_addr() assumptions'), there is no assumption an xsave bit is present in the hardware (pcntxt_mask) that it is always present in a given xsave buffer. An enabled state to be present on 'pcntxt_mask', but *not* in 'xstate_bv' could happen when the last 'xsave' did not request that this feature be saved (unlikely) or because the "init optimization" caused it to not be saved. This patch kill the assumption. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0024968b342d..5ef2560075bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3157,8 +3157,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); memcpy(dest, src + offset, size); - } else - WARN_ON_ONCE(1); + } valid -= feature; } -- cgit v1.2.3 From ce0d3c0a6fb1422101498ef378c0851dabbbf67f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2015 22:03:57 +0200 Subject: genirq: Revert sparse irq locking around __cpu_up() and move it to x86 for now Boris reported that the sparse_irq protection around __cpu_up() in the generic code causes a regression on Xen. Xen allocates interrupts and some more in the xen_cpu_up() function, so it deadlocks on the sparse_irq_lock. There is no simple fix for this and we really should have the protection for all architectures, but for now the only solution is to move it to x86 where actual wreckage due to the lack of protection has been observed. Reported-and-tested-by: Boris Ostrovsky Fixes: a89941816726 'hotplug: Prevent alloc/free of irq descriptors during cpu up/down' Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Cc: xen-devel --- arch/x86/kernel/smpboot.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d3010aa79daf..b1f3ed9c7a9e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -992,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); + /* + * We have to walk the irq descriptors to setup the vector + * space for the cpu which comes online. Prevent irq + * alloc/free across the bringup. + */ + irq_lock_sparse(); + err = do_boot_cpu(apicid, cpu, tidle); + if (err) { + irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -1011,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } + irq_unlock_sparse(); + return 0; } -- cgit v1.2.3 From 9d05041679904b12c12421cbcf9cb5f4860a8d7b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:33 -0700 Subject: x86/nmi: Enable nested do_nmi() handling for 64-bit kernels 32-bit kernels handle nested NMIs in C. Enable the exact same handling on 64-bit kernels as well. This isn't currently necessary, but it will become necessary once the asm code starts allowing limited nesting. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 123 +++++++++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index c3e985d1751c..d8766b1c9974 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs) NOKPROBE_SYMBOL(default_do_nmi); /* - * NMIs can hit breakpoints which will cause it to lose its - * NMI context with the CPU when the breakpoint does an iret. - */ -#ifdef CONFIG_X86_32 -/* - * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C (preventing nested - * NMIs if an NMI takes a trap). Simply have 3 states the NMI - * can be in: + * NMIs can hit breakpoints which will cause it to lose its NMI context + * with the CPU when the breakpoint or page fault does an IRET. + * + * As a result, NMIs can nest if NMIs get unmasked due an IRET during + * NMI processing. On x86_64, the asm glue protects us from nested NMIs + * if the outer NMI came from kernel mode, but we can still nest if the + * outer NMI came from user mode. + * + * To handle these nested NMIs, we have three states: * * 1) not running * 2) executing @@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi); * (Note, the latch is binary, thus multiple NMIs triggering, * when one is running, are ignored. Only one NMI is restarted.) * - * If an NMI hits a breakpoint that executes an iret, another - * NMI can preempt it. We do not want to allow this new NMI - * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the exit of the first NMI will - * perform a dec_return, if the result is zero (NOT_RUNNING), then - * it will simply exit the NMI handler. If not, the dec_return - * would have set the state to NMI_EXECUTING (what we want it to - * be when we are running). In this case, we simply jump back - * to rerun the NMI handler again, and restart the 'latched' NMI. + * If an NMI executes an iret, another NMI can preempt it. We do not + * want to allow this new NMI to run, but we want to execute it when the + * first one finishes. We set the state to "latched", and the exit of + * the first NMI will perform a dec_return, if the result is zero + * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the + * dec_return would have set the state to NMI_EXECUTING (what we want it + * to be when we are running). In this case, we simply jump back to + * rerun the NMI handler again, and restart the 'latched' NMI. * * No trap (breakpoint or page fault) should be hit before nmi_restart, * thus there is no race between the first check of state for NOT_RUNNING @@ -461,49 +460,36 @@ enum nmi_states { static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); -#define nmi_nesting_preprocess(regs) \ - do { \ - if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ - this_cpu_write(nmi_state, NMI_LATCHED); \ - return; \ - } \ - this_cpu_write(nmi_state, NMI_EXECUTING); \ - this_cpu_write(nmi_cr2, read_cr2()); \ - } while (0); \ - nmi_restart: - -#define nmi_nesting_postprocess() \ - do { \ - if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ - write_cr2(this_cpu_read(nmi_cr2)); \ - if (this_cpu_dec_return(nmi_state)) \ - goto nmi_restart; \ - } while (0) -#else /* x86_64 */ +#ifdef CONFIG_X86_64 /* - * In x86_64 things are a bit more difficult. This has the same problem - * where an NMI hitting a breakpoint that calls iret will remove the - * NMI context, allowing a nested NMI to enter. What makes this more - * difficult is that both NMIs and breakpoints have their own stack. - * When a new NMI or breakpoint is executed, the stack is set to a fixed - * point. If an NMI is nested, it will have its stack set at that same - * fixed address that the first NMI had, and will start corrupting the - * stack. This is handled in entry_64.S, but the same problem exists with - * the breakpoint stack. + * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without + * some care, the inner breakpoint will clobber the outer breakpoint's + * stack. * - * If a breakpoint is being processed, and the debug stack is being used, - * if an NMI comes in and also hits a breakpoint, the stack pointer - * will be set to the same fixed address as the breakpoint that was - * interrupted, causing that stack to be corrupted. To handle this case, - * check if the stack that was interrupted is the debug stack, and if - * so, change the IDT so that new breakpoints will use the current stack - * and not switch to the fixed address. On return of the NMI, switch back - * to the original IDT. + * If a breakpoint is being processed, and the debug stack is being + * used, if an NMI comes in and also hits a breakpoint, the stack + * pointer will be set to the same fixed address as the breakpoint that + * was interrupted, causing that stack to be corrupted. To handle this + * case, check if the stack that was interrupted is the debug stack, and + * if so, change the IDT so that new breakpoints will use the current + * stack and not switch to the fixed address. On return of the NMI, + * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); +#endif -static inline void nmi_nesting_preprocess(struct pt_regs *regs) +dotraplinkage notrace void +do_nmi(struct pt_regs *regs, long error_code) { + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { + this_cpu_write(nmi_state, NMI_LATCHED); + return; + } + this_cpu_write(nmi_state, NMI_EXECUTING); + this_cpu_write(nmi_cr2, read_cr2()); +nmi_restart: + +#ifdef CONFIG_X86_64 /* * If we interrupted a breakpoint, it is possible that * the nmi handler will have breakpoints too. We need to @@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) debug_stack_set_zero(); this_cpu_write(update_debug_stack, 1); } -} - -static inline void nmi_nesting_postprocess(void) -{ - if (unlikely(this_cpu_read(update_debug_stack))) { - debug_stack_reset(); - this_cpu_write(update_debug_stack, 0); - } -} #endif -dotraplinkage notrace void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_nesting_preprocess(regs); - nmi_enter(); inc_irq_stat(__nmi_count); @@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_exit(); - /* On i386, may loop back to preprocess */ - nmi_nesting_postprocess(); +#ifdef CONFIG_X86_64 + if (unlikely(this_cpu_read(update_debug_stack))) { + debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } +#endif + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); + if (this_cpu_dec_return(nmi_state)) + goto nmi_restart; } NOKPROBE_SYMBOL(do_nmi); -- cgit v1.2.3 From 0e181bb58143cb4a2e8f01c281b0816cd0e4798e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:34 -0700 Subject: x86/nmi/64: Remove asm code that saves CR2 Now that do_nmi saves CR2, we don't need to save it in asm. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Acked-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3bb2c4302df1..062feb4eb478 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1415,28 +1415,11 @@ end_repeat_nmi: */ call paranoid_entry - /* - * Save off the CR2 register. If we take a page fault in the NMI then - * it could corrupt the CR2 value. If the NMI preempts a page fault - * handler before it was able to read the CR2 register, and then the - * NMI itself takes a page fault, the page fault that was preempted - * will read the information from the NMI page fault and not the - * origin fault. Save it off and restore it if it changes. - * Use the r12 callee-saved register. - */ - movq %cr2, %r12 - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi call do_nmi - /* Did the NMI take a page fault? Restore cr2 if it did */ - movq %cr2, %rcx - cmpq %rcx, %r12 - je 1f - movq %r12, %cr2 -1: testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: -- cgit v1.2.3 From 9b6e6a8334d56354853f9c255d1395c2ba570e0a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:35 -0700 Subject: x86/nmi/64: Switch stacks on userspace NMI entry Returning to userspace is tricky: IRET can fail, and ESPFIX can rearrange the stack prior to IRET. The NMI nesting fixup relies on a precise stack layout and atomic IRET. Rather than trying to teach the NMI nesting fixup to handle ESPFIX and failed IRET, punt: run NMIs that came from user mode on the normal kernel stack. This will make some nested NMIs visible to C code, but the C code is okay with that. As a side effect, this should speed up perf: it eliminates an RDMSR when NMIs come from user mode. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 62 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 062feb4eb478..8668bbdd2bca 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1250,18 +1250,72 @@ ENTRY(nmi) * a nested NMI that updated the copy interrupt stack frame, a * jump will be made to the repeat_nmi code that will handle the second * NMI. + * + * However, espfix prevents us from directly returning to userspace + * with a single IRET instruction. Similarly, IRET to user mode + * can fault. We therefore handle NMIs from user space like + * other IST entries. */ /* Use %rdx as our temp variable throughout */ pushq %rdx + testb $3, CS-RIP+8(%rsp) + jz .Lnmi_from_kernel + + /* + * NMI from user mode. We need to run on the thread stack, but we + * can't go through the normal entry paths: NMIs are masked, and + * we don't want to enable interrupts, because then we'll end + * up in an awkward situation in which IRQs are on but NMIs + * are off. + */ + + SWAPGS + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + pushq 5*8(%rdx) /* pt_regs->ss */ + pushq 4*8(%rdx) /* pt_regs->rsp */ + pushq 3*8(%rdx) /* pt_regs->flags */ + pushq 2*8(%rdx) /* pt_regs->cs */ + pushq 1*8(%rdx) /* pt_regs->rip */ + pushq $-1 /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq (%rdx) /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + /* - * If %cs was not the kernel segment, then the NMI triggered in user - * space, which means it is definitely not nested. + * At this point we no longer need to worry about stack damage + * due to nesting -- we're on the normal thread stack and we're + * done with the NMI stack. */ - cmpl $__KERNEL_CS, 16(%rsp) - jne first_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi + + /* + * Return back to user mode. We must *not* do the normal exit + * work, because we don't want to enable interrupts. Fortunately, + * do_nmi doesn't modify pt_regs. + */ + SWAPGS + jmp restore_c_regs_and_iret + +.Lnmi_from_kernel: /* * Check the special variable on the stack to see if NMIs are * executing. -- cgit v1.2.3 From 0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:36 -0700 Subject: x86/nmi/64: Improve nested NMI comments I found the nested NMI documentation to be difficult to follow. Improve the comments. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 158 +++++++++++++++++++++++++++------------------- arch/x86/kernel/nmi.c | 4 +- 2 files changed, 94 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 8668bbdd2bca..f54d63a60a3b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1237,11 +1237,12 @@ ENTRY(nmi) * If the variable is not set and the stack is not the NMI * stack then: * o Set the special variable on the stack - * o Copy the interrupt frame into a "saved" location on the stack - * o Copy the interrupt frame into a "copy" location on the stack + * o Copy the interrupt frame into an "outermost" location on the + * stack + * o Copy the interrupt frame into an "iret" location on the stack * o Continue processing the NMI * If the variable is set or the previous stack is the NMI stack: - * o Modify the "copy" location to jump to the repeate_nmi + * o Modify the "iret" location to jump to the repeat_nmi * o return back to the first NMI * * Now on exit of the first NMI, we first clear the stack variable @@ -1317,18 +1318,60 @@ ENTRY(nmi) .Lnmi_from_kernel: /* - * Check the special variable on the stack to see if NMIs are - * executing. + * Here's what our stack frame will look like: + * +---------------------------------------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +---------------------------------------------------------+ + * | temp storage for rdx | + * +---------------------------------------------------------+ + * | "NMI executing" variable | + * +---------------------------------------------------------+ + * | iret SS } Copied from "outermost" frame | + * | iret Return RSP } on each loop iteration; overwritten | + * | iret RFLAGS } by a nested NMI to force another | + * | iret CS } iteration if needed. | + * | iret RIP } | + * +---------------------------------------------------------+ + * | outermost SS } initialized in first_nmi; | + * | outermost Return RSP } will not be changed before | + * | outermost RFLAGS } NMI processing is done. | + * | outermost CS } Copied to "iret" frame on each | + * | outermost RIP } iteration. | + * +---------------------------------------------------------+ + * | pt_regs | + * +---------------------------------------------------------+ + * + * The "original" frame is used by hardware. Before re-enabling + * NMIs, we need to be done with it, and we need to leave enough + * space for the asm code here. + * + * We return by executing IRET while RSP points to the "iret" frame. + * That will either return for real or it will loop back into NMI + * processing. + * + * The "outermost" frame is copied to the "iret" frame on each + * iteration of the loop, so each iteration starts with the "iret" + * frame pointing to the final return target. + */ + + /* + * Determine whether we're a nested NMI. + * + * First check "NMI executing". If it's set, then we're nested. + * This will not detect if we interrupted an outer NMI just + * before IRET. */ cmpl $1, -8(%rsp) je nested_nmi /* - * Now test if the previous stack was an NMI stack. - * We need the double check. We check the NMI stack to satisfy the - * race when the first NMI clears the variable before returning. - * We check the variable because the first NMI could be in a - * breakpoint routine using a breakpoint stack. + * Now test if the previous stack was an NMI stack. This covers + * the case where we interrupt an outer NMI after it clears + * "NMI executing" but before IRET. */ lea 6*8(%rsp), %rdx /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ @@ -1344,9 +1387,11 @@ ENTRY(nmi) nested_nmi: /* - * Do nothing if we interrupted the fixup in repeat_nmi. - * It's about to repeat the NMI handler, so we are fine - * with ignoring this one. + * If we interrupted an NMI that is between repeat_nmi and + * end_repeat_nmi, then we must not modify the "iret" frame + * because it's being written by the outer NMI. That's okay; + * the outer NMI handler is about to call do_nmi anyway, + * so we can just resume the outer NMI. */ movq $repeat_nmi, %rdx cmpq 8(%rsp), %rdx @@ -1356,7 +1401,10 @@ nested_nmi: ja nested_nmi_out 1: - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + /* + * Modify the "iret" frame to point to repeat_nmi, forcing another + * iteration of NMI handling. + */ leaq -1*8(%rsp), %rdx movq %rdx, %rsp leaq -10*8(%rsp), %rdx @@ -1372,61 +1420,27 @@ nested_nmi: nested_nmi_out: popq %rdx - /* No need to check faults here */ + /* We are returning to kernel mode, so this cannot result in a fault. */ INTERRUPT_RETURN first_nmi: - /* - * Because nested NMIs will use the pushed location that we - * stored in rdx, we must keep that space available. - * Here's what our stack frame will look like: - * +-------------------------+ - * | original SS | - * | original Return RSP | - * | original RFLAGS | - * | original CS | - * | original RIP | - * +-------------------------+ - * | temp storage for rdx | - * +-------------------------+ - * | NMI executing variable | - * +-------------------------+ - * | copied SS | - * | copied Return RSP | - * | copied RFLAGS | - * | copied CS | - * | copied RIP | - * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ - * | pt_regs | - * +-------------------------+ - * - * The saved stack frame is used to fix up the copied stack frame - * that a nested NMI may change to make the interrupted NMI iret jump - * to the repeat_nmi. The original stack frame and the temp storage - * is also used by nested NMIs and can not be trusted on exit. - */ - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + /* Restore rdx. */ movq (%rsp), %rdx - /* Set the NMI executing variable on the stack. */ + /* Set "NMI executing" on the stack. */ pushq $1 - /* Leave room for the "copied" frame */ + /* Leave room for the "iret" frame */ subq $(5*8), %rsp - /* Copy the stack frame to the Saved frame */ + /* Copy the "original" frame to the "outermost" frame */ .rept 5 pushq 11*8(%rsp) .endr /* Everything up to here is safe from nested NMIs */ +repeat_nmi: /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1435,16 +1449,21 @@ first_nmi: * it will just return, as we are about to repeat an NMI anyway. * This makes it safe to copy to the stack frame that a nested * NMI will update. - */ -repeat_nmi: - /* - * Update the stack variable to say we are still in NMI (the update - * is benign for the non-repeat case, where 1 was pushed just above - * to this very stack slot). + * + * RSP is pointing to "outermost RIP". gsbase is unknown, but, if + * we're repeating an NMI, gsbase has the same value that it had on + * the first iteration. paranoid_entry will load the kernel + * gsbase if needed before we call do_nmi. + * + * Set "NMI executing" in case we came back here via IRET. */ movq $1, 10*8(%rsp) - /* Make another copy, this one may be modified by nested NMIs */ + /* + * Copy the "outermost" frame to the "iret" frame. NMIs that nest + * here must not modify the "iret" frame while we're writing to + * it or it will end up containing garbage. + */ addq $(10*8), %rsp .rept 5 pushq -6*8(%rsp) @@ -1453,9 +1472,9 @@ repeat_nmi: end_repeat_nmi: /* - * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception and reset our iret stack - * so that we repeat another NMI. + * Everything below this point can be preempted by a nested NMI. + * If this happens, then the inner NMI will change the "iret" + * frame to point back to repeat_nmi. */ pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK @@ -1481,11 +1500,18 @@ nmi_swapgs: nmi_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS - /* Pop the extra iret frame at once */ + + /* Point RSP at the "iret" frame. */ REMOVE_PT_GPREGS_FROM_STACK 6*8 - /* Clear the NMI executing stack variable */ + /* Clear "NMI executing". */ movq $0, 5*8(%rsp) + + /* + * INTERRUPT_RETURN reads the "iret" frame and exits the NMI + * stack in a single instruction. We are returning to kernel + * mode, so this cannot result in a fault. + */ INTERRUPT_RETURN END(nmi) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d8766b1c9974..d05bd2e2ee91 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_regs *regs) NOKPROBE_SYMBOL(default_do_nmi); /* - * NMIs can hit breakpoints which will cause it to lose its NMI context - * with the CPU when the breakpoint or page fault does an IRET. + * NMIs can page fault or hit breakpoints which will cause it to lose + * its NMI context with the CPU when the breakpoint or page fault does an IRET. * * As a result, NMIs can nest if NMIs get unmasked due an IRET during * NMI processing. On x86_64, the asm glue protects us from nested NMIs -- cgit v1.2.3 From a27507ca2d796cfa8d907de31ad730359c8a6d06 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:37 -0700 Subject: x86/nmi/64: Reorder nested NMI checks Check the repeat_nmi .. end_repeat_nmi special case first. The next patch will rework the RSP check and, as a side effect, the RSP check will no longer detect repeat_nmi .. end_repeat_nmi, so we'll need this ordering of the checks. Note: this is more subtle than it appears. The check for repeat_nmi .. end_repeat_nmi jumps straight out of the NMI code instead of adjusting the "iret" frame to force a repeat. This is necessary, because the code between repeat_nmi and end_repeat_nmi sets "NMI executing" and then writes to the "iret" frame itself. If a nested NMI comes in and modifies the "iret" frame while repeat_nmi is also modifying it, we'll end up with garbage. The old code got this right, as does the new code, but the new code is a bit more explicit. If we were to move the check right after the "NMI executing" check, then we'd get it wrong and have random crashes. ( Because the "NMI executing" check would jump to the code that would modify the "iret" frame without checking if the interrupted NMI was currently modifying it. ) Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f54d63a60a3b..5c4ab384b84f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1361,7 +1361,24 @@ ENTRY(nmi) /* * Determine whether we're a nested NMI. * - * First check "NMI executing". If it's set, then we're nested. + * If we interrupted kernel code between repeat_nmi and + * end_repeat_nmi, then we are a nested NMI. We must not + * modify the "iret" frame because it's being written by + * the outer NMI. That's okay; the outer NMI handler is + * about to about to call do_nmi anyway, so we can just + * resume the outer NMI. + */ + + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out +1: + + /* + * Now check "NMI executing". If it's set, then we're nested. * This will not detect if we interrupted an outer NMI just * before IRET. */ @@ -1386,21 +1403,6 @@ ENTRY(nmi) /* Ah, it is within the NMI stack, treat it as nested */ nested_nmi: - /* - * If we interrupted an NMI that is between repeat_nmi and - * end_repeat_nmi, then we must not modify the "iret" frame - * because it's being written by the outer NMI. That's okay; - * the outer NMI handler is about to call do_nmi anyway, - * so we can just resume the outer NMI. - */ - movq $repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja 1f - movq $end_repeat_nmi, %rdx - cmpq 8(%rsp), %rdx - ja nested_nmi_out - -1: /* * Modify the "iret" frame to point to repeat_nmi, forcing another * iteration of NMI handling. -- cgit v1.2.3 From 810bc075f78ff2c221536eb3008eac6a492dba2d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:38 -0700 Subject: x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI detection We have a tricky bug in the nested NMI code: if we see RSP pointing to the NMI stack on NMI entry from kernel mode, we assume that we are executing a nested NMI. This isn't quite true. A malicious userspace program can point RSP at the NMI stack, issue SYSCALL, and arrange for an NMI to happen while RSP is still pointing at the NMI stack. Fix it with a sneaky trick. Set DF in the region of code that the RSP check is intended to detect. IRET will clear DF atomically. ( Note: other than paravirt, there's little need for all this complexity. We could check RIP instead of RSP. ) Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c4ab384b84f..d8ab2b201fa1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1388,7 +1388,14 @@ ENTRY(nmi) /* * Now test if the previous stack was an NMI stack. This covers * the case where we interrupt an outer NMI after it clears - * "NMI executing" but before IRET. + * "NMI executing" but before IRET. We need to be careful, though: + * there is one case in which RSP could point to the NMI stack + * despite there being no NMI active: naughty userspace controls + * RSP at the very beginning of the SYSCALL targets. We can + * pull a fast one on naughty userspace, though: we program + * SYSCALL to mask DF, so userspace cannot cause DF to be set + * if it controls the kernel's RSP. We set DF before we clear + * "NMI executing". */ lea 6*8(%rsp), %rdx /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ @@ -1400,7 +1407,13 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ jb first_nmi - /* Ah, it is within the NMI stack, treat it as nested */ + + /* Ah, it is within the NMI stack. */ + + testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) + jz first_nmi /* RSP was user controlled. */ + + /* This is a nested NMI. */ nested_nmi: /* @@ -1506,8 +1519,16 @@ nmi_restore: /* Point RSP at the "iret" frame. */ REMOVE_PT_GPREGS_FROM_STACK 6*8 - /* Clear "NMI executing". */ - movq $0, 5*8(%rsp) + /* + * Clear "NMI executing". Set DF first so that we can easily + * distinguish the remaining code between here and IRET from + * the SYSCALL entry and exit paths. On a native kernel, we + * could just inspect RIP, but, on paravirt kernels, + * INTERRUPT_RETURN can translate into a jump into a + * hypercall page. + */ + std + movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* * INTERRUPT_RETURN reads the "iret" frame and exits the NMI -- cgit v1.2.3 From 23a781e987f05029c4a99a5c145be3efa6eda9f3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:39 -0700 Subject: x86/nmi/64: Minor asm simplification Replace LEA; MOV with an equivalent SUB. This saves one instruction. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d8ab2b201fa1..0fb52526e452 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1420,8 +1420,7 @@ nested_nmi: * Modify the "iret" frame to point to repeat_nmi, forcing another * iteration of NMI handling. */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp + subq $8, %rsp leaq -10*8(%rsp), %rdx pushq $__KERNEL_DS pushq %rdx -- cgit v1.2.3 From 36f1a77b3aa57c5c2eb1ae2d67d07c4350a78345 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:40 -0700 Subject: x86/nmi/64: Make the "NMI executing" variable more consistent Currently, "NMI executing" is one the first time an outermost NMI hits repeat_nmi and zero thereafter. Change it to be zero each time for consistency. This is intended to help NMI handling fail harder if it's buggy. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0fb52526e452..5422bd20bdf4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1441,8 +1441,8 @@ first_nmi: /* Restore rdx. */ movq (%rsp), %rdx - /* Set "NMI executing" on the stack. */ - pushq $1 + /* Make room for "NMI executing". */ + pushq $0 /* Leave room for the "iret" frame */ subq $(5*8), %rsp @@ -1467,11 +1467,10 @@ repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. - * - * Set "NMI executing" in case we came back here via IRET. + * gsbase if needed before we call do_nmi. "NMI executing" + * is zero. */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Set "NMI executing". */ /* * Copy the "outermost" frame to the "iret" frame. NMIs that nest -- cgit v1.2.3 From a97439aa1aec10387797b4abae3cf117de1c90d7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 10:29:41 -0700 Subject: x86/entry/64, x86/nmi/64: Add CONFIG_DEBUG_ENTRY NMI testing code It turns out to be rather tedious to test the NMI nesting code. Make it easier: add a new CONFIG_DEBUG_ENTRY option that causes the NMI handler to pre-emptively unmask NMIs. With this option set, errors in the repeat_nmi logic or failures to detect that we're in a nested NMI will result in quick panics under perf (especially if multiple counters are running at high frequency) instead of requiring an unusual workload that generates page faults or breakpoints inside NMIs. I called it CONFIG_DEBUG_ENTRY instead of CONFIG_DEBUG_NMI_ENTRY because I want to add new non-NMI checks elsewhere in the entry code in the future, and I'd rather not add too many new config options or add this option and then immediately rename it. Signed-off-by: Andy Lutomirski Reviewed-by: Steven Rostedt Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 12 ++++++++++++ arch/x86/entry/entry_64.S | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index a15893d17c55..d8c0d3266173 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -297,6 +297,18 @@ config OPTIMIZE_INLINING If unsure, say N. +config DEBUG_ENTRY + bool "Debug low-level entry code" + depends on DEBUG_KERNEL + ---help--- + This option enables sanity checks in x86's low-level entry code. + Some of these sanity checks may slow down kernel entries and + exits or otherwise impact performance. + + This is currently used to help test NMI code. + + If unsure, say N. + config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5422bd20bdf4..8cb3e438f21e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1454,6 +1454,21 @@ first_nmi: /* Everything up to here is safe from nested NMIs */ +#ifdef CONFIG_DEBUG_ENTRY + /* + * For ease of testing, unmask NMIs right away. Disabled by + * default because IRET is very expensive. + */ + pushq $0 /* SS */ + pushq %rsp /* RSP (minus 8 because of the previous push) */ + addq $8, (%rsp) /* Fix up RSP */ + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ + INTERRUPT_RETURN /* continues at repeat_nmi below */ +1: +#endif + repeat_nmi: /* * If there was a nested NMI, the first NMI's iret will return -- cgit v1.2.3 From f2abeef9fd6f03ebf417539ed099828a56733098 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 17 Jul 2015 16:23:58 -0700 Subject: mm: clean up per architecture MM hook header files Commit 2ae416b142b6 ("mm: new mm hook framework") introduced an empty header file (mm-arch-hooks.h) for every architecture, even those which doesn't need to define mm hooks. As suggested by Geert Uytterhoeven, this could be cleaned through the use of a generic header file included via each per architecture asm/include/Kbuild file. The PowerPC architecture is not impacted here since this architecture has to defined the arch_remap MM hook. Signed-off-by: Laurent Dufour Suggested-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Vineet Gupta Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/mm-arch-hooks.h | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) delete mode 100644 arch/x86/include/asm/mm-arch-hooks.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4dd1f2d770af..aeac434c9feb 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -9,3 +9,4 @@ generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h deleted file mode 100644 index 4e881a342236..000000000000 --- a/arch/x86/include/asm/mm-arch-hooks.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Architecture specific mm hooks - * - * Copyright (C) 2015, IBM Corporation - * Author: Laurent Dufour - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _ASM_X86_MM_ARCH_HOOKS_H -#define _ASM_X86_MM_ARCH_HOOKS_H - -#endif /* _ASM_X86_MM_ARCH_HOOKS_H */ -- cgit v1.2.3 From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 Jul 2015 12:28:11 +0200 Subject: x86/fpu, sched: Dynamically allocate 'struct fpu' The FPU rewrite removed the dynamic allocations of 'struct fpu'. But, this potentially wastes massive amounts of memory (2k per task on systems that do not have AVX-512 for instance). Instead of having a separate slab, this patch just appends the space that we need to the 'task_struct' which we dynamically allocate already. This saves from doing an extra slab allocation at fork(). The only real downside here is that we have to stick everything and the end of the task_struct. But, I think the BUILD_BUG_ON()s I stuck in there should keep that from being too fragile. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fpu/types.h | 72 +++++++++++++++++++++------------------- arch/x86/include/asm/processor.h | 10 ++++-- arch/x86/kernel/fpu/init.c | 39 ++++++++++++++++++++++ arch/x86/kernel/process.c | 2 +- 4 files changed, 85 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 0637826292de..c49c5173158e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -189,6 +189,7 @@ union fpregs_state { struct fxregs_state fxsave; struct swregs_state soft; struct xregs_state xsave; + u8 __padding[PAGE_SIZE]; }; /* @@ -197,40 +198,6 @@ union fpregs_state { * state fields: */ struct fpu { - /* - * @state: - * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) - */ - union fpregs_state state; - /* * @last_cpu: * @@ -288,6 +255,43 @@ struct fpu { * deal with bursty apps that only use the FPU for a short time: */ unsigned char counter; + /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. + * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + /* + * WARNING: 'state' is dynamically-sized. Do not put + * anything after it here. + */ }; #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 43e6519df0d5..944f1785ed0d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -390,9 +390,6 @@ struct thread_struct { #endif unsigned long gs; - /* Floating point and extended processor state */ - struct fpu fpu; - /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ @@ -418,6 +415,13 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + + /* Floating point and extended processor state */ + struct fpu fpu; + /* + * WARNING: 'fpu' is dynamically-sized. It *MUST* be at + * the end. + */ }; /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 32826791e675..deacbfa6b33e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON((sizeof(TYPE) - \ + offsetof(TYPE, MEMBER) - \ + sizeof(((TYPE *)0)->MEMBER)) > \ + 0) \ + +/* + * We append the 'struct fpu' to the task_struct. + */ +int __weak arch_task_struct_size(void) +{ + int task_size = sizeof(struct task_struct); + + /* + * Subtract off the static size of the register state. + * It potentially has a bunch of padding. + */ + task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + + /* + * Add back the dynamically-calculated register state + * size. + */ + task_size += xstate_size; + + /* + * We dynamically size 'struct fpu', so we require that + * it be at the end of 'thread_struct' and that + * 'thread_struct' be at the end of 'task_struct'. If + * you hit a compile error here, check the structure to + * see if something got added to the end. + */ + CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + + return task_size; +} + /* * Set up the xstate_size based on the legacy FPU context size. * diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9cad694ed7c4..975420eac105 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - *dst = *src; + memcpy(dst, src, arch_task_struct_size()); return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } -- cgit v1.2.3 From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Jul 2015 12:28:12 +0200 Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86 Don't burden architectures without dynamic task_struct sizing with the overhead of dynamic sizing. Also optimize the x86 code a bit by caching task_struct_size. Acked-and-Tested-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/fpu/init.c | 17 +++++++++-------- arch/x86/kernel/process.c | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3dbb7e7909ca..b3a1a5d77d92 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,6 +41,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index deacbfa6b33e..0b39173dd971 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -4,6 +4,8 @@ #include #include +#include + /* * Initialize the TS bit in CR0 according to the style of context-switches * we are using: @@ -136,16 +138,14 @@ static void __init fpu__init_system_generic(void) unsigned int xstate_size; EXPORT_SYMBOL_GPL(xstate_size); -#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ - BUILD_BUG_ON((sizeof(TYPE) - \ - offsetof(TYPE, MEMBER) - \ - sizeof(((TYPE *)0)->MEMBER)) > \ - 0) \ +/* Enforce that 'MEMBER' is the last field of 'TYPE': */ +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) /* - * We append the 'struct fpu' to the task_struct. + * We append the 'struct fpu' to the task_struct: */ -int __weak arch_task_struct_size(void) +static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); @@ -172,7 +172,7 @@ int __weak arch_task_struct_size(void) CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); - return task_size; + arch_task_struct_size = task_size; } /* @@ -326,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); + fpu__init_task_struct_size(); fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 975420eac105..397688beed4b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size()); + memcpy(dst, src, arch_task_struct_size); return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } -- cgit v1.2.3 From bbc03778b9954a2ec93baed63718e4df0192f130 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 20 Jul 2015 16:01:53 -0700 Subject: x86/mm: Add parenthesis for TLB tracepoint size calculation flush_tlb_info->flush_start/end are both normal virtual addresses. When calculating 'nr_pages' (only used for the tracepoint), I neglected to put parenthesis in. Thanks to David Koufaty for pointing this out. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Cc: Link: http://lkml.kernel.org/r/20150720230153.9E834081@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3250f2371aea..90b924acd982 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -117,7 +117,7 @@ static void flush_tlb_func(void *info) } else { unsigned long addr; unsigned long nr_pages = - f->flush_end - f->flush_start / PAGE_SIZE; + (f->flush_end - f->flush_start) / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); -- cgit v1.2.3 From a89652769470d12cd484ee3d3f7bde0742be8d96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 20 Jul 2015 14:29:58 -0700 Subject: x86/mpx: Do not set ->vm_ops on MPX VMAs MPX setups private anonymous mapping, but uses vma->vm_ops too. This can confuse core VM, as it relies on vm->vm_ops to distinguish file VMAs from anonymous. As result we will get SIGBUS, because handle_pte_fault() thinks it's file VMA without vm_ops->fault and it doesn't know how to handle the situation properly. Let's fix that by not setting ->vm_ops. We don't really need ->vm_ops here: MPX VMA can be detected with VM_MPX flag. And vma_merge() will not merge MPX VMA with non-MPX VMA, because ->vm_flags won't match. The only thing left is name of VMA. I'm not sure if it's part of ABI, or we can just drop it. The patch keep it by providing arch_vma_name() on x86. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Cc: # Fixes: 6b7339f4 (mm: avoid setting up anonymous pages into file mapping) Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@sr71.net Link: http://lkml.kernel.org/r/20150720212958.305CC3E9@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 7 +++++++ arch/x86/mm/mpx.c | 24 +++--------------------- 2 files changed, 10 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 9d518d693b4b..844b06d67df4 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -126,3 +126,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_MPX) + return "[mpx]"; + return NULL; +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 7a657f58bbea..db1b0bc5017c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -20,20 +20,6 @@ #define CREATE_TRACE_POINTS #include -static const char *mpx_mapping_name(struct vm_area_struct *vma) -{ - return "[mpx]"; -} - -static struct vm_operations_struct mpx_vma_ops = { - .name = mpx_mapping_name, -}; - -static int is_mpx_vma(struct vm_area_struct *vma) -{ - return (vma->vm_ops == &mpx_vma_ops); -} - static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) { if (is_64bit_mm(mm)) @@ -53,9 +39,6 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) /* * This is really a simplified "vm_mmap". it only handles MPX * bounds tables (the bounds directory is user-allocated). - * - * Later on, we use the vma->vm_ops to uniquely identify these - * VMAs. */ static unsigned long mpx_mmap(unsigned long len) { @@ -101,7 +84,6 @@ static unsigned long mpx_mmap(unsigned long len) ret = -ENOMEM; goto out; } - vma->vm_ops = &mpx_vma_ops; if (vm_flags & VM_LOCKED) { up_write(&mm->mmap_sem); @@ -812,7 +794,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, * so stop immediately and return an error. This * probably results in a SIGSEGV. */ - if (!is_mpx_vma(vma)) + if (!(vma->vm_flags & VM_MPX)) return -EINVAL; len = min(vma->vm_end, end) - addr; @@ -945,9 +927,9 @@ static int try_unmap_single_bt(struct mm_struct *mm, * lots of tables even though we have no actual table * entries in use. */ - while (next && is_mpx_vma(next)) + while (next && (next->vm_flags & VM_MPX)) next = next->vm_next; - while (prev && is_mpx_vma(prev)) + while (prev && (prev->vm_flags & VM_MPX)) prev = prev->vm_prev; /* * We know 'start' and 'end' lie within an area controlled -- cgit v1.2.3 From 5bc016f1abaa1c5ac0e3af23aa79faec4634a074 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:49:01 +0100 Subject: x86/fpu: Disable dependent CPU features on "noxsave" Complete the set of dependent features that need disabling at once: XSAVEC, AVX-512 and all currently known to the kernel extensions to it, as well as MPX need to be disabled too. Signed-off-by: Jan Beulich Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55ACC40D0200007800092E6C@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/fpu/init.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 0b39173dd971..1e173f6285c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -351,9 +351,15 @@ static int __init x86_noxsave_setup(char *s) setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); setup_clear_cpu_cap(X86_FEATURE_XSAVES); setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); return 1; } -- cgit v1.2.3 From ca1fec58bc6a90be96a59b4769e951156846c6ca Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 20 Jul 2015 08:46:14 +0100 Subject: x86/mm/pat: Adjust default caching mode translation tables Make WT really mean WT (rather than UC). I can't see why commit 9cd25aac1f ("x86/mm/pat: Emulate PAT when it is disabled") didn't make this to match its changes to pat_init(). Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/55ACC3660200007800092E62@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8533b46e6bee..7a4532229f51 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -- cgit v1.2.3 From 1c9cf9b211030a454a84cbc1cb15b82d9aa49011 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:14 -0600 Subject: x86/mm: Move warning from __ioremap_check_ram() to the call site __ioremap_check_ram() has a WARN_ONCE() which is emitted when the given pfn range is not RAM. The warning is bogus in two aspects: - it never triggers since walk_system_ram_range() only calls __ioremap_check_ram() for RAM ranges. - the warning message is wrong as it says: "ioremap on RAM' after it established that the pfn range is not RAM. Move the WARN_ONCE() to __ioremap_caller(), and update the message to include the address range so we get an actual warning when something tries to ioremap system RAM. [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-2-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index cc5ccc415cc0..fd3df0ddb4d4 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -63,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, !PageReserved(pfn_to_page(start_pfn + i))) return 1; - WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); - return 0; } @@ -131,8 +129,11 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pfn = phys_addr >> PAGE_SHIFT; last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) + __ioremap_check_ram) == 1) { + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + phys_addr, last_addr); return NULL; + } } /* * Mappings have to be page-aligned -- cgit v1.2.3 From 9a58eebe1ace609bedf8c5a65e70a097459f5696 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 16 Jul 2015 17:23:15 -0600 Subject: x86/mm: Remove region_is_ram() call from ioremap __ioremap_caller() calls region_is_ram() to walk through the iomem_resource table to check if a target range is in RAM, which was added to improve the lookup performance over page_is_ram() (commit 906e36c5c717 "x86: use optimized ioresource lookup in ioremap function"). page_is_ram() was no longer used when this change was added, though. __ioremap_caller() then calls walk_system_ram_range(), which had replaced page_is_ram() to improve the lookup performance (commit c81c8a1eeede "x86, ioremap: Speed up check for RAM pages"). Since both checks walk through the same iomem_resource table for the same purpose, there is no need to call both functions. Aside of that walk_system_ram_range() is the only useful check at the moment because region_is_ram() always returns -1 due to an implementation bug. That bug in region_is_ram() cannot be fixed without breaking existing ioremap callers, which rely on the subtle difference of walk_system_ram_range() versus non page aligned ranges. Once these offending callers are fixed we can use region_is_ram() and remove walk_system_ram_range(). [ tglx: Massaged changelog ] Signed-off-by: Toshi Kani Reviewed-by: Dan Williams Cc: Roland Dreier Cc: Mike Travis Cc: Luis R. Rodriguez Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1437088996-28511-3-git-send-email-toshi.kani@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index fd3df0ddb4d4..b9d4a33017fd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -92,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pgprot_t prot; int retval; void __iomem *ret_addr; - int ram_region; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -115,26 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - /* First check if whole region can be identified as RAM or not */ - ram_region = region_is_ram(phys_addr, size); - if (ram_region > 0) { - WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n", - (unsigned long int)phys_addr, - (unsigned long int)last_addr); - return NULL; - } - - /* If could not be identified(-1), check page by page */ - if (ram_region < 0) { - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + pfn = phys_addr >> PAGE_SHIFT; + last_pfn = last_addr >> PAGE_SHIFT; + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", + WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", phys_addr, last_addr); - return NULL; - } + return NULL; } + /* * Mappings have to be page-aligned */ -- cgit v1.2.3 From 10dc331ff5e7e4668c0f0c95b1a873aba9b70826 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:54 +0800 Subject: KVM: MTRR: fix memory type handling if MTRR is completely disabled Currently code uses default memory type if MTRR is fully disabled, fix it by using UC instead. Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index de1d2d8062e2..e2750134a22b 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -120,6 +120,16 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; } +static u8 mtrr_disabled_type(void) +{ + /* + * Intel SDM 11.11.2.2: all MTRRs are disabled when + * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC + * memory type is applied to all of physical memory. + */ + return MTRR_TYPE_UNCACHABLE; +} + /* * Three terms are used in the following code: * - segment, it indicates the address segments covered by fixed MTRRs. @@ -434,6 +444,8 @@ struct mtrr_iter { /* output fields. */ int mem_type; + /* mtrr is completely disabled? */ + bool mtrr_disabled; /* [start, end) is not fully covered in MTRRs? */ bool partial_map; @@ -549,7 +561,7 @@ static void mtrr_lookup_var_next(struct mtrr_iter *iter) static void mtrr_lookup_start(struct mtrr_iter *iter) { if (!mtrr_is_enabled(iter->mtrr_state)) { - iter->partial_map = true; + iter->mtrr_disabled = true; return; } @@ -563,6 +575,7 @@ static void mtrr_lookup_init(struct mtrr_iter *iter, iter->mtrr_state = mtrr_state; iter->start = start; iter->end = end; + iter->mtrr_disabled = false; iter->partial_map = false; iter->fixed = false; iter->range = NULL; @@ -656,6 +669,9 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) return MTRR_TYPE_WRBACK; } + if (iter.mtrr_disabled) + return mtrr_disabled_type(); + /* It is not covered by MTRRs. */ if (iter.partial_map) { /* @@ -689,6 +705,9 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } + if (iter.mtrr_disabled) + return true; + if (!iter.partial_map) return true; -- cgit v1.2.3 From 3e5d2fdceda172554e681b68c853bf5d08205bbf Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:55 +0800 Subject: KVM: MTRR: simplify kvm_mtrr_get_guest_memory_type kvm_mtrr_get_guest_memory_type never returns -1 which is implied in the current code since if @type = -1 (means no MTRR contains the range), iter.partial_map must be true Simplify the code to indicate this fact Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index e2750134a22b..dc0a84a6f309 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -672,15 +672,16 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) if (iter.mtrr_disabled) return mtrr_disabled_type(); - /* It is not covered by MTRRs. */ - if (iter.partial_map) { - /* - * We just check one page, partially covered by MTRRs is - * impossible. - */ - WARN_ON(type != -1); - type = mtrr_default_type(mtrr_state); - } + /* + * We just check one page, partially covered by MTRRs is + * impossible. + */ + WARN_ON(iter.partial_map); + + /* not contained in any MTRRs. */ + if (type == -1) + return mtrr_default_type(mtrr_state); + return type; } EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); -- cgit v1.2.3 From 41dbc6bcd9722bba8f09256655d060af30c63d34 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Jul 2015 08:22:45 +0200 Subject: KVM: x86: introduce kvm_check_has_quirk The logic of the disabled_quirks field usually results in a double negation. Wrap it in a simple function that checks the bit and negates it. Based on a patch from Xiao Guangrong. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/x86.h | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 954e98a8c2e3..1c425443a41a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_LINT0_REENABLED)) apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bbc678a66b18..8cbec765b08d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1672,7 +1672,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index edc8cdcd786b..0ca2f3e4803c 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -147,6 +147,11 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) +{ + return !(kvm->arch.disabled_quirks & quirk); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); void kvm_set_pending_timer(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From fb279950ba02e3210a16b11ecfa8871f3ee0ca49 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 16 Jul 2015 03:25:56 +0800 Subject: KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED OVMF depends on WB to boot fast, because it only clears caches after it has set up MTRRs---which is too late. Let's do writeback if CR0.CD is set to make it happy, similar to what SVM is already doing. Signed-off-by: Xiao Guangrong Tested-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b4e9384717a..e9e1fc517ab5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8650,7 +8650,10 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0(vcpu) & X86_CR0_CD) { ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_UNCACHABLE; + if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; goto exit; } -- cgit v1.2.3 From 0da029ed7ee5fdf49a2a0e14160c3e9999be9292 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Jul 2015 08:24:42 +0200 Subject: KVM: x86: rename quirk constants to KVM_X86_QUIRK_* Make them clearly architecture-dependent; the capability is valid for all architectures, but the argument is not. Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 4 ++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a4ae82eb82aa..cd54147cb365 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -354,7 +354,7 @@ struct kvm_xcrs { struct kvm_sync_regs { }; -#define KVM_QUIRK_LINT0_REENABLED (1 << 0) -#define KVM_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1c425443a41a..2a5ca97c263b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_LINT0_REENABLED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) apic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8cbec765b08d..8e0c0844c6b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1672,7 +1672,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e9e1fc517ab5..83b7b5cd75d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8650,7 +8650,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (kvm_read_cr0(vcpu) & X86_CR0_CD) { ipat = VMX_EPT_IPAT_BIT; - if (kvm_check_has_quirk(vcpu->kvm, KVM_QUIRK_CD_NW_CLEARED)) + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; -- cgit v1.2.3 From 8a0a5da6d9cbf1b142115ff6e6b253a82633c3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2015 16:13:43 +0200 Subject: x86/mm: Fix newly introduced printk format warnings Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9d4a33017fd..b9c78f3bcd67 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -118,8 +118,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, last_pfn = last_addr >> PAGE_SHIFT; if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, __ioremap_check_ram) == 1) { - WARN_ONCE(1, "ioremap on RAM at 0x%llx - 0x%llx\n", - phys_addr, last_addr); + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); return NULL; } -- cgit v1.2.3 From c0c3322e98029e752526d906d9e3a680ed213c03 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 24 Jul 2015 14:16:43 +0200 Subject: x86/asm/entry/32: Revert 'Do not use R9 in SYSCALL32' commit This change reverts most of commit 53e9accf0f 'Do not use R9 in SYSCALL32'. I don't yet understand how, but code in that commit sometimes fails to preserve EBP. See https://bugzilla.kernel.org/show_bug.cgi?id=101061 "Problems while executing 32-bit code on AMD64" Reported-and-tested-by: Krzysztof A. Sobiecki Signed-off-by: Denys Vlasenko Cc: Linus Torvalds Cc: Steven Rostedt Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: Alexei Starovoitov Cc: Will Drewry Cc: Kees Cook CC: x86@kernel.org Link: http://lkml.kernel.org/r/1437740203-11552-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_64_compat.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bb187a6a877c..5a1844765a7a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -205,7 +205,6 @@ sysexit_from_sys_call: movl RDX(%rsp), %edx /* arg3 */ movl RSI(%rsp), %ecx /* arg4 */ movl RDI(%rsp), %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ .endm .macro auditsys_exit exit @@ -236,6 +235,7 @@ sysexit_from_sys_call: sysenter_auditsys: auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ jmp sysenter_dispatch sysexit_audit: @@ -336,7 +336,7 @@ ENTRY(entry_SYSCALL_compat) * 32-bit zero extended: */ ASM_STAC -1: movl (%r8), %ebp +1: movl (%r8), %r9d _ASM_EXTABLE(1b, ia32_badarg) ASM_CLAC orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -346,7 +346,7 @@ ENTRY(entry_SYSCALL_compat) cstar_do_call: /* 32-bit syscall -> 64-bit C ABI argument conversion */ movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ + /* r9 already loaded */ /* arg6 */ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ movl %ebx, %edi /* arg1 */ movl %edx, %edx /* arg3 (zero extension) */ @@ -358,7 +358,6 @@ cstar_dispatch: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: - movl RCX(%rsp), %ebp DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -392,7 +391,9 @@ sysretl_from_sys_call: #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: + movl %r9d, R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common + movl R9(%rsp), %r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -404,14 +405,16 @@ cstar_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif + xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) movq %rax, R10(%rsp) - movq %rax, R9(%rsp) + movq %r9, R9(%rsp) movq %rax, R8(%rsp) movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter + movl R9(%rsp), %r9d /* Reload arg registers from stack. (see sysenter_tracesys) */ movl RCX(%rsp), %ecx @@ -421,6 +424,7 @@ cstar_tracesys: movl %eax, %eax /* zero extension */ RESTORE_EXTRA_REGS + xchgl %ebp, %r9d jmp cstar_do_call END(entry_SYSCALL_compat) -- cgit v1.2.3 From 2c534c0da0a68418693e10ce1c4146e085f39518 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 21 Jul 2015 15:55:09 +0100 Subject: perf/x86/intel/cqm: Return cached counter value from IRQ context Peter reported the following potential crash which I was able to reproduce with his test program, [ 148.765788] ------------[ cut here ]------------ [ 148.765796] WARNING: CPU: 34 PID: 2840 at kernel/smp.c:417 smp_call_function_many+0xb6/0x260() [ 148.765797] Modules linked in: [ 148.765800] CPU: 34 PID: 2840 Comm: perf Not tainted 4.2.0-rc1+ #4 [ 148.765803] ffffffff81cdc398 ffff88085f105950 ffffffff818bdfd5 0000000000000007 [ 148.765805] 0000000000000000 ffff88085f105990 ffffffff810e413a 0000000000000000 [ 148.765807] ffffffff82301080 0000000000000022 ffffffff8107f640 ffffffff8107f640 [ 148.765809] Call Trace: [ 148.765810] [] dump_stack+0x45/0x57 [ 148.765818] [] warn_slowpath_common+0x8a/0xc0 [ 148.765822] [] ? intel_cqm_stable+0x60/0x60 [ 148.765824] [] ? intel_cqm_stable+0x60/0x60 [ 148.765825] [] warn_slowpath_null+0x1a/0x20 [ 148.765827] [] smp_call_function_many+0xb6/0x260 [ 148.765829] [] ? intel_cqm_stable+0x60/0x60 [ 148.765831] [] on_each_cpu_mask+0x28/0x60 [ 148.765832] [] intel_cqm_event_count+0x7f/0xe0 [ 148.765836] [] perf_output_read+0x2a5/0x400 [ 148.765839] [] perf_output_sample+0x31a/0x590 [ 148.765840] [] ? perf_prepare_sample+0x26d/0x380 [ 148.765841] [] perf_event_output+0x47/0x60 [ 148.765843] [] __perf_event_overflow+0x215/0x240 [ 148.765844] [] perf_event_overflow+0x14/0x20 [ 148.765847] [] intel_pmu_handle_irq+0x1d4/0x440 [ 148.765849] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765853] [] ? vunmap_page_range+0x19d/0x2f0 [ 148.765854] [] ? unmap_kernel_range_noflush+0x11/0x20 [ 148.765859] [] ? ghes_copy_tofrom_phys+0x11e/0x2a0 [ 148.765863] [] ? native_apic_msr_write+0x2b/0x30 [ 148.765865] [] ? x2apic_send_IPI_self+0x1d/0x20 [ 148.765869] [] ? arch_irq_work_raise+0x35/0x40 [ 148.765872] [] ? irq_work_queue+0x66/0x80 [ 148.765875] [] perf_event_nmi_handler+0x26/0x40 [ 148.765877] [] nmi_handle+0x79/0x100 [ 148.765879] [] default_do_nmi+0x42/0x100 [ 148.765880] [] do_nmi+0x83/0xb0 [ 148.765884] [] end_repeat_nmi+0x1e/0x2e [ 148.765886] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765888] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765890] [] ? __perf_event_task_sched_in+0x36/0xa0 [ 148.765891] <> [] finish_task_switch+0x156/0x210 [ 148.765898] [] __schedule+0x341/0x920 [ 148.765899] [] schedule+0x37/0x80 [ 148.765903] [] ? do_page_fault+0x2f/0x80 [ 148.765905] [] schedule_user+0x1a/0x50 [ 148.765907] [] retint_careful+0x14/0x32 [ 148.765908] ---[ end trace e33ff2be78e14901 ]--- The CQM task events are not safe to be called from within interrupt context because they require performing an IPI to read the counter value on all sockets. And performing IPIs from within IRQ context is a "no-no". Make do with the last read counter value currently event in event->count when we're invoked in this context. Reported-by: Peter Zijlstra Signed-off-by: Matt Fleming Cc: Thomas Gleixner Cc: Vikas Shivappa Cc: Kanaka Juvva Cc: Will Auld Cc: Link: http://lkml.kernel.org/r/1437490509-15373-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 188076161c1b..63eb68b73589 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -951,6 +951,14 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!cqm_group_leader(event)) return 0; + /* + * Getting up-to-date values requires an SMP IPI which is not + * possible if we're being called in interrupt context. Return + * the cached values instead. + */ + if (unlikely(in_interrupt())) + goto out; + /* * Notice that we don't perform the reading of an RMID * atomically, because we can't hold a spin lock across the -- cgit v1.2.3 From 1a4e8795711f474b31ff6eac37f3efd304ed8a93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Jul 2015 10:27:37 +0200 Subject: x86/mm/pat: Revert 'Adjust default caching mode translation tables' Toshi explains: "No, the default values need to be set to the fallback types, i.e. minimal supported mode. For WC and WT, UC is the fallback type. When PAT is disabled, pat_init() does update the tables below to enable WT per the default BIOS setup. However, when PAT is enabled, but CPU has PAT -errata, WT falls back to UC per the default values." Revert: ca1fec58bc6a 'x86/mm/pat: Adjust default caching mode translation tables' Requested-by: Toshi Kani Cc: Jan Beulich Link: http://lkml.kernel.org/r/1437577776.3214.252.camel@hp.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a4532229f51..8533b46e6bee 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -43,18 +43,18 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, - [_PAGE_CACHE_MODE_WT ] = _PAGE_PWT | 0, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WT, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -- cgit v1.2.3