diff options
author | Ingo Molnar <mingo@kernel.org> | 2018-01-30 15:08:27 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-01-30 15:08:27 +0100 |
commit | 7e86548e2cc8d308cb75439480f428137151b0de (patch) | |
tree | fa3bcdedb64f4642a21080bc2b4ddc69ce2b2285 /arch/x86 | |
parent | 64e16720ea0879f8ab4547e3b9758936d483909b (diff) | |
parent | d8a5b80568a9cb66810e75b182018e9edb68e8ff (diff) | |
download | linux-stable-7e86548e2cc8d308cb75439480f428137151b0de.tar.gz linux-stable-7e86548e2cc8d308cb75439480f428137151b0de.tar.bz2 linux-stable-7e86548e2cc8d308cb75439480f428137151b0de.zip |
Merge tag 'v4.15' into x86/pti, to be able to merge dependent changes
Time has come to switch PTI development over to a v4.15 base - we'll still
try to make sure that all PTI fixes backport cleanly to v4.14 and earlier.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
234 files changed, 7358 insertions, 4591 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1819161cc6c..20da391b5f32 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,8 +55,7 @@ config X86 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 - # Causing hangs/crashes, see the commit that added this change for details. - select ARCH_HAS_REFCOUNT if BROKEN + select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN @@ -94,8 +93,10 @@ config X86 select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP + select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC select GENERIC_IRQ_MIGRATION if SMP select GENERIC_IRQ_PROBE + select GENERIC_IRQ_RESERVATION_MODE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP select GENERIC_SMP_IDLE_THREAD @@ -111,7 +112,6 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB - select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT @@ -1443,7 +1443,7 @@ config ARCH_DMA_ADDR_T_64BIT config X86_DIRECT_GBPAGES def_bool y - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK + depends on X86_64 && !DEBUG_PAGEALLOC ---help--- Certain kernel features effectively disable kernel linear 1 GB mappings (even if the CPU otherwise @@ -1817,6 +1817,22 @@ config X86_SMAP If unsure, say Y. +config X86_INTEL_UMIP + def_bool y + depends on CPU_SUP_INTEL + prompt "Intel User Mode Instruction Prevention" if EXPERT + ---help--- + The User Mode Instruction Prevention (UMIP) is a security + feature in newer Intel processors. If enabled, a general + protection fault is issued if the SGDT, SLDT, SIDT, SMSW + or STR instructions are executed in user mode. These instructions + unnecessarily expose information about the hardware state. + + The vast majority of applications do not use these instructions. + For the very few that do, software emulation is provided in + specific cases in protected and virtual-8086 modes. Emulated + results are dummy. + config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 6293a8768a91..672441c008c7 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT + depends on !STACKDEPOT ---help--- This option enables the "guess" unwinder for unwinding kernel stack traces. It scans the stack and reports every kernel text address it diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 504b1a4535ac..fad55160dcb9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32 endif export CONFIG_X86_X32_ABI -# Don't unroll struct assignments with kmemcheck enabled -ifeq ($(CONFIG_KMEMCHECK),y) - KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) -endif - # # If the function graph tracer is used with mcount instead of fentry, # '-maccumulate-outgoing-args' is needed to prevent a GCC bug diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index e3cf9f682be5..09d25dd09307 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -7,3 +7,6 @@ zoffset.h setup setup.bin setup.elf +fdimage +mtools.conf +image.iso diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index d88a2fddba8c..9b5adae9cc40 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -123,63 +123,26 @@ image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) $(obj)/mtools.conf: $(src)/mtools.conf.in sed -e 's|@OBJ@|$(obj)|g' < $< > $@ +quiet_cmd_genimage = GENIMAGE $3 +cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) + # This requires write access to /dev/fd0 bzdisk: $(obj)/bzImage $(obj)/mtools.conf - MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync - syslinux /dev/fd0 ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux ; sync + $(call cmd,genimage,bzdisk,/dev/fd0) # These require being root or having syslinux 2.02 or higher installed fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 - MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux ; sync + $(call cmd,genimage,fdimage144,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' fdimage288: $(obj)/bzImage $(obj)/mtools.conf - dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 - MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync - syslinux $(obj)/fdimage ; sync - echo '$(image_cmdline)' | \ - MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \ - fi - MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux ; sync + $(call cmd,genimage,fdimage288,$(obj)/fdimage) + @$(kecho) 'Kernel: $(obj)/fdimage is ready' isoimage: $(obj)/bzImage - -rm -rf $(obj)/isoimage - mkdir $(obj)/isoimage - for i in lib lib64 share end ; do \ - if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ - cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ - if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \ - cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \ - fi ; \ - break ; \ - fi ; \ - if [ $$i = end ] ; then exit 1 ; fi ; \ - done - cp $(obj)/bzImage $(obj)/isoimage/linux - echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg - if [ -f '$(FDINITRD)' ] ; then \ - cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \ - fi - mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ - -no-emul-boot -boot-load-size 4 -boot-info-table \ - $(obj)/isoimage - isohybrid $(obj)/image.iso 2>/dev/null || true - rm -rf $(obj)/isoimage + $(call cmd,genimage,isoimage,$(obj)/image.iso) + @$(kecho) 'Kernel: $(obj)/image.iso is ready' bzlilo: $(obj)/bzImage if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4b7575b00563..f25e1530e064 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -36,6 +36,7 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -78,6 +79,8 @@ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-y += $(obj)/pgtable_64.o endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index beb255b66447..fc313e29fe2c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -131,6 +131,19 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ + /* + * If SEV is active then set the encryption mask in the page tables. + * This will insure that when the kernel is copied and decompressed + * it will be done so encrypted. + */ + call get_sev_encryption_bit + xorl %edx, %edx + testl %eax, %eax + jz 1f + subl $32, %eax /* Encryption bit is always above bit 31 */ + bts %eax, %edx /* Set encryption mask for page tables */ +1: + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax @@ -141,12 +154,14 @@ ENTRY(startup_32) leal pgtable + 0(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) + addl %edx, 4(%edi) /* Build Level 3 */ leal pgtable + 0x1000(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) + addl %edx, 0x04(%edi) addl $0x00001000, %eax addl $8, %edi decl %ecx @@ -157,6 +172,7 @@ ENTRY(startup_32) movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) + addl %edx, 4(%edi) addl $0x00200000, %eax addl $8, %edi decl %ecx @@ -289,10 +305,18 @@ ENTRY(startup_64) leaq boot_stack_end(%rbx), %rsp #ifdef CONFIG_X86_5LEVEL - /* Check if 5-level paging has already enabled */ - movq %cr4, %rax - testl $X86_CR4_LA57, %eax - jnz lvl5 + /* + * Check if we need to enable 5-level paging. + * RSI holds real mode data and need to be preserved across + * a function call. + */ + pushq %rsi + call l5_paging_required + popq %rsi + + /* If l5_paging_required() returned zero, we're done here. */ + cmpq $0, %rax + je lvl5 /* * At this point we are in long mode with 4-level paging enabled, diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index a63fbc25ce84..8199a6187251 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -171,7 +171,6 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) static void mem_avoid_memmap(char *str) { static int i; - int rc; if (i >= MAX_MEMMAP_REGIONS) return; @@ -219,7 +218,7 @@ static int handle_mem_memmap(void) return 0; tmp_cmdline = malloc(len + 1); - if (!tmp_cmdline ) + if (!tmp_cmdline) error("Failed to allocate space for tmp_cmdline"); memcpy(tmp_cmdline, args, len); @@ -363,7 +362,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, cmd_line |= boot_params->hdr.cmd_line_ptr; /* Calculate size of cmd_line. */ ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + for (cmd_line_size = 0; ptr[cmd_line_size++];) ; mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S new file mode 100644 index 000000000000..54f5f6625a73 --- /dev/null +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -0,0 +1,120 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/asm-offsets.h> + + .text + .code32 +ENTRY(get_sev_encryption_bit) + xor %eax, %eax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %ebx + push %ecx + push %edx + push %edi + + /* + * RIP-relative addressing is needed to access the encryption bit + * variable. Since we are running in 32-bit mode we need this call/pop + * sequence to get the proper relative addressing. + */ + call 1f +1: popl %edi + subl $1b, %edi + + movl enc_bit(%edi), %eax + cmpl $0, %eax + jge .Lsev_exit + + /* Check if running under a hypervisor */ + movl $1, %eax + cpuid + bt $31, %ecx /* Check the hypervisor bit */ + jnc .Lno_sev + + movl $0x80000000, %eax /* CPUID to check the highest leaf */ + cpuid + cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ + jb .Lno_sev + + /* + * Check for the SEV feature: + * CPUID Fn8000_001F[EAX] - Bit 1 + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + movl $0x8000001f, %eax + cpuid + bt $1, %eax /* Check if SEV is available */ + jnc .Lno_sev + + movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ + rdmsr + bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */ + jnc .Lno_sev + + movl %ebx, %eax + andl $0x3f, %eax /* Return the encryption bit location */ + movl %eax, enc_bit(%edi) + jmp .Lsev_exit + +.Lno_sev: + xor %eax, %eax + movl %eax, enc_bit(%edi) + +.Lsev_exit: + pop %edi + pop %edx + pop %ecx + pop %ebx + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + + ret +ENDPROC(get_sev_encryption_bit) + + .code64 +ENTRY(get_sev_encryption_mask) + xor %rax, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + push %rbp + push %rdx + + movq %rsp, %rbp /* Save current stack pointer */ + + call get_sev_encryption_bit /* Get the encryption bit position */ + testl %eax, %eax + jz .Lno_sev_mask + + xor %rdx, %rdx + bts %rax, %rdx /* Create the encryption mask */ + mov %rdx, %rax /* ... and return it */ + +.Lno_sev_mask: + movq %rbp, %rsp /* Restore original stack pointer */ + + pop %rdx + pop %rbp +#endif + + ret +ENDPROC(get_sev_encryption_mask) + + .data +enc_bit: + .int 0xffffffff diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b50c42455e25..98761a1576ce 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -169,6 +169,16 @@ void __puthex(unsigned long value) } } +static bool l5_supported(void) +{ + /* Check if leaf 7 is supported. */ + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); +} + #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); + if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { + error("This linux kernel as configured requires 5-level paging\n" + "This CPU does not support the required 'cr4.la57' feature\n" + "Unable to boot - please use a kernel appropriate for your CPU\n"); + } + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 32d4ec2e0243..9d323dc6b159 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -109,4 +109,6 @@ static inline void console_init(void) { } #endif +unsigned long get_sev_encryption_mask(void); + #endif diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index e691ff734cb5..b5e5e02f8cde 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -80,16 +80,18 @@ static unsigned long top_level_pgt; * Mapping information structure passed to kernel_ident_mapping_init(). * Due to relocation, pointers must be assigned at run time not build time. */ -static struct x86_mapping_info mapping_info = { - .page_flag = __PAGE_KERNEL_LARGE_EXEC, -}; +static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { + unsigned long sev_me_mask = get_sev_encryption_mask(); + /* Init mapping_info with run-time function/buffer pointers. */ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..b4469a37e9a1 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,28 @@ +#include <asm/processor.h> + +/* + * __force_order is used by special_insns.h asm code to force instruction + * serialization. + * + * It is not referenced from the code, but GCC < 5 with -fPIE would fail + * due to an undefined symbol. Define it to make these ancient GCCs work. + */ +unsigned long __force_order; + +int l5_paging_required(void) +{ + /* Check if leaf 7 is supported. */ + + if (native_cpuid_eax(0) < 7) + return 0; + + /* Check if la57 is supported. */ + if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) + return 0; + + /* Check if 5-level paging has already been enabled. */ + if (native_read_cr4() & X86_CR4_LA57) + return 0; + + return 1; +} diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh new file mode 100644 index 000000000000..6a10d52a4145 --- /dev/null +++ b/arch/x86/boot/genimage.sh @@ -0,0 +1,130 @@ +#!/bin/sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com> +# +# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others +# +# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture +# +# Arguments: +# $1 - fdimage format +# $2 - target image file +# $3 - kernel bzImage file +# $4 - mtool configuration file +# $5 - kernel cmdline +# $6 - inird image file +# + +# Use "make V=1" to debug this script +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo "" 1>&2 + exit 1 + fi +} + + +export MTOOLSRC=$4 +FIMAGE=$2 +FBZIMAGE=$3 +KCMDLINE=$5 +FDINITRD=$6 + +# Make sure the files actually exist +verify "$FBZIMAGE" + +genbzdisk() { + verify "$MTOOLSRC" + mformat a: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - a:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" a:initrd.img + fi + mcopy $FBZIMAGE a:linux +} + +genfdimage144() { + verify "$MTOOLSRC" + dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null + mformat v: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - v:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" v:initrd.img + fi + mcopy $FBZIMAGE v:linux +} + +genfdimage288() { + verify "$MTOOLSRC" + dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null + mformat w: + syslinux $FIMAGE + echo "$KCMDLINE" | mcopy - W:syslinux.cfg + if [ -f "$FDINITRD" ] ; then + mcopy "$FDINITRD" w:initrd.img + fi + mcopy $FBZIMAGE w:linux +} + +geniso() { + tmp_dir=`dirname $FIMAGE`/isoimage + rm -rf $tmp_dir + mkdir $tmp_dir + for i in lib lib64 share ; do + for j in syslinux ISOLINUX ; do + if [ -f /usr/$i/$j/isolinux.bin ] ; then + isolinux=/usr/$i/$j/isolinux.bin + fi + done + for j in syslinux syslinux/modules/bios ; do + if [ -f /usr/$i/$j/ldlinux.c32 ]; then + ldlinux=/usr/$i/$j/ldlinux.c32 + fi + done + if [ -n "$isolinux" -a -n "$ldlinux" ] ; then + break + fi + done + if [ -z "$isolinux" ] ; then + echo 'Need an isolinux.bin file, please install syslinux/isolinux.' + exit 1 + fi + if [ -z "$ldlinux" ] ; then + echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' + exit 1 + fi + cp $isolinux $tmp_dir + cp $ldlinux $tmp_dir + cp $FBZIMAGE $tmp_dir/linux + echo "$KCMDLINE" > $tmp_dir/isolinux.cfg + if [ -f "$FDINITRD" ] ; then + cp "$FDINITRD" $tmp_dir/initrd.img + fi + genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ + -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ + -boot-info-table $tmp_dir + isohybrid $FIMAGE 2>/dev/null || true + rm -rf $tmp_dir +} + +case $1 in + bzdisk) genbzdisk;; + fdimage144) genfdimage144;; + fdimage288) genfdimage288;; + isoimage) geniso;; + *) echo 'Unknown image format'; exit 1; +esac diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 9c7ea597eee6..850b8762e889 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -17,7 +17,6 @@ */ #include <asm/segment.h> -#include <generated/utsrelease.h> #include <asm/boot.h> #include <asm/page_types.h> #include <asm/setup.h> diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 45bc9402aa49..a14c5178d4ba 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -241,9 +241,9 @@ static int vga_probe(void) vga_modes, }; static int mode_count[] = { - sizeof(cga_modes)/sizeof(struct mode_info), - sizeof(ega_modes)/sizeof(struct mode_info), - sizeof(vga_modes)/sizeof(struct mode_info), + ARRAY_SIZE(cga_modes), + ARRAY_SIZE(ega_modes), + ARRAY_SIZE(vga_modes), }; struct biosregs ireg, oreg; diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 5c15d6b57329..3bf3dcf29825 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -28,6 +28,7 @@ #include <crypto/cryptd.h> #include <crypto/ctr.h> #include <crypto/b128ops.h> +#include <crypto/gcm.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> #include <asm/fpu/api.h> @@ -1067,9 +1068,10 @@ static struct skcipher_alg aesni_skciphers[] = { } }; +static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; -struct { +static struct { const char *algname; const char *drvname; const char *basename; @@ -1131,7 +1133,7 @@ static struct aead_alg aesni_aead_algs[] = { { .setauthsize = common_rfc4106_set_authsize, .encrypt = helper_rfc4106_encrypt, .decrypt = helper_rfc4106_decrypt, - .ivsize = 8, + .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { .cra_name = "__gcm-aes-aesni", @@ -1149,7 +1151,7 @@ static struct aead_alg aesni_aead_algs[] = { { .setauthsize = rfc4106_set_authsize, .encrypt = rfc4106_encrypt, .decrypt = rfc4106_decrypt, - .ivsize = 8, + .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { .cra_name = "rfc4106(gcm(aes))", @@ -1165,7 +1167,7 @@ static struct aead_alg aesni_aead_algs[] = { { .setauthsize = generic_gcmaes_set_authsize, .encrypt = generic_gcmaes_encrypt, .decrypt = generic_gcmaes_decrypt, - .ivsize = 12, + .ivsize = GCM_AES_IV_SIZE, .maxauthsize = 16, .base = { .cra_name = "gcm(aes)", diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index f247304299a2..1c099dc08cc3 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -41,6 +41,7 @@ #include <asm/inst.h> +.section .rodata .align 16 /* * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 @@ -111,19 +112,13 @@ ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ pxor CONSTANT, %xmm1 sub $0x40, LEN add $0x40, BUF -#ifndef __x86_64__ - /* This is for position independent code(-fPIC) support for 32bit */ - call delta -delta: - pop %ecx -#endif cmp $0x40, LEN jb less_64 #ifdef __x86_64__ movdqa .Lconstant_R2R1(%rip), CONSTANT #else - movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT + movdqa .Lconstant_R2R1, CONSTANT #endif loop_64:/* 64 bytes Full cache line folding */ @@ -172,7 +167,7 @@ less_64:/* Folding cache line into 128bit */ #ifdef __x86_64__ movdqa .Lconstant_R4R3(%rip), CONSTANT #else - movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT + movdqa .Lconstant_R4R3, CONSTANT #endif prefetchnta (BUF) @@ -220,8 +215,8 @@ fold_64: movdqa .Lconstant_R5(%rip), CONSTANT movdqa .Lconstant_mask32(%rip), %xmm3 #else - movdqa .Lconstant_R5 - delta(%ecx), CONSTANT - movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 + movdqa .Lconstant_R5, CONSTANT + movdqa .Lconstant_mask32, %xmm3 #endif psrldq $0x04, %xmm2 pand %xmm3, %xmm1 @@ -232,7 +227,7 @@ fold_64: #ifdef __x86_64__ movdqa .Lconstant_RUpoly(%rip), CONSTANT #else - movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT + movdqa .Lconstant_RUpoly, CONSTANT #endif movdqa %xmm1, %xmm2 pand %xmm3, %xmm1 diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index 399a29d067d6..cb91a64a99e7 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc, salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.src.virt.addr, walk.dst.virt.addr, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03505ffbe1b6..d7d3cc24baf4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -75,7 +75,7 @@ static long syscall_trace_enter(struct pt_regs *regs) if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (unlikely(work & _TIF_SYSCALL_EMU)) emulated = true; @@ -186,9 +186,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) addr_limit_user_check(); - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) - local_irq_disable(); - + lockdep_assert_irqs_disabled(); lockdep_sys_exit(); cached_flags = READ_ONCE(ti->flags); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b4f00984089e..a83570495162 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -53,15 +53,19 @@ ENTRY(native_usergs_sysret64) END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ +.macro TRACE_IRQS_FLAGS flags:req #ifdef CONFIG_TRACE_IRQFLAGS - bt $9, EFLAGS(%rsp) /* interrupts off? */ + bt $9, \flags /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: #endif .endm +.macro TRACE_IRQS_IRETQ + TRACE_IRQS_FLAGS EFLAGS(%rsp) +.endm + /* * When dynamic function tracer is enabled it will add a breakpoint * to all locations that it is about to modify, sync CPUs, update @@ -215,8 +219,6 @@ ENTRY(entry_SYSCALL_64) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - TRACE_IRQS_OFF - /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ @@ -237,6 +239,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ UNWIND_HINT_REGS extra=0 + TRACE_IRQS_OFF + /* * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. @@ -1110,11 +1114,13 @@ ENTRY(native_load_gs_index) FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + TRACE_IRQS_OFF SWAPGS .Lgs_change: movl %edi, %gs 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS + TRACE_IRQS_FLAGS (%rsp) popfq FRAME_END ret diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index c366c0adeb40..1943aebadede 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -130,10 +130,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 -# This makes sure the $(obj) subdirectory exists even though vdso32/ -# is not a kbuild sub-make subdirectory. -override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ - targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o targets += vdso32/vclock_gettime.o diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index fa8dbfcf7ed3..f19856d95c60 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -318,11 +318,11 @@ int gettimeofday(struct timeval *, struct timezone *) notrace time_t __vdso_time(time_t *t) { /* This is atomic on x86 so we don't need any locks. */ - time_t result = ACCESS_ONCE(gtod->wall_time_sec); + time_t result = READ_ONCE(gtod->wall_time_sec); if (t) *t = result; return result; } -int time(time_t *t) +time_t time(time_t *t) __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 0780a443a53b..4674f58581a1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -65,6 +65,7 @@ #include <linux/elf.h> #include <linux/types.h> +#include <linux/kernel.h> const char *outfilename; @@ -151,7 +152,7 @@ extern void bad_put_le(void); PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) -#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) +#define NSYMS ARRAY_SIZE(required_syms) #define BITSFUNC3(name, bits, suffix) name##bits##suffix #define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1911310959f8..5b8b556dbb12 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -112,12 +112,13 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = - pvclock_pvti_cpu0_va(); + pvclock_get_pvti_cpu0_va(); if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { - ret = vm_insert_pfn( + ret = vm_insert_pfn_prot( vma, vmf->address, - __pa(pvti) >> PAGE_SHIFT); + __pa(pvti) >> PAGE_SHIFT, + pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 3641e24fdac5..38b5d41b0c37 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -405,7 +405,7 @@ const struct attribute_group *amd_iommu_attr_groups[] = { NULL, }; -static struct pmu iommu_pmu = { +static const struct pmu iommu_pmu __initconst = { .event_init = perf_iommu_event_init, .add = perf_iommu_add, .del = perf_iommu_del, diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index a6eee5ac4f58..2aefacf5c5b2 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -277,7 +277,7 @@ static int __init amd_power_pmu_init(void) int ret; if (!x86_match_cpu(cpu_match)) - return 0; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) return -ENODEV; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 589af1eec7c1..140d33288e78 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2118,7 +2118,7 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + if (READ_ONCE(x86_pmu.attr_rdpmc)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 43445da30cea..731153a4681e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3734,6 +3734,19 @@ EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + EVENT_PTR(td_slots_issued), + EVENT_PTR(td_slots_retired), + EVENT_PTR(td_fetch_bubbles), + EVENT_PTR(td_total_slots), + EVENT_PTR(td_total_slots_scale), + EVENT_PTR(td_recovery_bubbles), + EVENT_PTR(td_recovery_bubbles_scale), + NULL +}; + +static struct attribute *hsw_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_commit), EVENT_PTR(tx_abort), @@ -3746,18 +3759,16 @@ static struct attribute *hsw_events_attrs[] = { EVENT_PTR(el_conflict), EVENT_PTR(cycles_t), EVENT_PTR(cycles_ct), - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), - EVENT_PTR(td_slots_issued), - EVENT_PTR(td_slots_retired), - EVENT_PTR(td_fetch_bubbles), - EVENT_PTR(td_total_slots), - EVENT_PTR(td_total_slots_scale), - EVENT_PTR(td_recovery_bubbles), - EVENT_PTR(td_recovery_bubbles_scale), NULL }; +static __init struct attribute **get_hsw_events_attrs(void) +{ + return boot_cpu_has(X86_FEATURE_RTM) ? + merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) : + hsw_events_attrs; +} + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -3836,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = { __init int intel_pmu_init(void) { + struct attribute **extra_attr = NULL; + struct attribute **to_free = NULL; union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; @@ -3843,7 +3856,6 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; - struct attribute **extra_attr = NULL; char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -4186,7 +4198,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.cpu_events = get_hsw_events_attrs(); x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; @@ -4225,7 +4237,7 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.cpu_events = get_hsw_events_attrs(); x86_pmu.limit_period = bdw_limit_period; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; @@ -4283,7 +4295,8 @@ __init int intel_pmu_init(void) extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; extra_attr = merge_attr(extra_attr, skl_format_attr); - x86_pmu.cpu_events = hsw_events_attrs; + to_free = extra_attr; + x86_pmu.cpu_events = get_hsw_events_attrs(); intel_pmu_pebs_data_source_skl( boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); @@ -4390,6 +4403,7 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8156e47da7ba..18c25ab28557 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu) static void release_pebs_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs) return; kfree(per_cpu(insn_buffer, cpu)); @@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu) /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - ds->pebs_buffer_base = 0; dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); hwev->ds_pebs_vaddr = NULL; } @@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu) static void release_bts_buffer(int cpu) { struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); - struct debug_store *ds = hwev->ds; void *cea; - if (!ds || !x86_pmu.bts) + if (!x86_pmu.bts) return; /* Clear the fixmap */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; ds_clear_cea(cea, BTS_BUFFER_SIZE); - ds->bts_buffer_base = 0; dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); hwev->ds_bts_vaddr = NULL; } @@ -454,16 +450,22 @@ void release_ds_buffers(void) if (!x86_pmu.bts && !x86_pmu.pebs) return; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + + for_each_possible_cpu(cpu) { + /* + * Again, ignore errors from offline CPUs, they will no longer + * observe cpu_hw_events.ds and not program the DS_AREA when + * they come up. + */ fini_debug_store_on_cpu(cpu); + } for_each_possible_cpu(cpu) { release_pebs_buffer(cpu); release_bts_buffer(cpu); - release_ds_buffer(cpu); } - put_online_cpus(); } void reserve_ds_buffers(void) @@ -483,8 +485,6 @@ void reserve_ds_buffers(void) if (!x86_pmu.pebs) pebs_err = 1; - get_online_cpus(); - for_each_possible_cpu(cpu) { if (alloc_ds_buffer(cpu)) { bts_err = 1; @@ -521,11 +521,14 @@ void reserve_ds_buffers(void) if (x86_pmu.pebs && !pebs_err) x86_pmu.pebs_active = 1; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { + /* + * Ignores wrmsr_on_cpu() errors for offline CPUs they + * will get this call through intel_pmu_cpu_starting(). + */ init_debug_store_on_cpu(cpu); + } } - - put_online_cpus(); } /* diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 005908ee9333..a2efb490f743 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -755,14 +755,14 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d45e06346f14..7874c980d569 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -975,10 +975,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) int i, phys_id, pkg; phys_id = uncore_pcibus_to_physid(pdev->bus); - pkg = topology_phys_to_logical_pkg(phys_id); box = pci_get_drvdata(pdev); if (!box) { + pkg = topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { uncore_extra_pci_dev[pkg].dev[i] = NULL; @@ -994,7 +994,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) return; pci_set_drvdata(pdev, NULL); - pmu->boxes[pkg] = NULL; + pmu->boxes[box->pkgid] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); uncore_box_exit(box); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 4364191e7c6b..414dc7e7c950 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -100,7 +100,7 @@ struct intel_uncore_extra_reg { struct intel_uncore_box { int pci_phys_id; - int pkgid; + int pkgid; /* Logical package ID */ int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 95cb19f4e06f..6d8044ab1060 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1057,7 +1057,7 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - int pkg = topology_phys_to_logical_pkg(box->pci_phys_id); + int pkg = box->pkgid; struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; if (filter_pdev) { @@ -3035,11 +3035,19 @@ static struct intel_uncore_type *bdx_msr_uncores[] = { NULL, }; +/* Bit 7 'Use Occupancy' is not available for counter 0 on BDX */ +static struct event_constraint bdx_uncore_pcu_constraints[] = { + EVENT_CONSTRAINT(0x80, 0xe, 0x80), + EVENT_CONSTRAINT_END +}; + void bdx_uncore_cpu_init(void) { if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + + hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } static struct intel_uncore_type bdx_uncore_ha = { diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index a0b86cf486e0..189a398290db 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -210,9 +210,10 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -void hyperv_report_panic(struct pt_regs *regs) +void hyperv_report_panic(struct pt_regs *regs, long err) { static bool panic_reported; + u64 guest_id; /* * We prefer to report panic on 'die' chain as we have proper @@ -223,11 +224,13 @@ void hyperv_report_panic(struct pt_regs *regs) return; panic_reported = true; - wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip); - wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax); - wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx); - wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx); - wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx); + rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + + wrmsrl(HV_X64_MSR_CRASH_P0, err); + wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); /* * Let Hyper-V know there is crash data available diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 5f01671c68f2..98722773391d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -53,6 +53,15 @@ extern int local_apic_timer_c2_ok; extern int disable_apic; extern unsigned int lapic_timer_frequency; +extern enum apic_intr_mode_id apic_intr_mode; +enum apic_intr_mode_id { + APIC_PIC, + APIC_VIRTUAL_WIRE, + APIC_VIRTUAL_WIRE_NO_CONFIG, + APIC_SYMMETRIC_IO, + APIC_SYMMETRIC_IO_NO_ROUTING +}; + #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); #else /* CONFIG_SMP */ @@ -128,13 +137,13 @@ extern void disable_local_APIC(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); +extern void apic_intr_mode_init(void); extern void setup_local_APIC(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); -extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 static inline int apic_force_enable(unsigned long addr) @@ -145,7 +154,7 @@ static inline int apic_force_enable(unsigned long addr) extern int apic_force_enable(unsigned long addr); #endif -extern int apic_bsp_setup(bool upmode); +extern void apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); /* @@ -161,6 +170,10 @@ static inline int apic_is_clustered_box(void) #endif extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); +extern void lapic_assign_system_vectors(void); +extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); +extern void lapic_online(void); +extern void lapic_offline(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } @@ -170,6 +183,9 @@ static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop static inline void lapic_update_tsc_freq(void) { } +static inline void apic_intr_mode_init(void) { } +static inline void lapic_assign_system_vectors(void) { } +static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -265,73 +281,63 @@ struct irq_data; * James Cleverdon. */ struct apic { - char *name; - - int (*probe)(void); - int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(int apicid); - int (*apic_id_registered)(void); - - u32 irq_delivery_mode; - u32 irq_dest_mode; - - const struct cpumask *(*target_cpus)(void); - - int disable_esr; - - int dest_logical; - unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); - - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, - const struct cpumask *mask); - void (*init_apic_ldr)(void); - - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - - void (*setup_apic_routing)(void); - int (*cpu_present_to_apicid)(int mps_cpu); - void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - int (*check_phys_apicid_present)(int phys_apicid); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); - - unsigned int (*get_apic_id)(unsigned long x); - /* Can't be NULL on 64-bit */ - unsigned long (*set_apic_id)(unsigned int id); - - int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, - struct irq_data *irqdata, - unsigned int *apicid); - - /* ipi */ - void (*send_IPI)(int cpu, int vector); - void (*send_IPI_mask)(const struct cpumask *mask, int vector); - void (*send_IPI_mask_allbutself)(const struct cpumask *mask, - int vector); - void (*send_IPI_allbutself)(int vector); - void (*send_IPI_all)(int vector); - void (*send_IPI_self)(int vector); + /* Hotpath functions first */ + void (*eoi_write)(u32 reg, u32 v); + void (*native_eoi_write)(u32 reg, u32 v); + void (*write)(u32 reg, u32 v); + u32 (*read)(u32 reg); + + /* IPI related functions */ + void (*wait_icr_idle)(void); + u32 (*safe_wait_icr_idle)(void); + + void (*send_IPI)(int cpu, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + + /* dest_logical is used by the IPI functions */ + u32 dest_logical; + u32 disable_esr; + u32 irq_delivery_mode; + u32 irq_dest_mode; + + /* Functions and data related to vector allocation */ + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, + const struct cpumask *mask); + int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, + struct irq_data *irqdata, + unsigned int *apicid); + u32 (*calc_dest_apicid)(unsigned int cpu); + + /* ICR related functions */ + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + + /* Probe, setup and smpboot functions */ + int (*probe)(void); + int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); + int (*apic_id_valid)(int apicid); + int (*apic_id_registered)(void); + + bool (*check_apicid_used)(physid_mask_t *map, int apicid); + void (*init_apic_ldr)(void); + void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); + void (*setup_apic_routing)(void); + int (*cpu_present_to_apicid)(int mps_cpu); + void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); + int (*check_phys_apicid_present)(int phys_apicid); + int (*phys_pkg_id)(int cpuid_apic, int index_msb); + + u32 (*get_apic_id)(unsigned long x); + u32 (*set_apic_id)(unsigned int id); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - void (*inquire_remote_apic)(int apicid); - - /* apic ops */ - u32 (*read)(u32 reg); - void (*write)(u32 reg, u32 v); - /* - * ->eoi_write() has the same signature as ->write(). - * - * Drivers can support both ->eoi_write() and ->write() by passing the same - * callback value. Kernel can override ->eoi_write() and fall back - * on write for EOI. - */ - void (*eoi_write)(u32 reg, u32 v); - void (*native_eoi_write)(u32 reg, u32 v); - u64 (*icr_read)(void); - void (*icr_write)(u32 low, u32 high); - void (*wait_icr_idle)(void); - u32 (*safe_wait_icr_idle)(void); + void (*inquire_remote_apic)(int apicid); #ifdef CONFIG_X86_32 /* @@ -346,6 +352,7 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); #endif + char *name; }; /* @@ -380,6 +387,7 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; */ #ifdef CONFIG_SMP extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -463,84 +471,33 @@ static inline unsigned default_get_apic_id(unsigned long x) extern void apic_send_IPI_self(int vector); DECLARE_PER_CPU(int, x2apic_extra_bits); - -extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int phys_apicid); #endif extern void generic_bigsmp_probe(void); - #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline const struct cpumask *default_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - -static inline const struct cpumask *online_target_cpus(void) -{ - return cpu_online_mask; -} - DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); +extern struct apic apic_noop; static inline unsigned int read_apic_id(void) { - unsigned int reg; - - reg = apic_read(APIC_ID); + unsigned int reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } -static inline int default_apic_id_valid(int apicid) -{ - return (apicid < 255); -} - +extern int default_apic_id_valid(int apicid); extern int default_acpi_madt_oem_check(char *, char *); - extern void default_setup_apic_routing(void); -extern struct apic apic_noop; - -#ifdef CONFIG_X86_32 - -static inline int noop_x86_32_early_logical_apicid(int cpu) -{ - return BAD_APICID; -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -extern void default_init_apic_ldr(void); - -static inline int default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - -static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - -#endif +extern u32 apic_default_calc_apicid(unsigned int cpu); +extern u32 apic_flat_calc_apicid(unsigned int cpu); extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, @@ -548,71 +505,17 @@ extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, extern int default_cpu_mask_to_apicid(const struct cpumask *cpumask, struct irq_data *irqdata, unsigned int *apicid); - -static inline void -flat_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - -static inline void -default_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_copy(retmask, cpumask_of(cpu)); -} - -static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) -{ - return physid_isset(apicid, *map); -} - -static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - *retmap = *phys_map; -} - -static inline int __default_cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) - return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); - else - return BAD_APICID; -} - -static inline int -__default_check_phys_apicid_present(int phys_apicid) -{ - return physid_isset(phys_apicid, phys_cpu_present_map); -} - -#ifdef CONFIG_X86_32 -static inline int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -static inline int -default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#else +extern bool default_check_apicid_used(physid_mask_t *map, int apicid); +extern void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask); +extern void default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask); +extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); -#endif #endif /* CONFIG_X86_LOCAL_APIC */ + extern void irq_enter(void); extern void irq_exit(void); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..386a6900e206 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -136,6 +136,7 @@ #endif #ifndef __ASSEMBLY__ +#ifndef __BPF__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -145,5 +146,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif +#endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 01727dbc294a..7fb336210e1b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") @@ -31,7 +31,11 @@ #endif #define dma_wmb() barrier() -#define __smp_mb() mb() +#ifdef CONFIG_X86_32 +#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc") +#else +#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") +#endif #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index a600a6cda9ec..2cbd75dd2fd3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -210,7 +210,6 @@ typedef struct compat_siginfo { } compat_siginfo_t; #define COMPAT_OFF_T_MAX 0x7fffffff -#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL struct compat_ipc64_perm { compat_key_t key; diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 85e23bb7b34e..13c5ee878a47 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -389,7 +389,7 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) void update_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr); -extern unsigned long used_vectors[]; +extern unsigned long system_vectors[]; #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index c6a3af198294..33833d1909af 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -16,6 +16,12 @@ # define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) #endif +#ifdef CONFIG_X86_INTEL_UMIP +# define DISABLE_UMIP 0 +#else +# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -69,7 +75,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 836ca1178a6a..0350d99bb8fd 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -7,7 +7,6 @@ * Documentation/DMA-API.txt for documentation. */ -#include <linux/kmemcheck.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> #include <asm/io.h> @@ -68,13 +67,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) } #endif /* CONFIG_X86_DMA_REMAP */ -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - flush_write_buffers(); -} - static inline unsigned long dma_alloc_coherent_mask(struct device *dev, gfp_t gfp) { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 3a091cea36c5..0d157d2a1e2a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -309,6 +309,7 @@ static inline int mmap_is_ia32(void) extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); +extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 8ec99a55e6b9..2851077b6051 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,6 +16,8 @@ #include <asm/irq_vectors.h> +#define IRQ_MATRIX_BITS NR_VECTORS + #ifndef __ASSEMBLY__ #include <linux/percpu.h> @@ -97,14 +99,6 @@ struct irq_alloc_info { void *dmar_data; }; #endif -#ifdef CONFIG_HT_IRQ - struct { - int ht_pos; - int ht_idx; - struct pci_dev *ht_dev; - void *ht_update; - }; -#endif #ifdef CONFIG_X86_UV struct { int uv_limit; @@ -123,15 +117,13 @@ struct irq_alloc_info { struct irq_cfg { unsigned int dest_apicid; - u8 vector; - u8 old_vector; + unsigned int vector; }; extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h deleted file mode 100644 index 5d55df352879..000000000000 --- a/arch/x86/include/asm/hypertransport.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_HYPERTRANSPORT_H -#define _ASM_X86_HYPERTRANSPORT_H - -/* - * Constants for x86 Hypertransport Interrupts. - */ - -#define HT_IRQ_LOW_BASE 0xf8000000 - -#define HT_IRQ_LOW_VECTOR_SHIFT 16 -#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000 -#define HT_IRQ_LOW_VECTOR(v) \ - (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK) - -#define HT_IRQ_LOW_DEST_ID_SHIFT 8 -#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00 -#define HT_IRQ_LOW_DEST_ID(v) \ - (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK) - -#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000 -#define HT_IRQ_LOW_DM_LOGICAL 0x0000040 - -#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000 -#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020 - - -#define HT_IRQ_LOW_MT_FIXED 0x0000000 -#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004 -#define HT_IRQ_LOW_MT_SMI 0x0000008 -#define HT_IRQ_LOW_MT_NMI 0x000000c -#define HT_IRQ_LOW_MT_INIT 0x0000010 -#define HT_IRQ_LOW_MT_STARTUP 0x0000014 -#define HT_IRQ_LOW_MT_EXTINT 0x0000018 -#define HT_IRQ_LOW_MT_LINT1 0x000008c -#define HT_IRQ_LOW_MT_LINT0 0x0000098 - -#define HT_IRQ_LOW_IRQ_MASKED 0x0000001 - - -#define HT_IRQ_HIGH_DEST_ID_SHIFT 0 -#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff -#define HT_IRQ_HIGH_DEST_ID(v) \ - ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK) - -#endif /* _ASM_X86_HYPERTRANSPORT_H */ diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 02aff0867211..1c78580e58be 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -97,6 +97,16 @@ #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) +/* Identifiers for segment registers */ +#define INAT_SEG_REG_IGNORE 0 +#define INAT_SEG_REG_DEFAULT 1 +#define INAT_SEG_REG_CS 2 +#define INAT_SEG_REG_SS 3 +#define INAT_SEG_REG_DS 4 +#define INAT_SEG_REG_ES 5 +#define INAT_SEG_REG_FS 6 +#define INAT_SEG_REG_GS 7 + /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h new file mode 100644 index 000000000000..2b6ccf2c49f1 --- /dev/null +++ b/arch/x86/include/asm/insn-eval.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_INSN_EVAL_H +#define _ASM_X86_INSN_EVAL_H +/* + * A collection of utility functions for x86 instruction analysis to be + * used in a kernel context. Useful when, for instance, making sense + * of the registers indicated by operands. + */ + +#include <linux/compiler.h> +#include <linux/bug.h> +#include <linux/err.h> +#include <asm/ptrace.h> + +#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf) +#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) +#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) + +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); +int insn_get_code_seg_params(struct pt_regs *regs); + +#endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 11398d55aefa..95e948627fd0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -111,6 +111,10 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) #endif +#define ARCH_HAS_VALID_PHYS_ADDR_RANGE +extern int valid_phys_addr_range(phys_addr_t addr, size_t size); +extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); + /** * virt_to_phys - map virtual addresses to physical * @address: address to remap @@ -266,6 +270,21 @@ static inline void slow_down_io(void) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +#include <linux/jump_label.h> + +extern struct static_key_false sev_enable_key; +static inline bool sev_key_active(void) +{ + return static_branch_unlikely(&sev_enable_key); +} + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +static inline bool sev_key_active(void) { return false; } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ @@ -296,14 +315,34 @@ static inline unsigned type in##bwl##_p(int port) \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + out##bwl(*value, port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ + if (sev_key_active()) { \ + unsigned type *value = (unsigned type *)addr; \ + while (count) { \ + *value = in##bwl(port); \ + value++; \ + count--; \ + } \ + } else { \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) \ + : "d"(port) : "memory"); \ + } \ } BUILDIO(b, b, char) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 5c27e146a166..a8834dd546cd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -193,7 +193,6 @@ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); -extern void setup_ioapic_dest(void); extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin); extern void print_IO_APICs(void); #else /* !CONFIG_X86_IO_APIC */ @@ -233,7 +232,6 @@ static inline void io_apic_init_mappings(void) { } static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } -static inline void setup_ioapic_dest(void) { } #endif diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index d8632f8fa17d..2395bb794c7b 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -26,11 +26,7 @@ extern void irq_ctx_init(int cpu); struct irq_desc; -#ifdef CONFIG_HOTPLUG_CPU -#include <linux/cpumask.h> -extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -#endif #ifdef CONFIG_HAVE_KVM extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index c20ffca8fef1..67421f649cfa 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,12 +102,8 @@ #define POSTED_INTR_NESTED_VECTOR 0xf0 #endif -/* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. - */ -#define LOCAL_TIMER_VECTOR 0xef +#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef +#define LOCAL_TIMER_VECTOR 0xee #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 423e112c1e8f..c066ffae222b 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -9,6 +9,7 @@ enum { /* Allocate contiguous CPU vectors */ X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, + X86_IRQ_ALLOC_LEGACY = 0x2, }; extern struct irq_domain *x86_vector_domain; @@ -42,8 +43,8 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg); extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); -extern void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data); +extern int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve); extern void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); @@ -55,10 +56,4 @@ extern void arch_init_msi_domain(struct irq_domain *domain); static inline void arch_init_msi_domain(struct irq_domain *domain) { } #endif -#ifdef CONFIG_HT_IRQ -extern void arch_init_htirq_domain(struct irq_domain *domain); -#else -static inline void arch_init_htirq_domain(struct irq_domain *domain) { } -#endif - #endif diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h deleted file mode 100644 index 945a0337fbcf..000000000000 --- a/arch/x86/include/asm/kmemcheck.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_KMEMCHECK_H -#define ASM_X86_KMEMCHECK_H - -#include <linux/types.h> -#include <asm/ptrace.h> - -#ifdef CONFIG_KMEMCHECK -bool kmemcheck_active(struct pt_regs *regs); - -void kmemcheck_show(struct pt_regs *regs); -void kmemcheck_hide(struct pt_regs *regs); - -bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code); -bool kmemcheck_trap(struct pt_regs *regs); -#else -static inline bool kmemcheck_active(struct pt_regs *regs) -{ - return false; -} - -static inline void kmemcheck_show(struct pt_regs *regs) -{ -} - -static inline void kmemcheck_hide(struct pt_regs *regs) -{ -} - -static inline bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code) -{ - return false; -} - -static inline bool kmemcheck_trap(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_KMEMCHECK */ - -#endif diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 6cf65437b5e5..9f2e3102e0bb 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -58,8 +58,8 @@ extern __visible kprobe_opcode_t optprobe_template_call[]; extern __visible kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ - (((unsigned long)&optprobe_template_end - \ - (unsigned long)&optprobe_template_entry) + \ + (((unsigned long)optprobe_template_end - \ + (unsigned long)optprobe_template_entry) + \ MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) extern const int kretprobe_blacklist_size; diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ee23a43386a2..b24b1c8b3979 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -214,8 +214,6 @@ struct x86_emulate_ops { void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); - void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ - void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); @@ -226,6 +224,8 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c73e493adf07..516798431328 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,7 +536,20 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; + /* + * QEMU userspace and the guest each have their own FPU state. + * In vcpu_run, we switch between the user and guest FPU contexts. + * While running a VCPU, the VCPU thread will have the guest FPU + * context. + * + * Note that while the PKRU state lives inside the fpu registers, + * it is switched out separately at VMENTER and VMEXIT time. The + * "guest_fpu" state here contains the guest FPU context, with the + * host PRKU bits. + */ + struct fpu user_fpu; struct fpu guest_fpu; + u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -1061,6 +1074,11 @@ struct kvm_x86_ops { void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); + + int (*smi_allowed)(struct kvm_vcpu *vcpu); + int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*enable_smi_window)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1156,7 +1174,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); + return x86_emulate_instruction(vcpu, 0, + emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); } void kvm_enable_efer_bits(u64); @@ -1419,11 +1438,17 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC - return __default_cpu_present_to_apicid(mps_cpu); + return default_cpu_present_to_apicid(mps_cpu); #else WARN_ON_ONCE(1); return BAD_APICID; #endif } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c373e44049b1..7b407dda2bd7 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,6 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, #ifdef CONFIG_KVM_GUEST bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); -void __init kvm_guest_init(void); void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); @@ -103,7 +102,6 @@ static inline void kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #else /* CONFIG_KVM_GUEST */ -#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 6a77c63540f7..22c5f3e6f820 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -39,14 +39,20 @@ void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sme_encrypt_kernel(void); +void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); +bool sme_active(void); +bool sev_active(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -61,9 +67,17 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + +static inline int __init +early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline int __init +early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index 9492893aec52..a6bec8028480 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -59,7 +59,7 @@ struct mpc_table { #define MP_TRANSLATION 192 #define CPU_ENABLED 1 /* Processor is available */ -#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */ +#define CPU_BOOTPROCESSOR 2 /* Processor is the boot CPU */ #define CPU_STEPPING_MASK 0x000F #define CPU_MODEL_MASK 0x00F0 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 5119e4b555cc..8bf450b13d9f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -313,7 +313,7 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number) void hyperv_init(void); void hyperv_setup_mmu_ops(void); void hyper_alloc_mmu(void); -void hyperv_report_panic(struct pt_regs *regs); +void hyperv_report_panic(struct pt_regs *regs, long err); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); #else /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index eb83ff1bae8f..e520a1e6fc11 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -336,6 +336,9 @@ #define MSR_AMD64_IBSBRTARGET 0xc001103b #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV 0xc0010131 +#define MSR_AMD64_SEV_ENABLED_BIT 0 +#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 09c06b0fb964..d32175e30259 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -89,10 +89,8 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 extern int pcibios_enabled; -void pcibios_config_init(void); void pcibios_scan_root(int bus); -void pcibios_set_master(struct pci_dev *dev); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 7a5d6695abd3..eb66fa9cd0fc 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -38,6 +38,7 @@ do { \ #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 +#define PCI_BIG_ROOT_WINDOW 0x400000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6b43d677f8ca..e42b8943cb1a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -668,11 +668,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return false; } -static inline int pte_hidden(pte_t pte) -{ - return pte_flags(pte) & _PAGE_HIDDEN; -} - static inline int pmd_present(pmd_t pmd) { /* @@ -1081,7 +1076,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMD_WRITE +#define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_RW; @@ -1108,6 +1103,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9e9b05fc4860..3696398a9475 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,7 +32,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -79,18 +78,6 @@ #define _PAGE_KNL_ERRATUM_MASK 0 #endif -#ifdef CONFIG_KMEMCHECK -#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) -#else -#define _PAGE_HIDDEN (_AT(pteval_t, 0)) -#endif - -/* - * The same hidden bit is used by kmemcheck, but since kmemcheck - * works on kernel pages while soft-dirty engine on user space, - * they do not conflict with each other. - */ - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 881ca3b1d6d4..efbde088a718 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -132,6 +132,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; + unsigned initialized : 1; } __randomize_layout; struct cpuid_regs { diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3e4ed8fb5f91..a7471dcd2205 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -5,15 +5,6 @@ #include <linux/clocksource.h> #include <asm/pvclock-abi.h> -#ifdef CONFIG_KVM_GUEST -extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); -#else -static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return NULL; -} -#endif - /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); @@ -102,4 +93,14 @@ struct pvclock_vsyscall_time_info { #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +#ifdef CONFIG_PARAVIRT_CLOCK +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti); +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 9982dd96f093..5e16b5d40d32 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_QSPINLOCK_H #define _ASM_X86_QSPINLOCK_H +#include <linux/jump_label.h> #include <asm/cpufeature.h> #include <asm-generic/qspinlock_types.h> #include <asm/paravirt.h> @@ -47,10 +48,14 @@ static inline void queued_spin_unlock(struct qspinlock *lock) #endif #ifdef CONFIG_PARAVIRT +DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void native_pv_lock_init(void) __init; + #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!static_branch_likely(&virt_spin_lock_key)) return false; /* @@ -66,6 +71,10 @@ static inline bool virt_spin_lock(struct qspinlock *lock) return true; } +#else +static inline void native_pv_lock_init(void) +{ +} #endif /* CONFIG_PARAVIRT */ #include <asm-generic/qspinlock.h> diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index ff871210b9f2..4e44250e7d0d 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -15,7 +15,7 @@ * back to the regular execution flow in .text. */ #define _REFCOUNT_EXCEPTION \ - ".pushsection .text.unlikely\n" \ + ".pushsection .text..refcount\n" \ "111:\tlea %[counter], %%" _ASM_CX "\n" \ "112:\t" ASM_UD0 "\n" \ ASM_UNREACHABLE \ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 4d38d85a16ad..4c25cf6caefa 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -61,18 +61,33 @@ /* * lock for reading */ +#define ____down_read(sem, slow_path) \ +({ \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_read\n\t" \ + LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \ + /* adds 0x00000001 */ \ + " jns 1f\n" \ + " call " slow_path "\n" \ + "1:\n\t" \ + "# ending down_read\n\t" \ + : "+m" (sem->count), "=a" (ret), \ + ASM_CALL_CONSTRAINT \ + : [sem] "a" (sem) \ + : "memory", "cc"); \ + ret; \ +}) + static inline void __down_read(struct rw_semaphore *sem) { - asm volatile("# beginning down_read\n\t" - LOCK_PREFIX _ASM_INC "(%1)\n\t" - /* adds 0x00000001 */ - " jns 1f\n" - " call call_rwsem_down_read_failed\n" - "1:\n\t" - "# ending down_read\n\t" - : "+m" (sem->count) - : "a" (sem) - : "memory", "cc"); + ____down_read(sem, "call_rwsem_down_read_failed"); +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable"))) + return -EINTR; + return 0; } /* @@ -82,17 +97,18 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) { long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " mov %0,%1\n\t" + " mov %[count],%[result]\n\t" "1:\n\t" - " mov %1,%2\n\t" - " add %3,%2\n\t" + " mov %[result],%[tmp]\n\t" + " add %[inc],%[tmp]\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchg %2,%0\n\t" + LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" - : "+m" (sem->count), "=&a" (result), "=&r" (tmp) - : "i" (RWSEM_ACTIVE_READ_BIAS) + : [count] "+m" (sem->count), [result] "=&a" (result), + [tmp] "=&r" (tmp) + : [inc] "i" (RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); return result >= 0; } @@ -106,7 +122,7 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) struct rw_semaphore* ret; \ \ asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %1,(%4)\n\t" \ + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \ /* adds 0xffff0001, returns the old value */ \ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ /* was the active mask 0 before? */\ @@ -114,9 +130,9 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), \ + : "+m" (sem->count), [tmp] "=d" (tmp), \ "=a" (ret), ASM_CALL_CONSTRAINT \ - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ }) @@ -142,21 +158,21 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) bool result; long tmp0, tmp1; asm volatile("# beginning __down_write_trylock\n\t" - " mov %0,%1\n\t" + " mov %[count],%[tmp0]\n\t" "1:\n\t" " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" /* was the active mask 0 before? */ " jnz 2f\n\t" - " mov %1,%2\n\t" - " add %4,%2\n\t" - LOCK_PREFIX " cmpxchg %2,%0\n\t" + " mov %[tmp0],%[tmp1]\n\t" + " add %[inc],%[tmp1]\n\t" + LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t" " jnz 1b\n\t" "2:\n\t" CC_SET(e) "# ending __down_write_trylock\n\t" - : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), - CC_OUT(e) (result) - : "er" (RWSEM_ACTIVE_WRITE_BIAS) + : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0), + [tmp1] "=&r" (tmp1), CC_OUT(e) (result) + : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS) : "memory"); return result; } @@ -168,14 +184,14 @@ static inline void __up_read(struct rw_semaphore *sem) { long tmp; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n" "# ending __up_read\n" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) + : "+m" (sem->count), [tmp] "=d" (tmp) + : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); } @@ -186,14 +202,14 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; asm volatile("# beginning __up_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" + LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" /* subtracts 0xffff0001, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "+m" (sem->count), [tmp] "=d" (tmp) + : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); } @@ -203,7 +219,7 @@ static inline void __up_write(struct rw_semaphore *sem) static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t" /* * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) @@ -213,7 +229,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "er" (-RWSEM_WAITING_BIAS) + : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index b20f9d623f9c..8f09012b92e7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -236,11 +236,23 @@ */ #define EARLY_IDT_HANDLER_SIZE 9 +/* + * xen_early_idt_handler_array is for Xen pv guests: for each entry in + * early_idt_handler_array it contains a prequel in the form of + * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to + * max 8 bytes. + */ +#define XEN_EARLY_IDT_HANDLER_SIZE 8 + #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; +#endif + /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b34625796eb2..5b6bc7016c22 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,11 +42,4 @@ #include <asm/qrwlock.h> -#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) - -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 076502241eae..55d392c6bd29 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ -#ifndef CONFIG_KMEMCHECK - #if (__GNUC__ >= 4) #define memcpy(t, f, n) __builtin_memcpy(t, f, n) #else @@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(t, f, n) __memcpy((t), (f), (n)) -#endif #endif #endif /* !CONFIG_FORTIFY_SOURCE */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 0b1b4445f4c5..533f74c300c2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_FORTIFY_SOURCE -#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ @@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 982c325dad33..8be6afb58471 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,7 +12,13 @@ /* image of the saved processor state */ struct saved_context { - u16 es, fs, gs, ss; + /* + * On x86_32, all segment registers, with the possible exception of + * gs, are saved at kernel entry in pt_regs. + */ +#ifdef CONFIG_X86_32_LAZY_GS + u16 gs; +#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 7306e911faee..a7af9f53c0cb 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -20,8 +20,20 @@ */ struct saved_context { struct pt_regs regs; - u16 ds, es, fs, gs, ss; - unsigned long gs_base, gs_kernel_base, fs_base; + + /* + * User CS and SS are saved in current_pt_regs(). The rest of the + * segment selectors need to be saved and restored here. + */ + u16 ds, es, fs, gs; + + /* + * Usermode FSBASE and GSBASE may not match the fs and gs selectors, + * so we save them separately. We save the kernelmode GSBASE to + * restore percpu access after resume. + */ + unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; u64 misc_enable; bool misc_enable_saved; @@ -30,8 +42,7 @@ struct saved_context { u16 gdt_pad; /* Unused */ struct desc_ptr gdt_desc; u16 idt_pad; - u16 idt_limit; - unsigned long idt_base; + struct desc_ptr idt; u16 ldt; u16 tss; unsigned long tr; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 9b6df68d8fd1..eb5f7999a893 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); /* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *prev, - struct task_struct *next) +static inline void prepare_switch_to(struct task_struct *next) { #ifdef CONFIG_VMAP_STACK /* @@ -70,7 +69,7 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(prev, next); \ + prepare_switch_to(next); \ \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 47457ab975fd..7365dd4acffb 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -9,7 +9,7 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -extern int recalibrate_cpu_khz(void); +extern void recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 3effd3c994af..d33e4a26dc7e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -245,40 +245,43 @@ static inline void cr4_init_shadow(void) this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } +static inline void __cr4_set(unsigned long cr4) +{ + lockdep_assert_irqs_disabled(); + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { - unsigned long cr4; + unsigned long cr4, flags; + local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 | mask) != cr4) { - cr4 |= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); - } + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + local_irq_restore(flags); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits(unsigned long mask) { - unsigned long cr4; + unsigned long cr4, flags; + local_irq_save(flags); cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 & ~mask) != cr4) { - cr4 &= ~mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); - } + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + local_irq_restore(flags); } -static inline void cr4_toggle_bits(unsigned long mask) +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - cr4 ^= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); + __cr4_set(cr4 ^ mask); } /* Read the CR4 shadow. */ diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 8eb139ed1a03..22647a642e98 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -138,6 +138,254 @@ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); DEFINE_IRQ_VECTOR_EVENT(thermal_apic); #endif +TRACE_EVENT(vector_config, + + TP_PROTO(unsigned int irq, unsigned int vector, + unsigned int cpu, unsigned int apicdest), + + TP_ARGS(irq, vector, cpu, apicdest), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( unsigned int, cpu ) + __field( unsigned int, apicdest ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = vector; + __entry->cpu = cpu; + __entry->apicdest = apicdest; + ), + + TP_printk("irq=%u vector=%u cpu=%u apicdest=0x%08x", + __entry->irq, __entry->vector, __entry->cpu, + __entry->apicdest) +); + +DECLARE_EVENT_CLASS(vector_mod, + + TP_PROTO(unsigned int irq, unsigned int vector, + unsigned int cpu, unsigned int prev_vector, + unsigned int prev_cpu), + + TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( unsigned int, cpu ) + __field( unsigned int, prev_vector ) + __field( unsigned int, prev_cpu ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = vector; + __entry->cpu = cpu; + __entry->prev_vector = prev_vector; + __entry->prev_cpu = prev_cpu; + + ), + + TP_printk("irq=%u vector=%u cpu=%u prev_vector=%u prev_cpu=%u", + __entry->irq, __entry->vector, __entry->cpu, + __entry->prev_vector, __entry->prev_cpu) +); + +#define DEFINE_IRQ_VECTOR_MOD_EVENT(name) \ +DEFINE_EVENT_FN(vector_mod, name, \ + TP_PROTO(unsigned int irq, unsigned int vector, \ + unsigned int cpu, unsigned int prev_vector, \ + unsigned int prev_cpu), \ + TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), NULL, NULL); \ + +DEFINE_IRQ_VECTOR_MOD_EVENT(vector_update); +DEFINE_IRQ_VECTOR_MOD_EVENT(vector_clear); + +DECLARE_EVENT_CLASS(vector_reserve, + + TP_PROTO(unsigned int irq, int ret), + + TP_ARGS(irq, ret), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%u ret=%d", __entry->irq, __entry->ret) +); + +#define DEFINE_IRQ_VECTOR_RESERVE_EVENT(name) \ +DEFINE_EVENT_FN(vector_reserve, name, \ + TP_PROTO(unsigned int irq, int ret), \ + TP_ARGS(irq, ret), NULL, NULL); \ + +DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve_managed); +DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve); + +TRACE_EVENT(vector_alloc, + + TP_PROTO(unsigned int irq, unsigned int vector, bool reserved, + int ret), + + TP_ARGS(irq, vector, ret, reserved), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( bool, reserved ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = ret < 0 ? 0 : vector; + __entry->reserved = reserved; + __entry->ret = ret > 0 ? 0 : ret; + ), + + TP_printk("irq=%u vector=%u reserved=%d ret=%d", + __entry->irq, __entry->vector, + __entry->reserved, __entry->ret) +); + +TRACE_EVENT(vector_alloc_managed, + + TP_PROTO(unsigned int irq, unsigned int vector, + int ret), + + TP_ARGS(irq, vector, ret), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->vector = ret < 0 ? 0 : vector; + __entry->ret = ret > 0 ? 0 : ret; + ), + + TP_printk("irq=%u vector=%u ret=%d", + __entry->irq, __entry->vector, __entry->ret) +); + +DECLARE_EVENT_CLASS(vector_activate, + + TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, + bool reserve), + + TP_ARGS(irq, is_managed, can_reserve, reserve), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( bool, is_managed ) + __field( bool, can_reserve ) + __field( bool, reserve ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->is_managed = is_managed; + __entry->can_reserve = can_reserve; + __entry->reserve = reserve; + ), + + TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d", + __entry->irq, __entry->is_managed, __entry->can_reserve, + __entry->reserve) +); + +#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \ +DEFINE_EVENT_FN(vector_activate, name, \ + TP_PROTO(unsigned int irq, bool is_managed, \ + bool can_reserve, bool reserve), \ + TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \ + +DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate); +DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); + +TRACE_EVENT(vector_teardown, + + TP_PROTO(unsigned int irq, bool is_managed, bool has_reserved), + + TP_ARGS(irq, is_managed, has_reserved), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( bool, is_managed ) + __field( bool, has_reserved ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->is_managed = is_managed; + __entry->has_reserved = has_reserved; + ), + + TP_printk("irq=%u is_managed=%d has_reserved=%d", + __entry->irq, __entry->is_managed, __entry->has_reserved) +); + +TRACE_EVENT(vector_setup, + + TP_PROTO(unsigned int irq, bool is_legacy, int ret), + + TP_ARGS(irq, is_legacy, ret), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( bool, is_legacy ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->is_legacy = is_legacy; + __entry->ret = ret; + ), + + TP_printk("irq=%u is_legacy=%d ret=%d", + __entry->irq, __entry->is_legacy, __entry->ret) +); + +TRACE_EVENT(vector_free_moved, + + TP_PROTO(unsigned int irq, unsigned int cpu, unsigned int vector, + bool is_managed), + + TP_ARGS(irq, cpu, vector, is_managed), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + __field( unsigned int, cpu ) + __field( unsigned int, vector ) + __field( bool, is_managed ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->cpu = cpu; + __entry->vector = vector; + __entry->is_managed = is_managed; + ), + + TP_printk("irq=%u cpu=%u vector=%u is_managed=%d", + __entry->irq, __entry->cpu, __entry->vector, + __entry->is_managed) +); + + #endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 8da0efb13544..cf5d53c3f9ea 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -32,15 +32,22 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); +extern void tsc_early_delay_calibrate(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); +extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; +#ifdef CONFIG_X86_TSC +extern bool tsc_async_resets; +#else +# define tsc_async_resets false +#endif /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h new file mode 100644 index 000000000000..db43f2a0d92c --- /dev/null +++ b/arch/x86/include/asm/umip.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_UMIP_H +#define _ASM_X86_UMIP_H + +#include <linux/types.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_INTEL_UMIP +bool fixup_umip_exception(struct pt_regs *regs); +#else +static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; } +#endif /* CONFIG_X86_INTEL_UMIP */ +#endif /* _ASM_X86_UMIP_H */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 9cffb44a3cf5..036e26d63d9a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -776,23 +776,36 @@ static inline int uv_num_possible_blades(void) extern void uv_nmi_setup(void); extern void uv_nmi_setup_hubless(void); +/* BIOS/Kernel flags exchange MMR */ +#define UVH_BIOS_KERNEL_MMR UVH_SCRATCH5 +#define UVH_BIOS_KERNEL_MMR_ALIAS UVH_SCRATCH5_ALIAS +#define UVH_BIOS_KERNEL_MMR_ALIAS_2 UVH_SCRATCH5_ALIAS_2 + +/* TSC sync valid, set by BIOS */ +#define UVH_TSC_SYNC_MMR UVH_BIOS_KERNEL_MMR +#define UVH_TSC_SYNC_SHIFT 10 +#define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */ +#define UVH_TSC_SYNC_MASK 3 /* 0011 */ +#define UVH_TSC_SYNC_VALID 3 /* 0011 */ +#define UVH_TSC_SYNC_INVALID 2 /* 0010 */ + /* BMC sets a bit this MMR non-zero before sending an NMI */ -#define UVH_NMI_MMR UVH_SCRATCH5 -#define UVH_NMI_MMR_CLEAR UVH_SCRATCH5_ALIAS +#define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR +#define UVH_NMI_MMR_CLEAR UVH_BIOS_KERNEL_MMR_ALIAS #define UVH_NMI_MMR_SHIFT 63 -#define UVH_NMI_MMR_TYPE "SCRATCH5" +#define UVH_NMI_MMR_TYPE "SCRATCH5" /* Newer SMM NMI handler, not present in all systems */ #define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 #define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS #define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT -#define UVH_NMI_MMRX_TYPE "EXTIO_INT0" +#define UVH_NMI_MMRX_TYPE "EXTIO_INT0" /* Non-zero indicates newer SMM NMI handler present */ #define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST /* Indicates to BIOS that we want to use the newer SMM NMI handler */ -#define UVH_NMI_MMRX_REQ UVH_SCRATCH5_ALIAS_2 +#define UVH_NMI_MMRX_REQ UVH_BIOS_KERNEL_MMR_ALIAS_2 #define UVH_NMI_MMRX_REQ_SHIFT 62 struct uv_hub_nmi_s { diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 52250681f68c..fb856c9f0449 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -49,7 +49,7 @@ static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) unsigned ret; repeat: - ret = ACCESS_ONCE(s->seq); + ret = READ_ONCE(s->seq); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index caec8417539f..8b6780751132 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,11 +70,11 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 -#define SECONDARY_EXEC_RDRAND 0x00000800 +#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 -#define SECONDARY_EXEC_RDSEED 0x00010000 +#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h deleted file mode 100644 index 78ccf28d17db..000000000000 --- a/arch/x86/include/asm/x2apic.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Common bits for X2APIC cluster/physical modes. - */ - -#ifndef _ASM_X86_X2APIC_H -#define _ASM_X86_X2APIC_H - -#include <asm/apic.h> -#include <asm/ipi.h> -#include <linux/cpumask.h> - -static int x2apic_apic_id_valid(int apicid) -{ - return 1; -} - -static int x2apic_apic_id_registered(void) -{ - return 1; -} - -static void -__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) -{ - unsigned long cfg = __prepare_ICR(0, vector, dest); - native_x2apic_icr_write(cfg, apicid); -} - -static unsigned int x2apic_get_apic_id(unsigned long id) -{ - return id; -} - -static unsigned long x2apic_set_apic_id(unsigned int id) -{ - return id; -} - -static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} - -static void x2apic_send_IPI_self(int vector) -{ - apic_write(APIC_SELF_IPI, vector); -} - -#endif /* _ASM_X86_X2APIC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ad15a0fda917..aa4747569e23 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -51,11 +51,13 @@ struct x86_init_resources { * are set up. * @intr_init: interrupt init code * @trap_init: platform specific trap setup + * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); void (*trap_init)(void); + void (*intr_mode_init)(void); }; /** @@ -117,11 +119,13 @@ struct x86_init_pci { /** * struct x86_hyper_init - x86 hypervisor init functions * @init_platform: platform setup + * @guest_late_init: guest late init * @x2apic_available: X2APIC detection * @init_mem_mapping: setup early mappings during init_mem_mapping() */ struct x86_hyper_init { void (*init_platform)(void); + void (*guest_late_init)(void); bool (*x2apic_available)(void); void (*init_mem_mapping)(void); }; @@ -208,6 +212,7 @@ enum x86_legacy_i8042_state { struct x86_legacy_features { enum x86_legacy_i8042_state i8042; int rtc; + int no_vga; int reserve_bios_regions; struct x86_legacy_devices devices; }; diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 3bdd10d71223..a9630104f1c4 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -74,21 +74,43 @@ #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) /* + * Leaf 4 (0x40000x03) + * Sub-leaf 0: EAX: bit 0: emulated tsc + * bit 1: host tsc is known to be reliable + * bit 2: RDTSCP instruction available + * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate, + * 2=no emulation, 3=no emulation + TSC_AUX support + * ECX: guest tsc frequency in kHz + * EDX: guest tsc incarnation (migration count) + * Sub-leaf 1: EAX: tsc offset low part + * EBX: tsc offset high part + * ECX: multiplicator for tsc->ns conversion + * EDX: shift amount for tsc->ns conversion + * Sub-leaf 2: EAX: host tsc frequency in kHz + */ + +/* * Leaf 5 (0x40000x04) * HVM-specific features - * EAX: Features - * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + * Sub-leaf 0: EAX: Features + * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) */ - -/* Virtualized APIC registers */ -#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) -/* Virtualized x2APIC accesses */ -#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) +#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ +#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ /* Memory mapped from other domains has valid IOMMU entries */ #define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2) -/* vcpu id is present in EBX */ -#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) +#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ + +/* + * Leaf 6 (0x40000x05) + * PV-specific parameters + * Sub-leaf 0: EAX: max available sub-leaf + * Sub-leaf 0: EBX: bits 0-7: max machine address width + */ + +/* Max. address width in bits taking memory hotplug into account. */ +#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0) -#define XEN_CPUID_MAX_NUM_LEAVES 4 +#define XEN_CPUID_MAX_NUM_LEAVES 5 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c6b84245e5ab..123e669bf363 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -27,6 +27,15 @@ typedef struct xpaddr { phys_addr_t paddr; } xpaddr_t; +#ifdef CONFIG_X86_64 +#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1) +#else +#define XEN_PHYSICAL_MASK __PHYSICAL_MASK +#endif + +#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \ + XEN_PHYSICAL_MASK)) + #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) @@ -278,7 +287,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) static inline unsigned long pte_mfn(pte_t pte) { - return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; } static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 1f5c5161ead6..45c8605467f1 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,7 +1,4 @@ -#ifdef CONFIG_KMEMCHECK -/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ -# include <asm-generic/xor.h> -#elif !defined(_ASM_X86_XOR_H) +#ifndef _ASM_X86_XOR_H #define _ASM_X86_XOR_H /* diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index da1489cb64dc..1e901e421f2d 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,7 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +generic-y += bpf_perf_event.h generated-y += unistd_32.h generated-y += unistd_64.h generated-y += unistd_x32.h diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 554aa8f24f91..09cc06483bed 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -110,5 +110,4 @@ struct kvm_vcpu_pv_apf_data { #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK #define KVM_PV_EOI_DISABLED 0x0 - #endif /* _UAPI_ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 97abdaab9535..bcba3c643e63 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -110,6 +110,8 @@ #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */ +#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT) #define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ #define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 295abaa58add..7e2baf7304ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,10 +29,13 @@ KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +endif + # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to @@ -127,6 +130,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_INTEL_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index ea3046e0b0cf..bb8d300fecbd 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -52,8 +52,3 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } - -void arch_apei_flush_tlb_one(unsigned long addr) -{ - __flush_tlb_one(addr); -} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 079535e53e2a..f4c463df8b08 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -342,13 +342,12 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi); + static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { - int ioapic; - int pin; - struct mpc_intsrc mp_irq; - /* * Check bus_irq boundary. */ @@ -358,14 +357,6 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, } /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = mp_find_ioapic_pin(ioapic, gsi); - - /* * TBD: This check is for faulty timer entries, where the override * erroneously sets the trigger to level, resulting in a HUGE * increase of timer interrupts! @@ -373,16 +364,8 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger << 2) | polarity; - mp_irq.srcbus = MP_ISA_BUS; - mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ - mp_irq.dstirq = pin; /* INTIN# */ - - mp_save_irq(&mp_irq); - + if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0) + return; /* * Reset default identity mapping if gsi is also an legacy IRQ, * otherwise there will be more than one entry with the same GSI @@ -429,6 +412,34 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi) +{ + struct mpc_intsrc mp_irq; + int ioapic, pin; + + /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + pr_warn("Failed to find ioapic for gsi : %u\n", gsi); + return ioapic; + } + + pin = mp_find_ioapic_pin(ioapic, gsi); + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + + return 0; +} + static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { @@ -473,7 +484,11 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; - mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + if (bus_irq < NR_IRQS_LEGACY) + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + else + mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* @@ -961,6 +976,11 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) x86_platform.legacy.rtc = 0; } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { + pr_debug("ACPI: probing for VGA not safe\n"); + x86_platform.legacy.no_vga = 1; + } + #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 14a52c7d23d4..30571fdaaf6f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -445,7 +445,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -455,7 +454,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } - mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(const s32 *start, const s32 *end, @@ -463,7 +461,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -473,7 +470,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } - mutex_unlock(&text_mutex); } struct smp_alt_module { @@ -492,8 +488,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_MUTEX(smp_alt); -static bool uniproc_patched = false; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module alternatives_smp_module_add(struct module *mod, char *name, @@ -502,7 +497,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; @@ -529,14 +524,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; @@ -544,7 +539,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) kfree(item); break; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) @@ -554,7 +549,7 @@ void alternatives_enable_smp(void) /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); @@ -566,10 +561,13 @@ void alternatives_enable_smp(void) mod->text, mod->text_end); uniproc_patched = false; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } -/* Return 1 if the address range is reserved for smp-alternatives */ +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. + */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; @@ -577,6 +575,8 @@ int alternatives_text_reserved(void *start, void *end) u8 *text_start = start; u8 *text_end = end; + lockdep_assert_held(&text_mutex); + list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 2fb7309c6900..a6fcaf16cdbf 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -7,12 +7,11 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_PCI_MSI) += msi.o -obj-$(CONFIG_HT_IRQ) += htirq.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 89c7c8569e5e..25ddf02598d2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -211,11 +211,7 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { -#ifdef CONFIG_X86_64 - return 1; -#else return APIC_INTEGRATED(lapic_get_version()); -#endif } /* @@ -298,14 +294,11 @@ int get_physical_broadcast(void) */ int lapic_get_maxlvt(void) { - unsigned int v; - - v = apic_read(APIC_LVR); /* * - we always have APIC integrated on 64bit mode * - 82489DXs do not report # of LVT entries */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; + return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2; } /* @@ -1229,6 +1222,70 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } +enum apic_intr_mode_id apic_intr_mode; + +static int __init apic_intr_mode_select(void) +{ + /* Check kernel option */ + if (disable_apic) { + pr_info("APIC disabled via kernel command line\n"); + return APIC_PIC; + } + + /* Check BIOS */ +#ifdef CONFIG_X86_64 + /* On 64-bit, the APIC must be integrated, Check local APIC only */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + disable_apic = 1; + pr_info("APIC disabled by BIOS\n"); + return APIC_PIC; + } +#else + /* On 32-bit, the APIC may be integrated APIC or 82489DX */ + + /* Neither 82489DX nor integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { + disable_apic = 1; + return APIC_PIC; + } + + /* If the BIOS pretends there is an integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && + APIC_INTEGRATED(boot_cpu_apic_version)) { + disable_apic = 1; + pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", + boot_cpu_physical_apicid); + return APIC_PIC; + } +#endif + + /* Check MP table or ACPI MADT configuration */ + if (!smp_found_config) { + disable_ioapic_support(); + if (!acpi_lapic) { + pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE_NO_CONFIG; + } + return APIC_VIRTUAL_WIRE; + } + +#ifdef CONFIG_SMP + /* If SMP should be disabled, then really disable it! */ + if (!setup_max_cpus) { + pr_info("APIC: SMP mode deactivated\n"); + return APIC_SYMMETRIC_IO_NO_ROUTING; + } + + if (read_apic_id() != boot_cpu_physical_apicid) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + read_apic_id(), boot_cpu_physical_apicid); + /* Or can we switch back to PIC here? */ + } +#endif + + return APIC_SYMMETRIC_IO; +} + /* * An initial setup of the virtual wire mode. */ @@ -1278,6 +1335,38 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +/* Init the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_init(void) +{ + bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); + + apic_intr_mode = apic_intr_mode_select(); + + switch (apic_intr_mode) { + case APIC_PIC: + pr_info("APIC: Keep in PIC mode(8259)\n"); + return; + case APIC_VIRTUAL_WIRE: + pr_info("APIC: Switch to virtual wire mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_VIRTUAL_WIRE_NO_CONFIG: + pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); + upmode = true; + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO: + pr_info("APIC: Switch to symmetric I/O mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO_NO_ROUTING: + pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); + break; + } + + apic_bsp_setup(upmode); +} + static void lapic_setup_esr(void) { unsigned int oldvalue, value, maxlvt; @@ -1473,7 +1562,7 @@ void setup_local_APIC(void) /* * Set up LVT0, LVT1: * - * set up through-local-APIC on the BP's LINT0. This is not + * set up through-local-APIC on the boot CPU's LINT0. This is not * strictly necessary in pure symmetric-IO mode, but sometimes * we delegate interrupts to the 8259A. */ @@ -1499,7 +1588,9 @@ void setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!lapic_is_integrated()) /* 82489DX */ + + /* Is 82489DX ? */ + if (!lapic_is_integrated()) value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); @@ -1885,8 +1976,8 @@ void __init init_apic_mappings(void) * yeah -- we lie about apic_version * in case if apic was disabled via boot option * but it's not a problem for SMP compiled kernel - * since smp_sanity_check is prepared for such a case - * and disable smp mode + * since apic_intr_mode_select is prepared for such + * a case and disable smp mode */ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } @@ -2242,44 +2333,6 @@ int hard_smp_processor_id(void) return read_apic_id(); } -void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - -int default_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) -{ - unsigned int cpu = cpumask_first(mask); - - if (cpu >= nr_cpu_ids) - return -EINVAL; - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - irq_data_update_effective_affinity(irqdata, cpumask_of(cpu)); - return 0; -} - -int flat_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) - -{ - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; - - if (!cpu_mask) - return -EINVAL; - *apicid = (unsigned int)cpu_mask; - cpumask_bits(effmsk)[0] = cpu_mask; - return 0; -} - /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with @@ -2322,72 +2375,27 @@ static void __init apic_bsp_up_setup(void) * Returns: * apic_id of BSP APIC */ -int __init apic_bsp_setup(bool upmode) +void __init apic_bsp_setup(bool upmode) { - int id; - connect_bsp_APIC(); if (upmode) apic_bsp_up_setup(); setup_local_APIC(); - if (x2apic_mode) - id = apic_read(APIC_LDR); - else - id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - enable_IO_APIC(); end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); - /* Setup local timer */ - x86_init.timers.setup_percpu_clockev(); - return id; -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - pr_info("Apic disabled\n"); - return -1; - } -#ifdef CONFIG_X86_64 - if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; - pr_info("Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(boot_cpu_apic_version)) { - pr_err("BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); - return -1; - } -#endif - - if (!smp_found_config) - disable_ioapic_support(); - - default_setup_apic_routing(); - apic_bsp_setup(true); - return 0; } #ifdef CONFIG_UP_LATE_INIT void __init up_late_init(void) { - APIC_init_uniprocessor(); + if (apic_intr_mode == APIC_PIC) + return; + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); } #endif @@ -2667,11 +2675,13 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 else { pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } +#endif return 0; } diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c new file mode 100644 index 000000000000..a360801779ae --- /dev/null +++ b/arch/x86/kernel/apic/apic_common.c @@ -0,0 +1,46 @@ +/* + * Common functions shared between the various APIC flavours + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include <linux/irq.h> +#include <asm/apic.h> + +u32 apic_default_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_apicid, cpu); +} + +u32 apic_flat_calc_apicid(unsigned int cpu) +{ + return 1U << cpu; +} + +bool default_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + *retmap = *phys_map; +} + +int default_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} +EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); + +int default_check_phys_apicid_present(int phys_apicid) +{ + return physid_isset(phys_apicid, phys_cpu_present_map); +} + +int default_apic_id_valid(int apicid) +{ + return (apicid < 255); +} diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index dedd5a41ba48..25a87028cb3f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -119,7 +119,7 @@ static unsigned int flat_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { return (id & 0xFF) << 24; } @@ -151,15 +151,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -172,7 +170,7 @@ static struct apic apic_flat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, @@ -249,12 +247,10 @@ static struct apic apic_physflat __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, @@ -268,7 +264,7 @@ static struct apic apic_physflat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c8d211277315..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -84,20 +84,6 @@ static int noop_apic_id_registered(void) return physid_isset(0, phys_cpu_present_map); } -static const struct cpumask *noop_target_cpus(void) -{ - /* only BSP here */ - return cpumask_of(0); -} - -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - if (cpu != 0) - pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_copy(retmask, cpumask_of(cpu)); -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); @@ -109,6 +95,13 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } +#ifdef CONFIG_X86_32 +static int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} +#endif + struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, @@ -117,16 +110,14 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, - .target_cpus = noop_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -142,7 +133,7 @@ struct apic apic_noop __ro_after_init = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 2fda912219a6..134e04506ab4 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,7 +38,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) return id; } -static unsigned long numachip1_set_apic_id(unsigned int id) +static u32 numachip1_set_apic_id(unsigned int id) { return (id & 0xff) << 24; } @@ -51,7 +51,7 @@ static unsigned int numachip2_get_apic_id(unsigned long x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static unsigned long numachip2_set_apic_id(unsigned int id) +static u32 numachip2_set_apic_id(unsigned int id) { return id << 24; } @@ -249,12 +249,10 @@ static const struct apic apic_numachip1 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -267,7 +265,7 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, @@ -300,12 +298,10 @@ static const struct apic apic_numachip2 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -318,7 +314,7 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index e12fbcfc9571..afee386ff711 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -27,9 +27,9 @@ static int bigsmp_apic_id_registered(void) return 1; } -static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { - return 0; + return false; } static int bigsmp_early_logical_apicid(int cpu) @@ -155,12 +155,10 @@ static struct apic apic_bigsmp __ro_after_init = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, @@ -173,7 +171,7 @@ static struct apic apic_bigsmp __ro_after_init = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c deleted file mode 100644 index 56ccf9346b08..000000000000 --- a/arch/x86/kernel/apic/htirq.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Support Hypertransport IRQ - * - * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo - * Moved from arch/x86/kernel/apic/io_apic.c. - * Jiang Liu <jiang.liu@linux.intel.com> - * Add support of hierarchical irqdomain - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/device.h> -#include <linux/pci.h> -#include <linux/htirq.h> -#include <asm/irqdomain.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/hypertransport.h> - -static struct irq_domain *htirq_domain; - -/* - * Hypertransport interrupt support - */ -static int -ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(data); - - fetch_ht_irq_msg(data->irq, &msg); - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | - HT_IRQ_LOW_DEST_ID_MASK); - msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - write_ht_irq_msg(data->irq, &msg); - } - - return ret; -} - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = ht_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct ht_irq_cfg *ht_cfg; - struct irq_alloc_info *info = arg; - struct pci_dev *dev; - irq_hw_number_t hwirq; - int ret; - - if (nr_irqs > 1 || !info) - return -EINVAL; - - dev = info->ht_dev; - hwirq = (info->ht_idx & 0xFF) | - PCI_DEVID(dev->bus->number, dev->devfn) << 8 | - (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; - if (irq_find_mapping(domain, hwirq) > 0) - return -EEXIST; - - ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); - if (!ht_cfg) - return -ENOMEM; - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(ht_cfg); - return ret; - } - - /* Initialize msg to a value that will never match the first write. */ - ht_cfg->msg.address_lo = 0xffffffff; - ht_cfg->msg.address_hi = 0xffffffff; - ht_cfg->dev = info->ht_dev; - ht_cfg->update = info->ht_update; - ht_cfg->pos = info->ht_pos; - ht_cfg->idx = 0x10 + (info->ht_idx * 2); - irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, - handle_edge_irq, ht_cfg, "edge"); - - return 0; -} - -static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) -{ - struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); - - BUG_ON(nr_irqs != 1); - kfree(irq_data->chip_data); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} - -static void htirq_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_msg(irq_data->irq, &msg); -} - -static void htirq_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct ht_irq_msg msg; - - memset(&msg, 0, sizeof(msg)); - write_ht_irq_msg(irq_data->irq, &msg); -} - -static const struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, -}; - -void __init arch_init_htirq_domain(struct irq_domain *parent) -{ - struct fwnode_handle *fn; - - if (disable_apic) - return; - - fn = irq_domain_alloc_named_fwnode("PCI-HT"); - if (!fn) - goto warn; - - htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL); - irq_domain_free_fwnode(fn); - if (!htirq_domain) - goto warn; - - htirq_domain->parent = parent; - return; - -warn: - pr_warn("Failed to initialize irqdomain for HTIRQ.\n"); -} - -int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, - ht_irq_update_t *update) -{ - struct irq_alloc_info info; - - if (!htirq_domain) - return -ENOSYS; - - init_irq_alloc_info(&info, NULL); - info.ht_idx = idx; - info.ht_pos = pos; - info.ht_dev = dev; - info.ht_update = update; - - return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), - &info); -} - -void arch_teardown_ht_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3b89b27945ff..8a7963421460 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1014,6 +1014,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { + info->flags |= X86_IRQ_ALLOC_LEGACY; irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { @@ -1586,6 +1587,43 @@ static int __init notimercheck(char *s) } __setup("no_timer_check", notimercheck); +static void __init delay_with_tsc(void) +{ + unsigned long long start, now; + unsigned long end = jiffies + 4; + + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 40000000000/HZ TSC cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 40000000000UL / HZ && + time_before_eq(jiffies, end)); +} + +static void __init delay_without_tsc(void) +{ + unsigned long end = jiffies + 4; + int band = 1; + + /* + * We don't know any frequency yet, but waiting for + * 40940000000/HZ cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094 + */ + do { + __delay(((1U << band++) * 10000000UL) / HZ); + } while (band < 12 && time_before_eq(jiffies, end)); +} + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1604,8 +1642,12 @@ static int __init timer_irq_works(void) local_save_flags(flags); local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); + + if (boot_cpu_has(X86_FEATURE_TSC)) + delay_with_tsc(); + else + delay_without_tsc(); + local_irq_restore(flags); /* @@ -1821,26 +1863,36 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +static void ioapic_configure_entry(struct irq_data *irqd) +{ + struct mp_chip_data *mpd = irqd->chip_data; + struct irq_cfg *cfg = irqd_cfg(irqd); + struct irq_pin_list *entry; + + /* + * Only update when the parent is the vector domain, don't touch it + * if the parent is the remapping domain. Check the installed + * ioapic chip to verify that. + */ + if (irqd->chip == &ioapic_chip) { + mpd->entry.dest = cfg->dest_apicid; + mpd->entry.vector = cfg->vector; + } + for_each_irq_pin(entry, mpd->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); +} + static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - struct mp_chip_data *data = irq_data->chip_data; - struct irq_pin_list *entry; - struct irq_cfg *cfg; unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); raw_spin_lock_irqsave(&ioapic_lock, flags); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - cfg = irqd_cfg(irq_data); - data->entry.dest = cfg->dest_apicid; - data->entry.vector = cfg->vector; - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, - data->entry); - } + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; @@ -2097,7 +2149,7 @@ static inline void __init check_timer(void) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2119,7 +2171,7 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2513,52 +2565,9 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) } /* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be apic->target_cpus() + * This function updates target affinity of IOAPIC interrupts to include + * the CPUs which came online during SMP bringup. */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - const struct cpumask *mask; - struct irq_desc *desc; - struct irq_data *idata; - struct irq_chip *chip; - - if (skip_ioapic_setup == 1) - return; - - for_each_ioapic_pin(ioapic, pin) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - - irq = pin_2_irq(irq_entry, ioapic, pin, 0); - if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) - continue; - - desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - idata = irq_desc_get_irq_data(desc); - - /* - * Honour affinities which have been set in early boot - */ - if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = irq_data_get_affinity_mask(idata); - else - mask = apic->target_cpus(); - - chip = irq_data_get_irq_chip(idata); - /* Might be lapic_chip for irq 0 */ - if (chip->irq_set_affinity) - chip->irq_set_affinity(idata, mask, false); - raw_spin_unlock_irq(&desc->lock); - } -} -#endif - #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -2978,17 +2987,15 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) { unsigned long flags; - struct irq_pin_list *entry; - struct mp_chip_data *data = irq_data->chip_data; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, data->entry); + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; } void mp_irqdomain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 63287659adb6..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,6 +66,31 @@ static void setup_apic_flat_routing(void) #endif } +static int default_apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +/* + * Set up the logical destination ID. Intel recommends to set DFR, LDR and + * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" + * (Intel document number 292116). + */ +static void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +static int default_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + /* should be called last. */ static int probe_default(void) { @@ -80,16 +105,14 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, - .target_cpus = default_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -102,7 +125,7 @@ static struct apic apic_default __ro_after_init = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 88c214e75a6b..3cc471beb50b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1,5 +1,5 @@ /* - * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc. + * Local APIC related interfaces to support IOAPIC, MSI, etc. * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. @@ -11,6 +11,7 @@ * published by the Free Software Foundation. */ #include <linux/interrupt.h> +#include <linux/seq_file.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/slab.h> @@ -21,20 +22,30 @@ #include <asm/desc.h> #include <asm/irq_remapping.h> +#include <asm/trace/irq_vectors.h> + struct apic_chip_data { - struct irq_cfg cfg; - cpumask_var_t domain; - cpumask_var_t old_domain; - u8 move_in_progress : 1; + struct irq_cfg hw_irq_cfg; + unsigned int vector; + unsigned int prev_vector; + unsigned int cpu; + unsigned int prev_cpu; + unsigned int irq; + struct hlist_node clist; + unsigned int move_in_progress : 1, + is_managed : 1, + can_reserve : 1, + has_reserved : 1; }; struct irq_domain *x86_vector_domain; EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); -static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; +static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; -#ifdef CONFIG_X86_IO_APIC -static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; +static struct irq_matrix *vector_matrix; +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct hlist_head, cleanup_list); #endif void lock_vector_lock(void) @@ -50,22 +61,37 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) { - if (!irq_data) + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) +{ + if (!irqd) return NULL; - while (irq_data->parent_data) - irq_data = irq_data->parent_data; + while (irqd->parent_data) + irqd = irqd->parent_data; - return irq_data->chip_data; + return irqd->chip_data; } -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +struct irq_cfg *irqd_cfg(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct apic_chip_data *apicd = apic_chip_data(irqd); - return data ? &data->cfg : NULL; + return apicd ? &apicd->hw_irq_cfg : NULL; } EXPORT_SYMBOL_GPL(irqd_cfg); @@ -76,270 +102,407 @@ struct irq_cfg *irq_cfg(unsigned int irq) static struct apic_chip_data *alloc_apic_chip_data(int node) { - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); - if (!data) - return NULL; - if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) - goto out_data; - if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) - goto out_domain; - return data; -out_domain: - free_cpumask_var(data->domain); -out_data: - kfree(data); - return NULL; -} - -static void free_apic_chip_data(struct apic_chip_data *data) -{ - if (data) { - free_cpumask_var(data->domain); - free_cpumask_var(data->old_domain); - kfree(data); + apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); + if (apicd) + INIT_HLIST_NODE(&apicd->clist); + return apicd; +} + +static void free_apic_chip_data(struct apic_chip_data *apicd) +{ + kfree(apicd); +} + +static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, + unsigned int cpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + lockdep_assert_held(&vector_lock); + + apicd->hw_irq_cfg.vector = vector; + apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + trace_vector_config(irqd->irq, vector, cpu, + apicd->hw_irq_cfg.dest_apicid); +} + +static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, + unsigned int newcpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + struct irq_desc *desc = irq_data_to_desc(irqd); + + lockdep_assert_held(&vector_lock); + + trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, + apicd->cpu); + + /* Setup the vector move, if required */ + if (apicd->vector && cpu_online(apicd->cpu)) { + apicd->move_in_progress = true; + apicd->prev_vector = apicd->vector; + apicd->prev_cpu = apicd->cpu; + } else { + apicd->prev_vector = 0; } + + apicd->vector = newvec; + apicd->cpu = newcpu; + BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); + per_cpu(vector_irq, newcpu)[newvec] = desc; } -static int __assign_irq_vector(int irq, struct apic_chip_data *d, - const struct cpumask *mask, - struct irq_data *irqdata) +static void vector_assign_managed_shutdown(struct irq_data *irqd) { - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, vector; + unsigned int cpu = cpumask_first(cpu_online_mask); - /* - * If there is still a move in progress or the previous move has not - * been cleaned up completely, tell the caller to come back later. - */ - if (d->move_in_progress || - cpumask_intersects(d->old_domain, cpu_online_mask)) - return -EBUSY; + apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu); +} - /* Only try and allocate irqs on cpus that are present */ - cpumask_clear(d->old_domain); - cpumask_clear(searched_cpumask); - cpu = cpumask_first_and(mask, cpu_online_mask); - while (cpu < nr_cpu_ids) { - int new_cpu, offset; +static int reserve_managed_vector(struct irq_data *irqd) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret; - /* Get the possible target cpus for @mask/@cpu from the apic */ - apic->vector_allocation_domain(cpu, vector_cpumask, mask); + raw_spin_lock_irqsave(&vector_lock, flags); + apicd->is_managed = true; + ret = irq_matrix_reserve_managed(vector_matrix, affmsk); + raw_spin_unlock_irqrestore(&vector_lock, flags); + trace_vector_reserve_managed(irqd->irq, ret); + return ret; +} - /* - * Clear the offline cpus from @vector_cpumask for searching - * and verify whether the result overlaps with @mask. If true, - * then the call to apic->cpu_mask_to_apicid() will - * succeed as well. If not, no point in trying to find a - * vector in this mask. - */ - cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask); - if (!cpumask_intersects(vector_searchmask, mask)) - goto next_cpu; - - if (cpumask_subset(vector_cpumask, d->domain)) { - if (cpumask_equal(vector_cpumask, d->domain)) - goto success; - /* - * Mark the cpus which are not longer in the mask for - * cleanup. - */ - cpumask_andnot(d->old_domain, d->domain, vector_cpumask); - vector = d->cfg.vector; - goto update; - } +static void reserve_irq_vector_locked(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + irq_matrix_reserve(vector_matrix); + apicd->can_reserve = true; + apicd->has_reserved = true; + irqd_set_can_reserve(irqd); + trace_vector_reserve(irqd->irq, 0); + vector_assign_managed_shutdown(irqd); +} - vector = current_vector; - offset = current_offset; -next: - vector += 16; - if (vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) % 16; - vector = FIRST_EXTERNAL_VECTOR + offset; - } +static int reserve_irq_vector(struct irq_data *irqd) +{ + unsigned long flags; - /* If the search wrapped around, try the next cpu */ - if (unlikely(current_vector == vector)) - goto next_cpu; + raw_spin_lock_irqsave(&vector_lock, flags); + reserve_irq_vector_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return 0; +} - if (test_bit(vector, used_vectors)) - goto next; +static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool resvd = apicd->has_reserved; + unsigned int cpu = apicd->cpu; + int vector = apicd->vector; - for_each_cpu(new_cpu, vector_searchmask) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) - goto next; - } - /* Found one! */ - current_vector = vector; - current_offset = offset; - /* Schedule the old vector for cleanup on all cpus */ - if (d->cfg.vector) - cpumask_copy(d->old_domain, d->domain); - for_each_cpu(new_cpu, vector_searchmask) - per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); - goto update; - -next_cpu: - /* - * We exclude the current @vector_cpumask from the requested - * @mask and try again with the next online cpu in the - * result. We cannot modify @mask, so we use @vector_cpumask - * as a temporary buffer here as it will be reassigned when - * calling apic->vector_allocation_domain() above. - */ - cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask); - cpumask_andnot(vector_cpumask, mask, searched_cpumask); - cpu = cpumask_first_and(vector_cpumask, cpu_online_mask); - continue; - } - return -ENOSPC; + lockdep_assert_held(&vector_lock); -update: /* - * Exclude offline cpus from the cleanup mask and set the - * move_in_progress flag when the result is not empty. + * If the current target CPU is online and in the new requested + * affinity mask, there is no point in moving the interrupt from + * one CPU to another. */ - cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); - d->move_in_progress = !cpumask_empty(d->old_domain); - d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; - d->cfg.vector = vector; - cpumask_copy(d->domain, vector_cpumask); -success: - /* - * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail - * as we already established, that mask & d->domain & cpu_online_mask - * is not empty. - * - * vector_searchmask is a subset of d->domain and has the offline - * cpus masked out. - */ - cpumask_and(vector_searchmask, vector_searchmask, mask); - BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata, - &d->cfg.dest_apicid)); + if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) + return 0; + + vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); + if (vector > 0) + apic_update_vector(irqd, vector, cpu); + trace_vector_alloc(irqd->irq, vector, resvd, vector); + return vector; +} + +static int assign_vector_locked(struct irq_data *irqd, + const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector = allocate_vector(irqd, dest); + + if (vector < 0) + return vector; + + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } -static int assign_irq_vector(int irq, struct apic_chip_data *data, - const struct cpumask *mask, - struct irq_data *irqdata) +static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) { - int err; unsigned long flags; + int ret; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, data, mask, irqdata); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + ret = assign_vector_locked(irqd, vector_searchmask); raw_spin_unlock_irqrestore(&vector_lock, flags); - return err; + return ret; } -static int assign_irq_vector_policy(int irq, int node, - struct apic_chip_data *data, - struct irq_alloc_info *info, - struct irq_data *irqdata) +static int assign_irq_vector_any_locked(struct irq_data *irqd) { - if (info && info->mask) - return assign_irq_vector(irq, data, info->mask, irqdata); - if (node != NUMA_NO_NODE && - assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) + /* Get the affinity mask - either irq_default_affinity or (user) set */ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + int node = irq_data_get_node(irqd); + + if (node == NUMA_NO_NODE) + goto all; + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; +all: + /* Try the full affinity mask */ + cpumask_and(vector_searchmask, affmsk, cpu_online_mask); + if (!assign_vector_locked(irqd, vector_searchmask)) return 0; - return assign_irq_vector(irq, data, apic->target_cpus(), irqdata); + /* Try the full online mask */ + return assign_vector_locked(irqd, cpu_online_mask); +} + +static int +assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) +{ + if (irqd_affinity_is_managed(irqd)) + return reserve_managed_vector(irqd); + if (info->mask) + return assign_irq_vector(irqd, info->mask); + /* + * Make only a global reservation with no guarantee. A real vector + * is associated at activation time. + */ + return reserve_irq_vector(irqd); } -static void clear_irq_vector(int irq, struct apic_chip_data *data) +static int +assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) { - struct irq_desc *desc; - int cpu, vector; + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector, cpu; - if (!data->cfg.vector) + cpumask_and(vector_searchmask, vector_searchmask, affmsk); + cpu = cpumask_first(vector_searchmask); + if (cpu >= nr_cpu_ids) + return -EINVAL; + /* set_affinity might call here for nothing */ + if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) + return 0; + vector = irq_matrix_alloc_managed(vector_matrix, cpu); + trace_vector_alloc_managed(irqd->irq, vector, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + return 0; +} + +static void clear_irq_vector(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool managed = irqd_affinity_is_managed(irqd); + unsigned int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + if (!vector) return; - vector = data->cfg.vector; - for_each_cpu_and(cpu, data->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, + apicd->prev_cpu); - data->cfg.vector = 0; - cpumask_clear(data->domain); + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apicd->vector = 0; - /* - * If move is in progress or the old_domain mask is not empty, - * i.e. the cleanup IPI has not been processed yet, we need to remove - * the old references to desc from all cpus vector tables. - */ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) + /* Clean up move in progress */ + vector = apicd->prev_vector; + if (!vector) return; - desc = irq_to_desc(irq); - for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { - if (per_cpu(vector_irq, cpu)[vector] != desc) - continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - break; - } + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); +} + +static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + trace_vector_deactivate(irqd->irq, apicd->is_managed, + apicd->can_reserve, false); + + /* Regular fixed assigned interrupt */ + if (!apicd->is_managed && !apicd->can_reserve) + return; + /* If the interrupt has a global reservation, nothing to do */ + if (apicd->has_reserved) + return; + + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + if (apicd->can_reserve) + reserve_irq_vector_locked(irqd); + else + vector_assign_managed_shutdown(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +static int activate_reserved(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int ret; + + ret = assign_irq_vector_any_locked(irqd); + if (!ret) { + apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; } - data->move_in_progress = 0; + return ret; } -void init_irq_alloc_info(struct irq_alloc_info *info, - const struct cpumask *mask) +static int activate_managed(struct irq_data *irqd) { - memset(info, 0, sizeof(*info)); - info->mask = mask; + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + int ret; + + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { + /* Something in the core code broke! Survive gracefully */ + pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); + return EINVAL; + } + + ret = assign_managed_vector(irqd, vector_searchmask); + /* + * This should not happen. The vector reservation got buggered. Handle + * it gracefully. + */ + if (WARN_ON_ONCE(ret < 0)) { + pr_err("Managed startup irq %u, no vector available\n", + irqd->irq); + } + return ret; } -void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, + bool reserve) { - if (src) - *dst = *src; - else - memset(dst, 0, sizeof(*dst)); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret = 0; + + trace_vector_activate(irqd->irq, apicd->is_managed, + apicd->can_reserve, reserve); + + /* Nothing to do for fixed assigned vectors */ + if (!apicd->can_reserve && !apicd->is_managed) + return 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + if (reserve || irqd_is_managed_and_shutdown(irqd)) + vector_assign_managed_shutdown(irqd); + else if (apicd->is_managed) + ret = activate_managed(irqd); + else if (apicd->has_reserved) + ret = activate_reserved(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void vector_free_reserved_and_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + + trace_vector_teardown(irqd->irq, apicd->is_managed, + apicd->has_reserved); + + if (apicd->has_reserved) + irq_matrix_remove_reserved(vector_matrix); + if (apicd->is_managed) + irq_matrix_remove_managed(vector_matrix, dest); } static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct apic_chip_data *apic_data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; unsigned long flags; int i; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); - if (irq_data && irq_data->chip_data) { + irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irqd && irqd->chip_data) { raw_spin_lock_irqsave(&vector_lock, flags); - clear_irq_vector(virq + i, irq_data->chip_data); - apic_data = irq_data->chip_data; - irq_domain_reset_irq_data(irq_data); + clear_irq_vector(irqd); + vector_free_reserved_and_managed(irqd); + apicd = irqd->chip_data; + irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); - free_apic_chip_data(apic_data); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs()) - legacy_irq_data[virq + i] = NULL; -#endif + free_apic_chip_data(apicd); } } } +static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, + struct apic_chip_data *apicd) +{ + unsigned long flags; + bool realloc = false; + + apicd->vector = ISA_IRQ_VECTOR(virq); + apicd->cpu = 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + /* + * If the interrupt is activated, then it must stay at this vector + * position. That's usually the timer interrupt (0). + */ + if (irqd_is_activated(irqd)) { + trace_vector_setup(virq, true, 0); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); + } else { + /* Release the vector */ + apicd->can_reserve = true; + irqd_set_can_reserve(irqd); + clear_irq_vector(irqd); + realloc = true; + } + raw_spin_unlock_irqrestore(&vector_lock, flags); + return realloc; +} + static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; - struct apic_chip_data *data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; int i, err, node; if (disable_apic) @@ -350,46 +513,99 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, return -ENOSYS; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(domain, virq + i); - BUG_ON(!irq_data); - node = irq_data_get_node(irq_data); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) - data = legacy_irq_data[virq + i]; - else -#endif - data = alloc_apic_chip_data(node); - if (!data) { + irqd = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irqd); + node = irq_data_get_node(irqd); + WARN_ON_ONCE(irqd->chip_data); + apicd = alloc_apic_chip_data(node); + if (!apicd) { err = -ENOMEM; goto error; } - irq_data->chip = &lapic_controller; - irq_data->chip_data = data; - irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, node, data, info, - irq_data); - if (err) - goto error; + apicd->irq = virq + i; + irqd->chip = &lapic_controller; + irqd->chip_data = apicd; + irqd->hwirq = virq + i; + irqd_set_single_target(irqd); /* - * If the apic destination mode is physical, then the - * effective affinity is restricted to a single target - * CPU. Mark the interrupt accordingly. + * Legacy vectors are already assigned when the IOAPIC + * takes them over. They stay on the same vector. This is + * required for check_timer() to work correctly as it might + * switch back to legacy mode. Only update the hardware + * config. */ - if (!apic->irq_dest_mode) - irqd_set_single_target(irq_data); + if (info->flags & X86_IRQ_ALLOC_LEGACY) { + if (!vector_configure_legacy(virq + i, irqd, apicd)) + continue; + } + + err = assign_irq_vector_policy(irqd, info); + trace_vector_setup(virq + i, false, err); + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); + goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + unsigned int cpu, vector, prev_cpu, prev_vector; + struct apic_chip_data *apicd; + unsigned long flags; + int irq; + + if (!irqd) { + irq_matrix_debug_show(m, vector_matrix, ind); + return; + } + + irq = irqd->irq; + if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) { + seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq)); + seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, ""); + return; + } + + apicd = irqd->chip_data; + if (!apicd) { + seq_printf(m, "%*sVector: Not assigned\n", ind, ""); + return; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + cpu = apicd->cpu; + vector = apicd->vector; + prev_cpu = apicd->prev_cpu; + prev_vector = apicd->prev_vector; + raw_spin_unlock_irqrestore(&vector_lock, flags); + seq_printf(m, "%*sVector: %5u\n", ind, "", vector); + seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); + if (prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); + } +} +#endif + static const struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, + .activate = x86_vector_activate, + .deactivate = x86_vector_deactivate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = x86_vector_debug_show, +#endif }; int __init arch_probe_nr_irqs(void) @@ -400,7 +616,7 @@ int __init arch_probe_nr_irqs(void) nr_irqs = NR_VECTORS * nr_cpu_ids; nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) +#if defined(CONFIG_PCI_MSI) /* * for MSI and HT dyn irq */ @@ -419,35 +635,40 @@ int __init arch_probe_nr_irqs(void) return legacy_pic->probe(); } -#ifdef CONFIG_X86_IO_APIC -static void __init init_legacy_irqs(void) +void lapic_assign_legacy_vector(unsigned int irq, bool replace) { - int i, node = cpu_to_node(0); - struct apic_chip_data *data; - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * ISA_IRQ_VECTOR(i) for all cpu's. + * Use assign system here so it wont get accounted as allocated + * and moveable in the cpu hotplug check and it prevents managed + * irq reservation from touching it. */ - for (i = 0; i < nr_legacy_irqs(); i++) { - data = legacy_irq_data[i] = alloc_apic_chip_data(node); - BUG_ON(!data); + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); +} + +void __init lapic_assign_system_vectors(void) +{ + unsigned int i, vector = 0; + + for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + irq_matrix_assign_system(vector_matrix, vector, false); + + if (nr_legacy_irqs() > 1) + lapic_assign_legacy_vector(PIC_CASCADE_IR, false); + + /* System vectors are reserved, online it */ + irq_matrix_online(vector_matrix); - data->cfg.vector = ISA_IRQ_VECTOR(i); - cpumask_setall(data->domain); - irq_set_chip_data(i, data); + /* Mark the preallocated legacy interrupts */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } } -#else -static inline void init_legacy_irqs(void) { } -#endif int __init arch_early_irq_init(void) { struct fwnode_handle *fn; - init_legacy_irqs(); - fn = irq_domain_alloc_named_fwnode("VECTOR"); BUG_ON(!fn); x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, @@ -457,102 +678,116 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); - arch_init_htirq_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); + + /* + * Allocate the vector matrix allocator data structure and limit the + * search area. + */ + vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR, + FIRST_SYSTEM_VECTOR); + BUG_ON(!vector_matrix); return arch_early_ioapic_init(); } -/* Initialize vector_irq on a new cpu */ -static void __setup_vector_irq(int cpu) +#ifdef CONFIG_SMP + +static struct irq_desc *__setup_vector_irq(int vector) { - struct apic_chip_data *data; - struct irq_desc *desc; - int irq, vector; + int isairq = vector - ISA_IRQ_VECTOR(0); + + /* Check whether the irq is in the legacy space */ + if (isairq < 0 || isairq >= nr_legacy_irqs()) + return VECTOR_UNUSED; + /* Check whether the irq is handled by the IOAPIC */ + if (test_bit(isairq, &io_apic_irqs)) + return VECTOR_UNUSED; + return irq_to_desc(isairq); +} - /* Mark the inuse vectors */ - for_each_irq_desc(irq, desc) { - struct irq_data *idata = irq_desc_get_irq_data(desc); +/* Online the local APIC infrastructure and initialize the vectors */ +void lapic_online(void) +{ + unsigned int vector; - data = apic_chip_data(idata); - if (!data || !cpumask_test_cpu(cpu, data->domain)) - continue; - vector = data->cfg.vector; - per_cpu(vector_irq, cpu)[vector] = desc; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - desc = per_cpu(vector_irq, cpu)[vector]; - if (IS_ERR_OR_NULL(desc)) - continue; + lockdep_assert_held(&vector_lock); - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!cpumask_test_cpu(cpu, data->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - } + /* Online the vector matrix array for this CPU */ + irq_matrix_online(vector_matrix); + + /* + * The interrupt affinity logic never targets interrupts to offline + * CPUs. The exception are the legacy PIC interrupts. In general + * they are only targeted to CPU0, but depending on the platform + * they can be distributed to any online CPU in hardware. The + * kernel has no influence on that. So all active legacy vectors + * must be installed on all CPUs. All non legacy interrupts can be + * cleared. + */ + for (vector = 0; vector < NR_VECTORS; vector++) + this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); } -/* - * Setup the vector to irq mappings. Must be called with vector_lock held. - */ -void setup_vector_irq(int cpu) +void lapic_offline(void) { - int irq; + lock_vector_lock(); + irq_matrix_offline(vector_matrix); + unlock_vector_lock(); +} + +static int apic_set_affinity(struct irq_data *irqd, + const struct cpumask *dest, bool force) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int err; - lockdep_assert_held(&vector_lock); /* - * On most of the platforms, legacy PIC delivers the interrupts on the - * boot cpu. But there are certain platforms where PIC interrupts are - * delivered to multiple cpu's. If the legacy IRQ is handled by the - * legacy PIC, for the new cpu that is coming online, setup the static - * legacy vector to irq mapping: + * Core code can call here for inactive interrupts. For inactive + * interrupts which use managed or reservation mode there is no + * point in going through the vector assignment right now as the + * activation will assign a vector which fits the destination + * cpumask. Let the core code store the destination mask and be + * done with it. */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); + if (!irqd_is_activated(irqd) && + (apicd->is_managed || apicd->can_reserve)) + return IRQ_SET_MASK_OK; - __setup_vector_irq(cpu); + raw_spin_lock(&vector_lock); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (irqd_affinity_is_managed(irqd)) + err = assign_managed_vector(irqd, vector_searchmask); + else + err = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock(&vector_lock); + return err ? err : IRQ_SET_MASK_OK; } -static int apic_retrigger_irq(struct irq_data *irq_data) +#else +# define apic_set_affinity NULL +#endif + +static int apic_retrigger_irq(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; - int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(data->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); + apic->send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } -void apic_ack_edge(struct irq_data *data) +void apic_ack_edge(struct irq_data *irqd) { - irq_complete_move(irqd_cfg(data)); - irq_move_irq(data); + irq_complete_move(irqd_cfg(irqd)); + irq_move_irq(irqd); ack_APIC_irq(); } -static int apic_set_affinity(struct irq_data *irq_data, - const struct cpumask *dest, bool force) -{ - struct apic_chip_data *data = irq_data->chip_data; - int err, irq = irq_data->irq; - - if (!IS_ENABLED(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(dest, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, data, dest, irq_data); - return err ? err : IRQ_SET_MASK_OK; -} - static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, @@ -561,115 +796,98 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct apic_chip_data *data) -{ - raw_spin_lock(&vector_lock); - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - data->move_in_progress = 0; - if (!cpumask_empty(data->old_domain)) - apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR); - raw_spin_unlock(&vector_lock); -} -void send_cleanup_vector(struct irq_cfg *cfg) +static void free_moved_vector(struct apic_chip_data *apicd) { - struct apic_chip_data *data; + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; - data = container_of(cfg, struct apic_chip_data, cfg); - if (data->move_in_progress) - __send_cleanup_vector(data); + /* + * This should never happen. Managed interrupts are not + * migrated except on CPU down, which does not involve the + * cleanup vector. But try to keep the accounting correct + * nevertheless. + */ + WARN_ON_ONCE(managed); + + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; } asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { - unsigned vector, me; + struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); + struct apic_chip_data *apicd; + struct hlist_node *tmp; entering_ack_irq(); - /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - struct apic_chip_data *data; - struct irq_desc *desc; - unsigned int irr; - - retry: - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - - if (!raw_spin_trylock(&desc->lock)) { - raw_spin_unlock(&vector_lock); - cpu_relax(); - raw_spin_lock(&vector_lock); - goto retry; - } - - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!data) - goto unlock; - - /* - * Nothing to cleanup if irq migration is in progress - * or this cpu is not set in the cleanup mask. - */ - if (data->move_in_progress || - !cpumask_test_cpu(me, data->old_domain)) - goto unlock; - - /* - * We have two cases to handle here: - * 1) vector is unchanged but the target mask got reduced - * 2) vector and the target mask has changed - * - * #1 is obvious, but in #2 we have two vectors with the same - * irq descriptor: the old and the new vector. So we need to - * make sure that we only cleanup the old vector. The new - * vector has the current @vector number in the config and - * this cpu is part of the target mask. We better leave that - * one alone. - */ - if (vector == data->cfg.vector && - cpumask_test_cpu(me, data->domain)) - goto unlock; + hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + unsigned int irr, vector = apicd->prev_vector; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); /* - * Check if the vector that needs to be cleanedup is - * registered at the cpu's IRR. If so, then this is not - * the best time to clean it up. Lets clean it up in the + * Paranoia: Check if the vector that needs to be cleaned + * up is registered at the APICs IRR. If so, then this is + * not the best time to clean it up. Clean it up in the * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to myself. + * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest + * priority external vector, so on return from this + * interrupt the device interrupt will happen first. */ - if (irr & (1 << (vector % 32))) { + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1U << (vector % 32))) { apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); - goto unlock; + continue; } - __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); - cpumask_clear_cpu(me, data->old_domain); -unlock: - raw_spin_unlock(&desc->lock); + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); - exiting_irq(); } +static void __send_cleanup_vector(struct apic_chip_data *apicd) +{ + unsigned int cpu; + + raw_spin_lock(&vector_lock); + apicd->move_in_progress = 0; + cpu = apicd->prev_cpu; + if (cpu_online(cpu)) { + hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); + apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + } else { + apicd->prev_vector = 0; + } + raw_spin_unlock(&vector_lock); +} + +void send_cleanup_vector(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (apicd->move_in_progress) + __send_cleanup_vector(apicd); +} + static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - unsigned me; - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = container_of(cfg, struct apic_chip_data, cfg); - if (likely(!data->move_in_progress)) + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (likely(!apicd->move_in_progress)) return; - me = smp_processor_id(); - if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) - __send_cleanup_vector(data); + if (vector == apicd->vector && apicd->cpu == smp_processor_id()) + __send_cleanup_vector(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -682,10 +900,9 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqdata; - struct apic_chip_data *data; - struct irq_cfg *cfg; - unsigned int cpu; + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned int vector; /* * The function is called for all descriptors regardless of which @@ -696,43 +913,31 @@ void irq_force_complete_move(struct irq_desc *desc) * Check first that the chip_data is what we expect * (apic_chip_data) before touching it any further. */ - irqdata = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); - if (!irqdata) + irqd = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqd) return; - data = apic_chip_data(irqdata); - cfg = data ? &data->cfg : NULL; + raw_spin_lock(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + goto unlock; - if (!cfg) - return; + /* + * If prev_vector is empty, no action required. + */ + vector = apicd->prev_vector; + if (!vector) + goto unlock; /* - * This is tricky. If the cleanup of @data->old_domain has not been + * This is tricky. If the cleanup of the old vector has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * * All CPUs are stuck in stop machine with interrupts disabled so * calling __irq_complete_move() would be completely pointless. - */ - raw_spin_lock(&vector_lock); - /* - * Clean out all offline cpus (including the outgoing one) from the - * old_domain mask. - */ - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - - /* - * If move_in_progress is cleared and the old_domain mask is empty, - * then there is nothing to cleanup. fixup_irqs() will take care of - * the stale vectors on the outgoing cpu. - */ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) { - raw_spin_unlock(&vector_lock); - return; - } - - /* + * * 1) The interrupt is in move_in_progress state. That means that we * have not seen an interrupt since the io_apic was reprogrammed to * the new vector. @@ -740,7 +945,7 @@ void irq_force_complete_move(struct irq_desc *desc) * 2) The interrupt has fired on the new vector, but the cleanup IPIs * have not been processed yet. */ - if (data->move_in_progress) { + if (apicd->move_in_progress) { /* * In theory there is a race: * @@ -774,21 +979,43 @@ void irq_force_complete_move(struct irq_desc *desc) * area arises. */ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqdata->irq, cfg->old_vector); + irqd->irq, vector); } - /* - * If old_domain is not empty, then other cpus still have the irq - * descriptor set in their vector array. Clean it up. - */ - for_each_cpu(cpu, data->old_domain) - per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + free_moved_vector(apicd); +unlock: + raw_spin_unlock(&vector_lock); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Note, this is not accurate accounting, but at least good enough to + * prevent that the actual interrupt move will run out of vectors. + */ +int lapic_can_unplug_cpu(void) +{ + unsigned int rsvd, avl, tomove, cpu = smp_processor_id(); + int ret = 0; - /* Cleanup the left overs of the (half finished) move */ - cpumask_clear(data->old_domain); - data->move_in_progress = 0; + raw_spin_lock(&vector_lock); + tomove = irq_matrix_allocated(vector_matrix); + avl = irq_matrix_available(vector_matrix, true); + if (avl < tomove) { + pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n", + cpu, tomove, avl); + ret = -ENOSPC; + goto out; + } + rsvd = irq_matrix_reserved(vector_matrix); + if (avl < rsvd) { + pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n", + rsvd, avl); + } +out: raw_spin_unlock(&vector_lock); + return ret; } -#endif +#endif /* HOTPLUG_CPU */ +#endif /* SMP */ static void __init print_APIC_field(int base) { diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h new file mode 100644 index 000000000000..b107de381cb5 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic.h @@ -0,0 +1,9 @@ +/* Common bits for X2APIC cluster/physical modes. */ + +int x2apic_apic_id_valid(int apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e216cf3d64d2..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -9,22 +9,24 @@ #include <linux/cpu.h> #include <asm/smp.h> -#include <asm/x2apic.h> +#include "x2apic.h" + +struct cluster_mask { + unsigned int clusterid; + int node; + struct cpumask mask; +}; static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); -static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); +static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -static inline u32 x2apic_cluster(int cpu) -{ - return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; -} - static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); @@ -36,49 +38,34 @@ static void x2apic_send_IPI(int cpu, int vector) static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - struct cpumask *cpus_in_cluster_ptr; - struct cpumask *ipi_mask_ptr; - unsigned int cpu, this_cpu; + unsigned int cpu, clustercpu; + struct cpumask *tmpmsk; unsigned long flags; u32 dest; x2apic_wrmsr_fence(); - local_irq_save(flags); - this_cpu = smp_processor_id(); + tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); + cpumask_copy(tmpmsk, mask); + /* If IPI should not be sent to self, clear current CPU */ + if (apic_dest != APIC_DEST_ALLINC) + cpumask_clear_cpu(smp_processor_id(), tmpmsk); - /* - * We are to modify mask, so we need an own copy - * and be sure it's manipulated with irq off. - */ - ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); - cpumask_copy(ipi_mask_ptr, mask); - - /* - * The idea is to send one IPI per cluster. - */ - for_each_cpu(cpu, ipi_mask_ptr) { - unsigned long i; + /* Collapse cpus in a cluster so a single IPI per cluster is sent */ + for_each_cpu(cpu, tmpmsk) { + struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); - cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); dest = 0; - - /* Collect cpus in cluster. */ - for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { - if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) - dest |= per_cpu(x86_cpu_to_logical_apicid, i); - } + for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); if (!dest) continue; __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); - /* - * Cluster sibling cpus should be discared now so - * we would not send IPI them second time. - */ - cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); + /* Remove cluster CPUs from tmpmask */ + cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } local_irq_restore(flags); @@ -105,125 +92,90 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static int -x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) +static u32 x2apic_calc_apicid(unsigned int cpu) { - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - unsigned int cpu; - u32 dest = 0; - u16 cluster; - - cpu = cpumask_first(mask); - if (cpu >= nr_cpu_ids) - return -EINVAL; - - dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - cluster = x2apic_cluster(cpu); - - cpumask_clear(effmsk); - for_each_cpu(cpu, mask) { - if (cluster != x2apic_cluster(cpu)) - continue; - dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); - cpumask_set_cpu(cpu, effmsk); - } - - *apicid = dest; - return 0; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static void init_x2apic_ldr(void) { - unsigned int this_cpu = smp_processor_id(); + struct cluster_mask *cmsk = this_cpu_read(cluster_masks); + u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + this_cpu_write(x86_cpu_to_logical_apicid, apicid); + + if (cmsk) + goto update; - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + cluster = apicid >> 16; for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cmsk = per_cpu(cluster_masks, cpu); + /* Matching cluster found. Link and update it. */ + if (cmsk && cmsk->clusterid == cluster) + goto update; } + cmsk = cluster_hotplug_mask; + cluster_hotplug_mask = NULL; +update: + this_cpu_write(cluster_masks, cmsk); + cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -/* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int x2apic_prepare_cpu(unsigned int cpu) +static int alloc_clustermask(unsigned int cpu, int node) { - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) - return -ENOMEM; + if (per_cpu(cluster_masks, cpu)) + return 0; + /* + * If a hotplug spare mask exists, check whether it's on the right + * node. If not, free it and allocate a new one. + */ + if (cluster_hotplug_mask) { + if (cluster_hotplug_mask->node == node) + return 0; + kfree(cluster_hotplug_mask); + } - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); + cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), + GFP_KERNEL, node); + if (!cluster_hotplug_mask) return -ENOMEM; - } + cluster_hotplug_mask->node = node; + return 0; +} +static int x2apic_prepare_cpu(unsigned int cpu) +{ + if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + return -ENOMEM; + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + return -ENOMEM; return 0; } -static int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_dead_cpu(unsigned int dead_cpu) { - int cpu; + struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + cpumask_clear_cpu(dead_cpu, &cmsk->mask); + free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - int ret; - if (!x2apic_mode) return 0; - ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", - x2apic_prepare_cpu, x2apic_dead_cpu); - if (ret < 0) { + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", + x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); return 0; } - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + init_x2apic_ldr(); return 1; } -static const struct cpumask *x2apic_cluster_target_cpus(void) -{ - return cpu_all_mask; -} - -/* - * Each x2apic cluster is an allocation domain. - */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - /* - * To minimize vector pressure, default case of boot, device bringup - * etc will use a single cpu for the interrupt destination. - * - * On explicit migration requests coming from irqbalance etc, - * interrupts will be routed to the x2apic cluster (cluster-id - * derived from the first cpu in the mask) members specified - * in the mask. - */ - if (mask == x2apic_cluster_target_cpus()) - cpumask_copy(retmask, cpumask_of(cpu)); - else - cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); -} - static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", @@ -232,15 +184,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -253,7 +203,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .calc_dest_apicid = x2apic_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b94d35320f85..f8d9d69994e6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,7 +7,8 @@ #include <linux/dmar.h> #include <asm/smp.h> -#include <asm/x2apic.h> +#include <asm/ipi.h> +#include "x2apic.h" int x2apic_phys; @@ -99,6 +100,43 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } +/* Common x2apic functions, also used by x2apic_cluster */ +int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + +int x2apic_apic_id_registered(void) +{ + return 1; +} + +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +u32 x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", @@ -110,12 +148,10 @@ static struct apic apic_x2apic_phys __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -128,7 +164,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c0b694810ff4..e1b8e8bf6b3c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -154,6 +154,48 @@ static int __init early_get_pnodeid(void) return pnode; } +static void __init uv_tsc_check_sync(void) +{ + u64 mmr; + int sync_state; + int mmr_shift; + char *state; + bool valid; + + /* Accommodate different UV arch BIOSes */ + mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); + mmr_shift = + is_uv1_hub() ? 0 : + is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; + if (mmr_shift) + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + else + sync_state = 0; + + switch (sync_state) { + case UVH_TSC_SYNC_VALID: + state = "in sync"; + valid = true; + break; + + case UVH_TSC_SYNC_INVALID: + state = "unstable"; + valid = false; + break; + default: + state = "unknown: assuming valid"; + valid = true; + break; + } + pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); + + /* Mark flag that says TSC != 0 is valid for socket 0 */ + if (valid) + mark_tsc_async_resets("UV BIOS"); + else + mark_tsc_unstable("UV BIOS"); +} + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -288,6 +330,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) } pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); + uv_tsc_check_sync(); return uv_apic; @@ -525,16 +568,9 @@ static void uv_init_apic_ldr(void) { } -static int -uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) +static u32 apic_uv_calc_apicid(unsigned int cpu) { - int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid); - - if (!ret) - *apicid |= uv_apicid_hibits; - - return ret; + return apic_default_calc_apicid(cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -547,7 +583,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { /* CHECKME: Do we need to mask out the xapic extra bits? */ return id; @@ -584,12 +620,10 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* Physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -602,7 +636,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .calc_dest_apicid = apic_uv_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 90cb82dbba57..570e8bb1f386 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -22,7 +22,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o -obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 0ee83321a313..7eba34df54c3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,8 @@ #include <linux/percpu.h> #include <linux/smp.h> +#include "cpu.h" + struct aperfmperf_sample { unsigned int khz; ktime_t time; @@ -24,7 +26,7 @@ struct aperfmperf_sample { static DEFINE_PER_CPU(struct aperfmperf_sample, samples); #define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_REFRESH_DELAY_MS 10 #define APERFMPERF_STALE_THRESHOLD_MS 1000 /* @@ -38,14 +40,8 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); - ktime_t now = ktime_get(); - s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return; - local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); @@ -61,31 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - s->time = now; + s->time = ktime_get(); s->aperf = aperf; s->mperf = mperf; + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); +} - /* If the previous iteration was too long ago, discard it. */ - if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) - s->khz = 0; - else - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +{ + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. */ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; } -unsigned int arch_freq_get_on_cpu(int cpu) +unsigned int aperfmperf_get_khz(int cpu) { - unsigned int khz; + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} + +void arch_freq_prepare_all(void) +{ + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ if (!cpu_khz) return 0; if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - khz = per_cpu(samples.khz, cpu); - if (khz) - return khz; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 970ee06dc8aa..c7c996a692fd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -331,6 +331,30 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n"); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -896,8 +920,8 @@ static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) * cache alignment. * The others are not touched to avoid unwanted side effects. * - * WARNING: this function is only called on the BP. Don't add code here - * that is supposed to run on all CPUs. + * WARNING: this function is only called on the boot CPU. Don't add code + * here that is supposed to run on all CPUs. */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { @@ -1188,9 +1212,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP */ + /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); + setup_umip(c); /* * The vendor-specific functions might have changed features. diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f52a370b6c00..e806b11a99af 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0c8b916abced..6936d14d4c77 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -264,21 +264,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); -#ifdef CONFIG_KMEMCHECK - /* - * P4s have a "fast strings" feature which causes single- - * stepping REP instructions to only generate a #DB on - * cache-line boundaries. - * - * Ingo Molnar reported a Pentium D (model 6) and a Xeon - * (model 2) with the same problem. - */ - if (c->x86 == 15) - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) - pr_info("kmemcheck: Disabling fast string operations\n"); -#endif - /* * If fast string is not enabled in IA32_MISC_ENABLE for any reason, * clear the fast string and enhanced fast string CPU capabilities. diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index cd5fc61ba450..99442370de40 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -267,6 +267,7 @@ static void rdt_get_cdp_l3_config(int type) r->num_closid = r_l3->num_closid / 2; r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; + r->cache.shareable_bits = r_l3->cache.shareable_bits; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; /* @@ -524,10 +525,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); - kfree(d->ctrl_val); - kfree(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); list_del(&d->list); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -544,6 +541,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); kfree(d); return; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index a43a72d8e88e..3397244984f5 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -127,12 +127,15 @@ struct rdtgroup { #define RFTYPE_BASE BIT(1) #define RF_CTRLSHIFT 4 #define RF_MONSHIFT 5 +#define RF_TOPSHIFT 6 #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) #define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_TOP BIT(RF_TOPSHIFT) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) /* List of all resource groups */ @@ -409,6 +412,10 @@ union cpuid_0x10_x_edx { unsigned int full; }; +void rdt_last_cmd_clear(void); +void rdt_last_cmd_puts(const char *s); +void rdt_last_cmd_printf(const char *fmt, ...); + void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f6ea94f8954a..23e1d5c249c6 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -42,15 +42,22 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. */ - if (!r->membw.delay_linear) + if (!r->membw.delay_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; + } ret = kstrtoul(buf, 10, &bw); - if (ret) + if (ret) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); return false; + } - if (bw < r->membw.min_bw || bw > r->default_ctrl) + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, + r->membw.min_bw, r->default_ctrl); return false; + } *data = roundup(bw, (unsigned long)r->membw.bw_gran); return true; @@ -60,8 +67,10 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if (!bw_validate(buf, &data, r)) return -EINVAL; @@ -84,20 +93,29 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) int ret; ret = kstrtoul(buf, 16, &val); - if (ret) + if (ret) { + rdt_last_cmd_printf("non-hex character in mask %s\n", buf); return false; + } - if (val == 0 || val > r->default_ctrl) + if (val == 0 || val > r->default_ctrl) { + rdt_last_cmd_puts("mask out of range\n"); return false; + } first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", val); return false; + } - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in mask\n", + r->cache.min_cbm_bits); return false; + } *data = val; return true; @@ -111,8 +129,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if(!cbm_validate(buf, &data, r)) return -EINVAL; @@ -139,8 +159,10 @@ next: return 0; dom = strsep(&line, ";"); id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); return -EINVAL; + } dom = strim(dom); list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { @@ -196,6 +218,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } + rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); return -EINVAL; } @@ -218,6 +241,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return -ENOENT; } + rdt_last_cmd_clear(); closid = rdtgrp->closid; @@ -229,6 +253,12 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, while ((tok = strsep(&buf, "\n")) != NULL) { resname = strim(strsep(&tok, ":")); if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); ret = -EINVAL; goto out; } diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 30827510094b..681450eee428 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -51,7 +51,7 @@ static LIST_HEAD(rmid_free_lru); * may have a occupancy value > intel_cqm_threshold. User can change * the threshold occupancy value. */ -unsigned int rmid_limbo_count; +static unsigned int rmid_limbo_count; /** * @rmid_entry - The entry in the limbo and free lists. diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index a869d4a073c5..64c5ff97ee0d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -24,6 +24,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> @@ -51,6 +52,31 @@ static struct kernfs_node *kn_mongrp; /* Kernel fs node for "mon_data" directory under root */ static struct kernfs_node *kn_mondata; +static struct seq_buf last_cmd_status; +static char last_cmd_status_buf[512]; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -238,8 +264,10 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) + if (cpumask_weight(tmpmask)) { + rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; + } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); @@ -291,8 +319,10 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); if (cpumask_weight(tmpmask)) { /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); return -EINVAL; + } /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, @@ -357,8 +387,10 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; + rdt_last_cmd_puts("directory was removed\n"); goto unlock; } @@ -367,13 +399,16 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, else ret = cpumask_parse(buf, newmask); - if (ret) + if (ret) { + rdt_last_cmd_puts("bad cpu list/mask\n"); goto unlock; + } /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); if (cpumask_weight(tmpmask)) { ret = -EINVAL; + rdt_last_cmd_puts("can only assign online cpus\n"); goto unlock; } @@ -452,6 +487,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ atomic_dec(&rdtgrp->waitcount); kfree(callback); + rdt_last_cmd_puts("task exited\n"); } else { /* * For ctrl_mon groups move both closid and rmid. @@ -462,10 +498,12 @@ static int __rdtgroup_move_task(struct task_struct *tsk, tsk->closid = rdtgrp->closid; tsk->rmid = rdtgrp->mon.rmid; } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) + if (rdtgrp->mon.parent->closid == tsk->closid) { tsk->rmid = rdtgrp->mon.rmid; - else + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); ret = -EINVAL; + } } } return ret; @@ -484,8 +522,10 @@ static int rdtgroup_task_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); ret = -EPERM; + } put_cred(tcred); return ret; @@ -502,6 +542,7 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); return -ESRCH; } } else { @@ -529,6 +570,7 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (rdtgrp) ret = rdtgroup_move_task(pid, rdtgrp, of); @@ -569,6 +611,21 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -686,6 +743,13 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RF_TOP_INFO, + }, + { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -855,6 +919,10 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); + ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + if (ret) + goto out_destroy; + for_each_alloc_enabled_rdt_resource(r) { fflags = r->fflags | RF_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); @@ -1081,6 +1149,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, struct dentry *dentry; int ret; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. @@ -1130,12 +1199,12 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_mondata; if (rdt_alloc_capable) - static_branch_enable(&rdt_alloc_enable_key); + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); if (rdt_mon_capable) - static_branch_enable(&rdt_mon_enable_key); + static_branch_enable_cpuslocked(&rdt_mon_enable_key); if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable(&rdt_enable_key); + static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3]; @@ -1156,7 +1225,9 @@ out_info: out_cdp: cdp_disable(); out: + rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return dentry; } @@ -1295,9 +1366,7 @@ static void rmdir_all_sub(void) kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - get_online_cpus(); update_closid_rmid(cpu_online_mask, &rdtgroup_default); - put_online_cpus(); kernfs_remove(kn_info); kernfs_remove(kn_mongrp); @@ -1308,6 +1377,7 @@ static void rdt_kill_sb(struct super_block *sb) { struct rdt_resource *r; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ @@ -1315,11 +1385,12 @@ static void rdt_kill_sb(struct super_block *sb) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); - static_branch_disable(&rdt_alloc_enable_key); - static_branch_disable(&rdt_mon_enable_key); - static_branch_disable(&rdt_enable_key); + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_disable_cpuslocked(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } static struct file_system_type rdt_fs_type = { @@ -1524,8 +1595,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; + rdt_last_cmd_puts("directory was removed\n"); goto out_unlock; } @@ -1533,6 +1606,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; + rdt_last_cmd_puts("kernel out of memory\n"); goto out_unlock; } *r = rdtgrp; @@ -1544,6 +1618,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1557,24 +1632,31 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kernfs_get(kn); ret = rdtgroup_kn_set_ugid(kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); goto out_destroy; + } - files = RFTYPE_BASE | RFTYPE_CTRL; files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); ret = rdtgroup_add_files(kn, files); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); goto out_destroy; + } if (rdt_mon_capable) { ret = alloc_rmid(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of RMIDs\n"); goto out_destroy; + } rdtgrp->mon.rmid = ret; ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_idfree; + } } kernfs_activate(kn); @@ -1652,8 +1734,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, kn = rdtgrp->kn; ret = closid_alloc(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of CLOSIDs\n"); goto out_common_fail; + } closid = ret; rdtgrp->closid = closid; @@ -1665,8 +1749,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, * of tasks and cpus to monitor. */ ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_id_free; + } } goto out_unlock; @@ -1902,6 +1988,9 @@ int __init rdtgroup_init(void) { int ret = 0; + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + ret = rdtgroup_setup_root(); if (ret) return ret; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 87cc9ab7a13c..4ca632a06e0b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -204,7 +204,7 @@ static int error_context(struct mce *m) return IN_KERNEL; } -static int mce_severity_amd_smca(struct mce *m, int err_ctx) +static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) { u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); u32 low, high; @@ -245,6 +245,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (m->status & MCI_STATUS_UC) { + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + /* * On older systems where overflow_recov flag is not present, we * should simply panic if an error overflow occurs. If @@ -255,10 +258,6 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (mce_flags.smca) return mce_severity_amd_smca(m, ctx); - /* software can try to contain */ - if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - /* kill current process */ return MCE_AR_SEVERITY; } else { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a9e898b71208..868e412b4f0c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1367,13 +1367,12 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } -static void mce_timer_fn(unsigned long data) +static void mce_timer_fn(struct timer_list *t) { - struct timer_list *t = this_cpu_ptr(&mce_timer); - int cpu = smp_processor_id(); + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); unsigned long iv; - WARN_ON(cpu != data); + WARN_ON(cpu_t != t); iv = __this_cpu_read(mce_next_interval); @@ -1763,17 +1762,15 @@ static void mce_start_timer(struct timer_list *t) static void __mcheck_cpu_setup_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); } static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); mce_start_timer(t); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c6daec4bdba5..330b8462d426 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c4fa4a85d4cb..e4fc595cd6ea 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(cpuid_eax(1)); + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 8ccdca6d3f9e..f7c55b0e753a 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *intel_ucode_patch; +/* last level cache size per core */ +static int llc_size_per_core; + static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) { @@ -910,8 +913,19 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { - pr_err_once("late loading on model 79 is disabled.\n"); + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } @@ -966,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = { .apply_microcode = apply_microcode_intel, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -976,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6b7e17bf0b71..e7ecedafa1c8 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -5,6 +5,8 @@ #include <linux/seq_file.h> #include <linux/cpufreq.h> +#include "cpu.h" + /* * Get CPU information for use by the procfs. */ @@ -78,9 +80,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); + unsigned int freq = aperfmperf_get_khz(cpu); if (!freq) + freq = cpufreq_quick_get(cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 44404e2307bb..10e74d4778a1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } -static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { struct crash_elf_data *ced = arg; Elf64_Ehdr *ehdr; @@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) ehdr = ced->ehdr; /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, start, end); + ret = elf_header_exclude_ranges(ced, res->start, res->end); if (ret) return ret; @@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) return 0; } -static int memmap_entry_callback(u64 start, u64 end, void *arg) +static int memmap_entry_callback(struct resource *res, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; struct e820_entry ei; - ei.addr = start; - ei.size = end - start + 1; + ei.addr = res->start; + ei.size = resource_size(res); ei.type = cmd->type; add_e820_entry(params, &ei); @@ -619,12 +619,12 @@ out: return ret; } -static int determine_backup_region(u64 start, u64 end, void *arg) +static int determine_backup_region(struct resource *res, void *arg) { struct kimage *image = arg; - image->arch.backup_src_start = start; - image->arch.backup_src_sz = end - start + 1; + image->arch.backup_src_start = res->start; + image->arch.backup_src_sz = resource_size(res); /* Expecting only one range for backup region */ return 1; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9c4e7ba6870c..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); @@ -155,14 +155,14 @@ void init_espfix_ap(int cpu) page = cpu/ESPFIX_STACKS_PER_PAGE; /* Did another CPU already set this up? */ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (likely(stack_page)) goto done; mutex_lock(&espfix_init_mutex); /* Did we race on the lock? */ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (stack_page) goto unlock_done; @@ -200,7 +200,7 @@ void init_espfix_ap(int cpu) set_pte(&pte_p[n*PTE_STRIDE], pte); /* Job is done for this CPU and any CPU which shares this page */ - ACCESS_ONCE(espfix_pages[page]) = stack_page; + WRITE_ONCE(espfix_pages[page], stack_page); unlock_done: mutex_unlock(&espfix_init_mutex); diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7cb8ba08beb9..91b2cff4b79a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -8,6 +8,7 @@ #include <asm/ftrace.h> #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> .code64 .section .entry.text, "ax" @@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__) EXPORT_SYMBOL(mcount) #endif -/* All cases save the original rbp (8 bytes) */ #ifdef CONFIG_FRAME_POINTER # ifdef CC_USING_FENTRY /* Save parent and function stack frames (rip and rbp) */ @@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount) # endif #else /* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE 8 +# define MCOUNT_FRAME_SIZE 0 #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ @@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount) */ .macro save_mcount_regs added=0 - /* Always save the original rbp */ +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ pushq %rbp -#ifdef CONFIG_FRAME_POINTER /* * Stack traces will stop at the ftrace trampoline if the frame pointer * is not set up properly. If fentry is used, we need to save a frame @@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount) * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. */ +#ifdef CONFIG_FRAME_POINTER movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif movq %rdx, RBP(%rsp) /* Copy the parent address into %rsi (second parameter) */ @@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount) ENTRY(function_hook) retq -END(function_hook) +ENDPROC(function_hook) ENTRY(ftrace_caller) /* save_mcount_regs fills in first two parameters */ @@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call) /* This is weak to keep gas from relaxing the jumps */ WEAK(ftrace_stub) retq -END(ftrace_caller) +ENDPROC(ftrace_caller) ENTRY(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end) jmp ftrace_epilogue -END(ftrace_regs_caller) +ENDPROC(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -291,7 +295,7 @@ trace: restore_mcount_regs jmp fgraph_trace -END(function_hook) +ENDPROC(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -END(ftrace_graph_caller) +ENDPROC(ftrace_graph_caller) -GLOBAL(return_to_handler) +ENTRY(return_to_handler) + UNWIND_HINT_EMPTY subq $24, %rsp /* Save the return values */ @@ -330,4 +335,5 @@ GLOBAL(return_to_handler) movq (%rsp), %rax addq $24, %rsp JMP_NOSPEC %rdi +END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a5d757b9cfd..7ba5d819ebe3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr, p = fixup_pointer(&phys_base, physaddr); *p += load_delta - sme_get_me_mask(); - /* Encrypt the kernel (if SME is active) */ - sme_encrypt_kernel(); + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); /* * Return the SME encryption mask (if SME is active) to be used as a diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8f5cb2c7060c..86c4439f9d74 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq) io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); enable_irq(irq); + lapic_assign_legacy_vector(irq, true); } /* diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 014cb2fc47ff..56d99be3706a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -56,7 +56,7 @@ struct idt_data { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_idts[] = { +static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), SYSG(X86_TRAP_BP, int3), #ifdef CONFIG_X86_32 @@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = { * the traps which use them are reinitialized with IST after cpu_init() has * set up TSS. */ -static const __initdata struct idt_data def_idts[] = { +static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), @@ -108,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = { /* * The APIC and SMP idt entries */ -static const __initdata struct idt_data apic_idts[] = { +static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP INTG(RESCHEDULE_VECTOR, reschedule_interrupt), INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), @@ -150,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_pf_idts[] = { +static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, page_fault), }; @@ -158,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = { * Override for the debug_idt. Same as the default, but with interrupt * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ -static const __initdata struct idt_data dbg_idts[] = { +static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), INTG(X86_TRAP_BP, int3), }; @@ -180,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * The exceptions which use Interrupt stacks. They are setup after * cpu_init() when the TSS has been initialized. */ -static const __initdata struct idt_data ist_idts[] = { +static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), SISTG(X86_TRAP_BP, int3, DEBUG_STACK), @@ -223,7 +223,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy idt_init_desc(&desc, t); write_idt_entry(idt, t->vector, &desc); if (sys) - set_bit(t->vector, used_vectors); + set_bit(t->vector, system_vectors); } } @@ -311,14 +311,14 @@ void __init idt_setup_apic_and_irq_gates(void) idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); - for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); set_intr_gate(i, entry); } - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { + for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { #ifdef CONFIG_X86_LOCAL_APIC - set_bit(i, used_vectors); + set_bit(i, system_vectors); set_intr_gate(i, spurious_interrupt); #else entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); @@ -356,7 +356,7 @@ void idt_invalidate(void *addr) void __init update_intr_gate(unsigned int n, const void *addr) { - if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + if (WARN_ON_ONCE(!test_bit(n, system_vectors))) return; set_intr_gate(n, addr); } @@ -364,6 +364,6 @@ void __init update_intr_gate(unsigned int n, const void *addr) void alloc_intr_gate(unsigned int n, const void *addr) { BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, used_vectors)) + if (!test_and_set_bit(n, system_vectors)) set_intr_gate(n, addr); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index aa9d51eea9d0..68e1867cca80 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -134,7 +134,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) { + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -321,105 +321,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU - -/* These two declarations are only used in check_irq_vectors_for_cpu_disable() - * below, which is protected by stop_machine(). Putting them on the stack - * results in a stack frame overflow. Dynamically allocating could result in a - * failure so declare these two cpumasks as global. - */ -static struct cpumask affinity_new, online_new; - -/* - * This cpu is going to be removed and its vectors migrated to the remaining - * online cpus. Check to see if there are enough vectors in the remaining cpus. - * This function is protected by stop_machine(). - */ -int check_irq_vectors_for_cpu_disable(void) -{ - unsigned int this_cpu, vector, this_count, count; - struct irq_desc *desc; - struct irq_data *data; - int cpu; - - this_cpu = smp_processor_id(); - cpumask_copy(&online_new, cpu_online_mask); - cpumask_clear_cpu(this_cpu, &online_new); - - this_count = 0; - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - /* - * Protect against concurrent action removal, affinity - * changes etc. - */ - raw_spin_lock(&desc->lock); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, - irq_data_get_affinity_mask(data)); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. */ - if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { - raw_spin_unlock(&desc->lock); - continue; - } - - raw_spin_unlock(&desc->lock); - /* - * A single irq may be mapped to multiple cpu's - * vector_irq[] (for example IOAPIC cluster mode). In - * this case we have two possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer a - * subset of the online cpus but the affinity mask is - * not zero; that is the down'd cpu is the last online - * cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; - } - /* No need to check any further. */ - if (!this_count) - return 0; - - count = 0; - for_each_online_cpu(cpu) { - if (cpu == this_cpu) - continue; - /* - * We scan from FIRST_EXTERNAL_VECTOR to first system - * vector. If the vector is marked in the used vectors - * bitmap or an irq is assigned to it, we don't count - * it as available. - * - * As this is an inaccurate snapshot anyway, we can do - * this w/o holding vector_lock. - */ - for (vector = FIRST_EXTERNAL_VECTOR; - vector < FIRST_SYSTEM_VECTOR; vector++) { - if (!test_bit(vector, used_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { - if (++count == this_count) - return 0; - } - } - } - - if (count < this_count) { - pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", - this_cpu, this_count, count); - return -ERANGE; - } - return 0; -} - /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1e4094eba15e..a539410c4ea9 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -94,6 +94,7 @@ void __init native_init_IRQ(void) x86_init.irqs.pre_vector_init(); idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 615105cf7d58..ae38dccf0c8f 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -85,11 +85,11 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. */ -extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); +extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn); /* Generate a relative-jump/call instruction */ -extern void synthesize_reljump(void *from, void *to); -extern void synthesize_relcall(void *from, void *to); +extern void synthesize_reljump(void *dest, void *from, void *to); +extern void synthesize_relcall(void *dest, void *from, void *to); #ifdef CONFIG_OPTPROBES extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0742491cbb73..bd36f3c33cd0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -119,29 +119,29 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); static nokprobe_inline void -__synthesize_relative_insn(void *from, void *to, u8 op) +__synthesize_relative_insn(void *dest, void *from, void *to, u8 op) { struct __arch_relative_insn { u8 op; s32 raddr; } __packed *insn; - insn = (struct __arch_relative_insn *)from; + insn = (struct __arch_relative_insn *)dest; insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); insn->op = op; } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -void synthesize_reljump(void *from, void *to) +void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ -void synthesize_relcall(void *from, void *to) +void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -346,10 +346,11 @@ static int is_IF_modifier(kprobe_opcode_t *insn) /* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * addressing mode. Note that since @real will be the final place of copied + * instruction, displacement must be adjust by @real, not @dest. * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) +int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; unsigned long recovered_insn = @@ -387,11 +388,11 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) * have given. */ newdisp = (u8 *) src + (s64) insn->displacement.value - - (u8 *) dest; + - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, dest, insn->displacement.value); + src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -402,20 +403,38 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) } /* Prepare reljump right after instruction to boost */ -static void prepare_boost(struct kprobe *p, struct insn *insn) +static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { + int len = insn->length; + if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ - synthesize_reljump(p->ainsn.insn + insn->length, + synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); + len += RELATIVEJUMP_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; } + + return len; +} + +/* Make page to RO mode when allocate it */ +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (page) + set_memory_ro((unsigned long)page & PAGE_MASK, 1); + + return page; } /* Recover page to RW mode before releasing it */ @@ -429,12 +448,11 @@ void free_insn_page(void *page) static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; int len; - set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); - /* Copy an instruction with recovering if other optprobe modifies it.*/ - len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; @@ -442,15 +460,16 @@ static int arch_copy_kprobe(struct kprobe *p) * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. */ - prepare_boost(p, &insn); - - set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); + len = prepare_boost(buf, p, &insn); /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + p->ainsn.if_modifier = is_IF_modifier(buf); /* Also, displacement change doesn't affect the first byte */ - p->opcode = p->ainsn.insn[0]; + p->opcode = buf[0]; + + /* OK, write back the instruction(s) into ROX insn buffer */ + text_poke(p->ainsn.insn, buf, len); return 0; } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 041f7b6dfa0f..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -26,7 +26,7 @@ #include "common.h" static nokprobe_inline -int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, +void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* @@ -41,33 +41,31 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, __this_cpu_write(current_kprobe, NULL); if (orig_ip) regs->ip = orig_ip; - return 1; } int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb, 0); - else - return 0; + if (kprobe_ftrace(p)) { + __skip_singlestep(p, regs, kcb, 0); + preempt_enable_no_resched(); + return 1; + } + return 0; } NOKPROBE_SYMBOL(skip_singlestep); -/* Ftrace callback handler for kprobes */ +/* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); + /* Preempt is disabled by ftrace */ p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - goto end; + return; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -77,17 +75,19 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); + /* To emulate trap based kprobes, preempt_disable here */ + preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) + if (!p->pre_handler || !p->pre_handler(p, regs)) { __skip_singlestep(p, regs, kcb, orig_ip); + preempt_enable_no_resched(); + } /* * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe. + * resets current kprobe, and keep preempt count +1. */ } -end: - local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3668f28cf5fc..203d398802a3 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -143,11 +143,11 @@ void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ - ((long)&optprobe_template_val - (long)&optprobe_template_entry) + ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ - ((long)&optprobe_template_call - (long)&optprobe_template_entry) + ((long)optprobe_template_call - (long)optprobe_template_entry) #define TMPL_END_IDX \ - ((long)&optprobe_template_end - (long)&optprobe_template_entry) + ((long)optprobe_template_end - (long)optprobe_template_entry) #define INT3_SIZE sizeof(kprobe_opcode_t) @@ -155,17 +155,15 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -181,17 +179,17 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(optimized_callback); -static int copy_optimized_instructions(u8 *dest, u8 *src) +static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) { struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len, &insn); + ret = __copy_instruction(dest + len, src + len, real, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; @@ -364,57 +362,66 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *__unused) { - u8 *buf; - int ret; + u8 *buf = NULL, *slot; + int ret, len; long rel; if (!can_optimize((unsigned long)op->kp.addr)) return -EILSEQ; - op->optinsn.insn = get_optinsn_slot(); - if (!op->optinsn.insn) + buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); + if (!buf) return -ENOMEM; + op->optinsn.insn = slot = get_optinsn_slot(); + if (!slot) { + ret = -ENOMEM; + goto out; + } + /* * Verify if the address gap is in 2GB range, because this uses * a relative jump. */ - rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; if (abs(rel) > 0x7fffffff) { - __arch_remove_optimized_kprobe(op, 0); - return -ERANGE; + ret = -ERANGE; + goto err; } - buf = (u8 *)op->optinsn.insn; - set_memory_rw((unsigned long)buf & PAGE_MASK, 1); + /* Copy arch-dep-instance from template */ + memcpy(buf, optprobe_template_entry, TMPL_END_IDX); /* Copy instructions into the out-of-line buffer */ - ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); - if (ret < 0) { - __arch_remove_optimized_kprobe(op, 0); - return ret; - } + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, + slot + TMPL_END_IDX); + if (ret < 0) + goto err; op->optinsn.size = ret; - - /* Copy arch-dep-instance from template */ - memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + len = TMPL_END_IDX + op->optinsn.size; /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); /* Set probe function call */ - synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + synthesize_relcall(buf + TMPL_CALL_IDX, + slot + TMPL_CALL_IDX, optimized_callback); /* Set returning jmp instruction at the tail of out-of-line buffer */ - synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); + len += RELATIVEJUMP_SIZE; - set_memory_ro((unsigned long)buf & PAGE_MASK, 1); + /* We have to use text_poke for instuction buffer because it is RO */ + text_poke(slot, buf, len); + ret = 0; +out: + kfree(buf); + return ret; - flush_icache_range((unsigned long) buf, - (unsigned long) buf + TMPL_END_IDX + - op->optinsn.size + RELATIVEJUMP_SIZE); - return 0; +err: + __arch_remove_optimized_kprobe(op, 0); + goto out; } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a94de09edbed..b40ffbf156c1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; /* @@ -312,7 +312,7 @@ static void kvm_register_steal_time(void) cpu, (unsigned long long) slow_virt_to_phys(st)); } -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { @@ -426,9 +426,42 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged will have their per-cpu variable already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!sev_active()) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); @@ -465,7 +498,7 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } -void __init kvm_guest_init(void) +static void __init kvm_guest_init(void) { int i; @@ -496,6 +529,7 @@ void __init kvm_guest_init(void) kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else + sev_map_percpu_data(); kvm_guest_cpu_init(); #endif @@ -548,6 +582,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b609e28ce3f..8b26c9e01cc4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> @@ -45,13 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock wall_clock; - -struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return hv_clock; -} -EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); +static struct pvclock_wall_clock *wall_clock; /* * The wallclock is the time of day when we booted. Since then, some time may @@ -64,15 +59,15 @@ static void kvm_get_wallclock(struct timespec *now) int low, high; int cpu; - low = (int)__pa_symbol(&wall_clock); - high = ((u64)__pa_symbol(&wall_clock) >> 32); + low = (int)slow_virt_to_phys(wall_clock); + high = ((u64)slow_virt_to_phys(wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); + pvclock_read_wallclock(wall_clock, vcpu_time, now); put_cpu(); } @@ -249,11 +244,39 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, + phys_addr_t align) +{ + phys_addr_t mem; + + mem = memblock_alloc(size, align); + if (!mem) + return 0; + + if (sev_active()) { + if (early_set_memory_decrypted((unsigned long)__va(mem), size)) + goto e_free; + } + + return mem; +e_free: + memblock_free(mem, size); + return 0; +} + +static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +{ + if (sev_active()) + early_set_memory_encrypted((unsigned long)__va(addr), size); + + memblock_free(addr, size); +} + void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem; - int size, cpu; + unsigned long mem, mem_wall_clock; + int size, cpu, wall_clock_size; u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -267,21 +290,35 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", - msr_kvm_system_time, msr_kvm_wall_clock); + wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); + mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); + if (!mem_wall_clock) + return; - mem = memblock_alloc(size, PAGE_SIZE); - if (!mem) + wall_clock = __va(mem_wall_clock); + memset(wall_clock, 0, wall_clock_size); + + mem = kvm_memblock_alloc(size, PAGE_SIZE); + if (!mem) { + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; + } + hv_clock = __va(mem); memset(hv_clock, 0, size); if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - memblock_free(mem, size); + kvm_memblock_free(mem, size); + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; } + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); @@ -334,6 +371,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } + pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 410c5dadcee3..3a4b12809ab5 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -431,6 +431,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } static unsigned long mpf_base; +static bool mpf_found; static unsigned long __init get_mpc_size(unsigned long physptr) { @@ -504,7 +505,7 @@ void __init default_get_smp_config(unsigned int early) if (!smp_found_config) return; - if (!mpf_base) + if (!mpf_found) return; if (acpi_lapic && early) @@ -593,6 +594,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) smp_found_config = 1; #endif mpf_base = base; + mpf_found = true; pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", base, base + sizeof(*mpf) - 1, mpf); @@ -858,7 +860,7 @@ static int __init update_mp_table(void) if (!enable_update_mptable) return 0; - if (!mpf_base) + if (!mpf_found) return 0; mpf = early_memremap(mpf_base, sizeof(*mpf)); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35aafc95e4b8..18bc9b51ac9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w) { struct nmiaction *a = container_of(w, struct nmiaction, irq_work); int remainder_ns, decimal_msecs; - u64 whole_msecs = ACCESS_ONCE(a->max_duration); + u64 whole_msecs = READ_ONCE(a->max_duration); remainder_ns = do_div(whole_msecs, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 19a3e8f961c7..041096bdef86 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -115,8 +115,18 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, return 5; } -/* Neat trick to map patch type back to the call within the - * corresponding structure. */ +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void __init native_pv_lock_init(void) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_disable(&virt_spin_lock_key); +} + +/* + * Neat trick to map patch type back to the call within the + * corresponding structure. + */ static void *get_call_destination(u8 type) { struct paravirt_patch_template tmpl = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5286a4a92cf7..35c461f21815 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -898,10 +898,9 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) PHB_ROOT_COMPLEX_STATUS); } -static void calgary_watchdog(unsigned long data) +static void calgary_watchdog(struct timer_list *t) { - struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = pci_iommu(dev->bus); + struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -1016,8 +1015,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - setup_timer(&tbl->watchdog_timer, &calgary_watchdog, - (unsigned long)dev); + timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); mod_timer(&tbl->watchdog_timer, jiffies); } diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3fe690067802..6b07faaa1579 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -7,7 +7,7 @@ #include <linux/init.h> #include <linux/ioport.h> -static int found(u64 start, u64 end, void *data) +static int found(struct resource *res, void *data) { return 1; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3cb2486c47e4..cb368c2a22ab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -306,7 +306,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } if ((tifp ^ tifn) & _TIF_NOTSC) - cr4_toggle_bits(X86_CR4_TSD); + cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); @@ -380,19 +380,24 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + /* + * Use wbinvd on processors that support SME. This provides support + * for performing a successful kexec when going from SME inactive + * to SME active (or vice-versa). The cache must be cleared so that + * if there are entries with the same physical address, both with and + * without the encryption bit, they don't race each other when flushed + * and potentially end up with the wrong entry being committed to + * memory. + */ + if (boot_cpu_has(X86_FEATURE_SME)) + native_wbinvd(); for (;;) { /* - * Use wbinvd followed by hlt to stop the processor. This - * provides support for kexec on a processor that supports - * SME. With kexec, going from SME inactive to SME active - * requires clearing cache entries so that addresses without - * the encryption bit set don't corrupt the same physical - * address that has the encryption bit set when caches are - * flushed. To achieve this a wbinvd is performed followed by - * a hlt. Even if the processor is not in the kexec/SME - * scenario this only adds a wbinvd to a halting processor. + * Use native_halt() so that memory contents don't change + * (stack usage and variables) after possibly issuing the + * native_wbinvd() above. */ - asm volatile("wbinvd; hlt" : : : "memory"); + native_halt(); } } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5c3f6d6a5078..761f6af6efa5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -25,8 +25,10 @@ #include <asm/fixmap.h> #include <asm/pvclock.h> +#include <asm/vgtod.h> static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; void pvclock_set_flags(u8 flags) { @@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..68d7ab81c62f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -136,18 +136,6 @@ RESERVE_BRK(dmi_alloc, 65536); static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; -#ifdef CONFIG_X86_64 -int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -int default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#endif - struct boot_params boot_params; /* @@ -376,14 +364,6 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - /* - * If SME is active, this memory will be marked encrypted by the - * kernel when it is accessed (including relocation). However, the - * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. - */ - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); - initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); @@ -822,26 +802,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static void __init simple_udelay_calibration(void) -{ - unsigned int tsc_khz, cpu_khz; - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -936,9 +896,6 @@ void __init setup_arch(char **cmdline_p) set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } - - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -992,6 +949,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -1045,12 +1004,10 @@ void __init setup_arch(char **cmdline_p) /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine, for the BP. + * needs to be done after dmi_scan_machine(), for the boot CPU. */ init_hypervisor_platform(); - simple_udelay_calibration(); - x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1135,9 +1092,6 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); - if (!early_xdbc_setup_hardware()) - early_xdbc_register_console(); - reserve_bios_regions(); if (efi_enabled(EFI_MEMMAP)) { @@ -1243,6 +1197,10 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif + tsc_early_delay_calibrate(); + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); + x86_init.paging.pagetable_init(); kasan_init(); @@ -1294,7 +1252,7 @@ void __init setup_arch(char **cmdline_p) io_apic_init_mappings(); - kvm_guest_init(); + x86_init.hyper.guest_late_init(); e820__reserve_resources(); e820__register_nosave_regions(max_low_pfn); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c3402fc30865..ed556d50d7ed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,6 +77,7 @@ #include <asm/i8259.h> #include <asm/realmode.h> #include <asm/misc.h> +#include <asm/qspinlock.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -100,15 +101,12 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ -static int *physical_to_logical_pkg __read_mostly; -static unsigned long *physical_package_map __read_mostly;; -static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1; /* Flag to indicate if a complete sched domain rebuild is required */ bool x86_topology_update; @@ -230,7 +228,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - + load_current_idt(); cpu_init(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); @@ -241,19 +239,19 @@ static void notrace start_secondary(void *unused) /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* - * Check TSC synchronization with the BP: + * Check TSC synchronization with the boot CPU: */ check_tsc_sync_target(); /* - * Lock vector_lock and initialize the vectors on this cpu - * before setting the cpu online. We must set it online with - * vector_lock held to prevent a concurrent setup/teardown - * from seeing a half valid vector space. + * Lock vector_lock, set CPU online and bring the vector + * allocator online. Online must be set with vector_lock held + * to prevent a concurrent irq setup/teardown from seeing a + * half valid vector space. */ lock_vector_lock(); - setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); + lapic_online(); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); @@ -271,108 +269,48 @@ static void notrace start_secondary(void *unused) } /** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->phys_proc_id == phys_pkg) + return c->logical_proc_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +/** * topology_update_package_map - Update the physical to logical package map * @pkg: The physical package id as retrieved via CPUID * @cpu: The cpu for which this is updated */ int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new; + int new; - /* Called from early boot ? */ - if (!physical_package_map) - return 0; - - if (pkg >= max_physical_pkg_id) - return -EINVAL; - - /* Set the logical package id */ - if (test_and_set_bit(pkg, physical_package_map)) + /* Already available somewhere? */ + new = topology_phys_to_logical_pkg(pkg); + if (new >= 0) goto found; - if (logical_packages >= __max_logical_packages) { - pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", - logical_packages, cpu, __max_logical_packages); - return -ENOSPC; - } - new = logical_packages++; if (new != pkg) { pr_info("CPU %u Converting physical %u to logical package %u\n", cpu, pkg, new); } - physical_to_logical_pkg[pkg] = new; - found: - cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + cpu_data(cpu).logical_proc_id = new; return 0; } -/** - * topology_phys_to_logical_pkg - Map a physical package id to a logical - * - * Returns logical package id or -1 if not found - */ -int topology_phys_to_logical_pkg(unsigned int phys_pkg) -{ - if (phys_pkg >= max_physical_pkg_id) - return -1; - return physical_to_logical_pkg[phys_pkg]; -} -EXPORT_SYMBOL(topology_phys_to_logical_pkg); - -static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) -{ - unsigned int ncpus; - size_t size; - - /* - * Today neither Intel nor AMD support heterogenous systems. That - * might change in the future.... - * - * While ideally we'd want '* smp_num_siblings' in the below @ncpus - * computation, this won't actually work since some Intel BIOSes - * report inconsistent HT data when they disable HT. - * - * In particular, they reduce the APIC-IDs to only include the cores, - * but leave the CPUID topology to say there are (2) siblings. - * This means we don't know how many threads there will be until - * after the APIC enumeration. - * - * By not including this we'll sometimes over-estimate the number of - * logical packages by the amount of !present siblings, but this is - * still better than MAX_LOCAL_APIC. - * - * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited - * on the command line leading to a similar issue as the HT disable - * problem because the hyperthreads are usually enumerated after the - * primary cores. - */ - ncpus = boot_cpu_data.x86_max_cores; - if (!ncpus) { - pr_warn("x86_max_cores == zero !?!?"); - ncpus = 1; - } - - __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); - logical_packages = 0; - - /* - * Possibly larger than what we need as the number of apic ids per - * package can be smaller than the actual used apic ids. - */ - max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); - size = max_physical_pkg_id * sizeof(unsigned int); - physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); - memset(physical_to_logical_pkg, 0xff, size); - size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); - physical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); - - topology_update_package_map(c->phys_proc_id, cpu); -} - void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ @@ -380,7 +318,8 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(c, id); + topology_update_package_map(c->phys_proc_id, id); + c->initialized = true; } /* @@ -391,13 +330,16 @@ void smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - *c = boot_cpu_data; + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; c->cpu_index = id; /* * During boot time, CPU0 has this setup already. Save the info when * bringing up AP or offlined CPU0. */ identify_secondary_cpu(c); + c->initialized = true; } static bool @@ -1081,7 +1023,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) unsigned long flags; int err, ret = 0; - WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); @@ -1177,17 +1119,10 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_core_cpumask(0)); } -enum { - SMP_OK, - SMP_NO_CONFIG, - SMP_NO_APIC, - SMP_FORCE_UP, -}; - /* * Various sanity checks. */ -static int __init smp_sanity_check(unsigned max_cpus) +static void __init smp_sanity_check(void) { preempt_disable(); @@ -1225,16 +1160,6 @@ static int __init smp_sanity_check(unsigned max_cpus) } /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config && !acpi_lapic) { - preempt_enable(); - pr_notice("SMP motherboard not detected\n"); - return SMP_NO_CONFIG; - } - - /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. */ @@ -1244,29 +1169,6 @@ static int __init smp_sanity_check(unsigned max_cpus) physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(boot_cpu_apic_version) && - !boot_cpu_has(X86_FEATURE_APIC)) { - if (!disable_apic) { - pr_err("BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); - } - return SMP_NO_APIC; - } - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - pr_info("SMP mode deactivated\n"); - return SMP_FORCE_UP; - } - - return SMP_OK; } static void __init smp_cpu_index_default(void) @@ -1281,9 +1183,18 @@ static void __init smp_cpu_index_default(void) } } +static void __init smp_get_logical_apicid(void) +{ + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + /* - * Prepare for SMP bootup. The MP table or ACPI has been read - * earlier. Just do some sanity checking here and enable APIC mode. + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { @@ -1315,35 +1226,33 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); - switch (smp_sanity_check(max_cpus)) { - case SMP_NO_CONFIG: - disable_smp(); - if (APIC_init_uniprocessor()) - pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); - return; - case SMP_NO_APIC: + smp_sanity_check(); + + switch (apic_intr_mode) { + case APIC_PIC: + case APIC_VIRTUAL_WIRE_NO_CONFIG: disable_smp(); return; - case SMP_FORCE_UP: + case APIC_SYMMETRIC_IO_NO_ROUTING: disable_smp(); - apic_bsp_setup(false); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); return; - case SMP_OK: + case APIC_VIRTUAL_WIRE: + case APIC_SYMMETRIC_IO: break; } - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? */ - } + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); - default_setup_apic_routing(); - cpu0_logical_apicid = apic_bsp_setup(false); + smp_get_logical_apicid(); pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); + native_pv_lock_init(); + uv_system_init(); set_mtrr_aps_delayed_init(); @@ -1375,14 +1284,22 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { + int ncpus; + pr_debug("Boot done\n"); + /* + * Today neither Intel nor AMD support heterogenous systems so + * extrapolate the boot cpu's data to all packages. + */ + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); + __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + pr_info("Max logical packages: %u\n", __max_logical_packages); if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); nmi_selftest(); impress_friends(); - setup_ioapic_dest(); mtrr_aps_init(); } @@ -1541,13 +1458,14 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + lapic_offline(); } int native_cpu_disable(void) { int ret; - ret = check_irq_vectors_for_cpu_disable(); + ret = lapic_can_unplug_cpu(); if (ret) return ret; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 60244bfaf88f..093f2ea5dd56 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -30,7 +30,7 @@ static int save_stack_address(struct stack_trace *trace, unsigned long addr, return 0; } -static void __save_stack_trace(struct stack_trace *trace, +static void noinline __save_stack_trace(struct stack_trace *trace, struct task_struct *task, struct pt_regs *regs, bool nosched) { @@ -56,6 +56,7 @@ static void __save_stack_trace(struct stack_trace *trace, */ void save_stack_trace(struct stack_trace *trace) { + trace->skip++; __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -70,6 +71,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (!try_get_task_stack(tsk)) return; + if (tsk == current) + trace->skip++; __save_stack_trace(trace, tsk, NULL, true); put_task_stack(tsk); @@ -88,8 +91,9 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); } \ }) -static int __save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +static int __always_inline +__save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -160,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk, { int ret; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ if (!try_get_task_stack(tsk)) - return -EINVAL; + return 0; ret = __save_stack_trace_reliable(trace, tsk); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a63fe77b3217..676774b9bb8d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -188,6 +188,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (len > TASK_SIZE) return -ENOMEM; + /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) return addr; @@ -197,12 +198,15 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { - addr = PAGE_ALIGN(addr); + addr &= PAGE_MASK; + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) + if (!vma || addr + len <= vm_start_gap(vma)) return addr; } +get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 879af864d99a..749d189f8cd4 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -85,6 +85,11 @@ void __init hpet_time_init(void) static __init void x86_late_time_init(void) { x86_init.timers.timer_init(); + /* + * After PIT/HPET timers init, select and setup + * the final interrupt mode for delivering IRQs. + */ + x86_init.irqs.intr_mode_init(); tsc_init(); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7c16fe0b60c2..446c9ef8cfc3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,7 +42,6 @@ #include <linux/edac.h> #endif -#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -61,6 +60,7 @@ #include <asm/trace/mpx.h> #include <asm/mpx.h> #include <asm/vm86.h> +#include <asm/umip.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -72,7 +72,7 @@ #include <asm/proto.h> #endif -DECLARE_BITMAP(used_vectors, NR_VECTORS); +DECLARE_BITMAP(system_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { @@ -537,6 +537,11 @@ do_general_protection(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); + if (static_cpu_has(X86_FEATURE_UMIP)) { + if (user_mode(regs) && fixup_umip_exception(regs)) + return; + } + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); @@ -764,10 +769,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions! */ - if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - goto exit; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ad2b925a808e..e169e85db434 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -112,7 +112,7 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_offset = 0; } -static void cyc2ns_init(int cpu) +static void __init cyc2ns_init(int cpu) { struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); @@ -602,7 +602,6 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; @@ -612,6 +611,8 @@ unsigned long native_calibrate_tsc(void) } } + if (crystal_khz == 0) + return 0; /* * TSC frequency determined by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This @@ -812,13 +813,13 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } -int recalibrate_cpu_khz(void) +void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) - return -ENODEV; + return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); @@ -828,10 +829,6 @@ int recalibrate_cpu_khz(void) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); - - return 0; -#else - return -ENODEV; #endif } @@ -959,17 +956,21 @@ core_initcall(cpufreq_register_tsc_scaling); /* * If ART is present detect the numerator:denominator to convert to TSC */ -static void detect_art(void) +static void __init detect_art(void) { unsigned int unused[2]; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ + /* + * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, + * and the TSC counter resets must not occur asynchronously. + */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || + tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, @@ -1263,6 +1264,25 @@ static int __init init_tsc_clocksource(void) */ device_initcall(init_tsc_clocksource); +void __init tsc_early_delay_calibrate(void) +{ + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? : cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + void __init tsc_init(void) { u64 lpj, cyc; @@ -1296,6 +1316,12 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index e76a9881306b..ec534f978867 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -31,6 +31,20 @@ struct tsc_adjust { static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +/* + * TSC's on different sockets may be reset asynchronously. + * This may cause the TSC ADJUST value on socket 0 to be NOT 0. + */ +bool __read_mostly tsc_async_resets; + +void mark_tsc_async_resets(char *reason) +{ + if (tsc_async_resets) + return; + tsc_async_resets = true; + pr_info("tsc: Marking TSC async resets true due to %s\n", reason); +} + void tsc_verify_tsc_adjust(bool resume) { struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); @@ -39,6 +53,10 @@ void tsc_verify_tsc_adjust(bool resume) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return; + /* Rate limit the MSR check */ if (!resume && time_before(jiffies, adj->nextcheck)) return; @@ -72,12 +90,22 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. + * + * Also don't force the ADJUST value to zero if that is a valid value + * for socket 0 as determined by the system arch. This is required + * when multiple sockets are reset asynchronously with each other + * and socket 0 may not have an TSC ADJUST value of 0. */ if (bootcpu && bootval != 0) { - pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, - bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); - bootval = 0; + if (likely(!tsc_async_resets)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", + cpu, bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } else { + pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", + cpu, bootval); + } } cur->adjusted = bootval; } @@ -91,6 +119,10 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return false; + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; @@ -119,6 +151,13 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) cur->warned = false; /* + * If a non-zero TSC value for socket 0 may be valid then the default + * adjusted value cannot assumed to be zero either. + */ + if (tsc_async_resets) + cur->adjusted = bootval; + + /* * Check whether this CPU is the first in a package to come up. In * this case do not check the boot value against another package * because the new package might have been physically hotplugged, @@ -139,10 +178,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * Compare the boot value and complain if it differs in the * package. */ - if (bootval != ref->bootval) { - pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->bootval, cpu, bootval); - } + if (bootval != ref->bootval) + printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n"); + /* * The TSC_ADJUST values in a package must be the same. If the boot * value on this newly upcoming CPU differs from the adjustment @@ -150,8 +188,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * adjusted value. */ if (bootval != ref->adjusted) { - pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->adjusted, cpu, bootval); cur->adjusted = ref->adjusted; wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c new file mode 100644 index 000000000000..f44ce0fb3583 --- /dev/null +++ b/arch/x86/kernel/umip.c @@ -0,0 +1,428 @@ +/* + * umip.c Emulation for instruction protected by the Intel User-Mode + * Instruction Prevention feature + * + * Copyright (c) 2017, Intel Corporation. + * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> + */ + +#include <linux/uaccess.h> +#include <asm/umip.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/ratelimit.h> + +#undef pr_fmt +#define pr_fmt(fmt) "umip: " fmt + +/** DOC: Emulation for User-Mode Instruction Prevention (UMIP) + * + * The feature User-Mode Instruction Prevention present in recent Intel + * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * from being executed with CPL > 0. Otherwise, a general protection fault is + * issued. + * + * Rather than relaying to the user space the general protection fault caused by + * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be + * trapped and emulate the result of such instructions to provide dummy values. + * This allows to both conserve the current kernel behavior and not reveal the + * system resources that UMIP intends to protect (i.e., the locations of the + * global descriptor and interrupt descriptor tables, the segment selectors of + * the local descriptor table, the value of the task state register and the + * contents of the CR0 register). + * + * This emulation is needed because certain applications (e.g., WineHQ and + * DOSEMU2) rely on this subset of instructions to function. + * + * The instructions protected by UMIP can be split in two groups. Those which + * return a kernel memory address (sgdt and sidt) and those which return a + * value (sldt, str and smsw). + * + * For the instructions that return a kernel memory address, applications + * such as WineHQ rely on the result being located in the kernel memory space, + * not the actual location of the table. The result is emulated as a hard-coded + * value that, lies close to the top of the kernel memory. The limit for the GDT + * and the IDT are set to zero. + * + * Given that sldt and str are not commonly used in programs that run on WineHQ + * or DOSEMU2, they are not emulated. + * + * The instruction smsw is emulated to return the value that the register CR0 + * has at boot time as set in the head_32. + * + * Also, emulation is provided only for 32-bit processes; 64-bit processes + * that attempt to use the instructions that UMIP protects will receive the + * SIGSEGV signal issued as a consequence of the general protection fault. + * + * Care is taken to appropriately emulate the results when segmentation is + * used. That is, rather than relying on USER_DS and USER_CS, the function + * insn_get_addr_ref() inspects the segment descriptor pointed by the + * registers in pt_regs. This ensures that we correctly obtain the segment + * base address and the address and operand sizes even if the user space + * application uses a local descriptor table. + */ + +#define UMIP_DUMMY_GDT_BASE 0xfffe0000 +#define UMIP_DUMMY_IDT_BASE 0xffff0000 + +/* + * The SGDT and SIDT instructions store the contents of the global descriptor + * table and interrupt table registers, respectively. The destination is a + * memory operand of X+2 bytes. X bytes are used to store the base address of + * the table and 2 bytes are used to store the limit. In 32-bit processes, the + * only processes for which emulation is provided, X has a value of 4. + */ +#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_LIMIT_SIZE 2 + +#define UMIP_INST_SGDT 0 /* 0F 01 /0 */ +#define UMIP_INST_SIDT 1 /* 0F 01 /1 */ +#define UMIP_INST_SMSW 2 /* 0F 01 /4 */ +#define UMIP_INST_SLDT 3 /* 0F 00 /0 */ +#define UMIP_INST_STR 4 /* 0F 00 /1 */ + +const char * const umip_insns[5] = { + [UMIP_INST_SGDT] = "SGDT", + [UMIP_INST_SIDT] = "SIDT", + [UMIP_INST_SMSW] = "SMSW", + [UMIP_INST_SLDT] = "SLDT", + [UMIP_INST_STR] = "STR", +}; + +#define umip_pr_err(regs, fmt, ...) \ + umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) +#define umip_pr_warning(regs, fmt, ...) \ + umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) + +/** + * umip_printk() - Print a rate-limited message + * @regs: Register set with the context in which the warning is printed + * @log_level: Kernel log level to print the message + * @fmt: The text string to print + * + * Print the text contained in @fmt. The print rate is limited to bursts of 5 + * messages every two minutes. The purpose of this customized version of + * printk() is to print messages when user space processes use any of the + * UMIP-protected instructions. Thus, the printed text is prepended with the + * task name and process ID number of the current task as well as the + * instruction and stack pointers in @regs as seen when entering kernel mode. + * + * Returns: + * + * None. + */ +static __printf(3, 4) +void umip_printk(const struct pt_regs *regs, const char *log_level, + const char *fmt, ...) +{ + /* Bursts of 5 messages every two minutes */ + static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5); + struct task_struct *tsk = current; + struct va_format vaf; + va_list args; + + if (!__ratelimit(&ratelimit)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm, + task_pid_nr(tsk), regs->ip, regs->sp, &vaf); + va_end(args); +} + +/** + * identify_insn() - Identify a UMIP-protected instruction + * @insn: Instruction structure with opcode and ModRM byte. + * + * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected + * instruction that can be emulated. + * + * Returns: + * + * On success, a constant identifying a specific UMIP-protected instruction that + * can be emulated. + * + * -EINVAL on error or when not an UMIP-protected instruction that can be + * emulated. + */ +static int identify_insn(struct insn *insn) +{ + /* By getting modrm we also get the opcode. */ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + /* All the instructions of interest start with 0x0f. */ + if (insn->opcode.bytes[0] != 0xf) + return -EINVAL; + + if (insn->opcode.bytes[1] == 0x1) { + switch (X86_MODRM_REG(insn->modrm.value)) { + case 0: + return UMIP_INST_SGDT; + case 1: + return UMIP_INST_SIDT; + case 4: + return UMIP_INST_SMSW; + default: + return -EINVAL; + } + } else if (insn->opcode.bytes[1] == 0x0) { + if (X86_MODRM_REG(insn->modrm.value) == 0) + return UMIP_INST_SLDT; + else if (X86_MODRM_REG(insn->modrm.value) == 1) + return UMIP_INST_STR; + else + return -EINVAL; + } else { + return -EINVAL; + } +} + +/** + * emulate_umip_insn() - Emulate UMIP instructions and return dummy values + * @insn: Instruction structure with operands + * @umip_inst: A constant indicating the instruction to emulate + * @data: Buffer into which the dummy result is stored + * @data_size: Size of the emulated result + * + * Emulate an instruction protected by UMIP and provide a dummy result. The + * result of the emulation is saved in @data. The size of the results depends + * on both the instruction and type of operand (register vs memory address). + * The size of the result is updated in @data_size. Caller is responsible + * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + + * UMIP_GDT_IDT_LIMIT_SIZE bytes. + * + * Returns: + * + * 0 on success, -EINVAL on error while emulating. + */ +static int emulate_umip_insn(struct insn *insn, int umip_inst, + unsigned char *data, int *data_size) +{ + unsigned long dummy_base_addr, dummy_value; + unsigned short dummy_limit = 0; + + if (!data || !data_size || !insn) + return -EINVAL; + /* + * These two instructions return the base address and limit of the + * global and interrupt descriptor table, respectively. According to the + * Intel Software Development manual, the base address can be 24-bit, + * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is + * 16-bit, the returned value of the base address is supposed to be a + * zero-extended 24-byte number. However, it seems that a 32-byte number + * is always returned irrespective of the operand size. + */ + if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + /* SGDT and SIDT do not use registers operands. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + + if (umip_inst == UMIP_INST_SGDT) + dummy_base_addr = UMIP_DUMMY_GDT_BASE; + else + dummy_base_addr = UMIP_DUMMY_IDT_BASE; + + *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + + memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); + + } else if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + + /* + * Even though the CR0 register has 4 bytes, the number + * of bytes to be copied in the result buffer is determined + * by whether the operand is a register or a memory location. + * If operand is a register, return as many bytes as the operand + * size. If operand is memory, return only the two least + * siginificant bytes of CR0. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + *data_size = insn->opnd_bytes; + else + *data_size = 2; + + memcpy(data, &dummy_value, *data_size); + /* STR and SLDT are not emulated */ + } else { + return -EINVAL; + } + + return 0; +} + +/** + * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR + * @addr: Address that caused the signal + * @regs: Register set containing the instruction pointer + * + * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is + * intended to be used to provide a segmentation fault when the result of the + * UMIP emulation could not be copied to the user space memory. + * + * Returns: none + */ +static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) +{ + siginfo_t info; + struct task_struct *tsk = current; + + tsk->thread.cr2 = (unsigned long)addr; + tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; + tsk->thread.trap_nr = X86_TRAP_PF; + + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = addr; + force_sig_info(SIGSEGV, &info, tsk); + + if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) + return; + + umip_pr_err(regs, "segfault in emulation. error%x\n", + X86_PF_USER | X86_PF_WRITE); +} + +/** + * fixup_umip_exception() - Fixup a general protection fault caused by UMIP + * @regs: Registers as saved when entering the #GP handler + * + * The instructions sgdt, sidt, str, smsw, sldt cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). If the offending + * user-space process is not in long mode, this function fixes the exception + * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not + * fixed up. Also long mode user-space processes are not fixed up. + * + * If operands are memory addresses, results are copied to user-space memory as + * indicated by the instruction pointed by eIP using the registers indicated in + * the instruction operands. If operands are registers, results are copied into + * the context that was saved when entering kernel mode. + * + * Returns: + * + * True if emulation was successful; false if not. + */ +bool fixup_umip_exception(struct pt_regs *regs) +{ + int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; + unsigned long seg_base = 0, *reg_addr; + /* 10 bytes is the maximum size of the result of UMIP instructions */ + unsigned char dummy_data[10] = { 0 }; + unsigned char buf[MAX_INSN_SIZE]; + void __user *uaddr; + struct insn insn; + int seg_defs; + + if (!regs) + return false; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + + if (seg_base == -1L) + return false; + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + + /* + * The copy_from_user above could have failed if user code is protected + * by a memory protection key. Give up on emulation in such a case. + * Should we issue a page fault? + */ + if (!nr_copied) + return false; + + insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size it to either 4 or 8 address bytes and does nothing + * for the operand bytes. This OK for most of the cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. + */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(&insn); + if (nr_copied < insn.length) + return false; + + umip_inst = identify_insn(&insn); + if (umip_inst < 0) + return false; + + umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_insns[umip_inst]); + + /* Do not emulate SLDT, STR or user long mode processes. */ + if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs)) + return false; + + umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + return false; + + /* + * If operand is a register, write result to the copy of the register + * value that was pushed to the stack when entering into kernel mode. + * Upon exit, the value we write will be restored to the actual hardware + * register. + */ + if (X86_MODRM_MOD(insn.modrm.value) == 3) { + reg_offset = insn_get_modrm_rm_off(&insn, regs); + + /* + * Negative values are usually errors. In memory addressing, + * the exception is -EDOM. Since we expect a register operand, + * all negative values are errors. + */ + if (reg_offset < 0) + return false; + + reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); + memcpy(reg_addr, dummy_data, dummy_data_size); + } else { + uaddr = insn_get_addr_ref(&insn, regs); + if ((unsigned long)uaddr == -1L) + return false; + + nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); + if (nr_copied > 0) { + /* + * If copy fails, send a signal and tell caller that + * fault was fixed up. + */ + force_sig_info_umip_fault(uaddr, regs); + return true; + } + } + + /* increase IP to let the program keep going */ + regs->ip += insn.length; + return true; +} diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index be86a865087a..1f9188f5357c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -74,8 +74,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the undwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entrie, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -111,7 +153,11 @@ static struct orc_entry *orc_find(unsigned long ip) __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 495c776de4b4..a3755d293a48 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn) int i; for (i = 0; i < insn->prefixes.nbytes; i++) { - switch (insn->prefixes.bytes[i]) { - case 0x26: /* INAT_PFX_ES */ - case 0x2E: /* INAT_PFX_CS */ - case 0x36: /* INAT_PFX_DS */ - case 0x3E: /* INAT_PFX_SS */ - case 0xF0: /* INAT_PFX_LOCK */ + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_ES): + case INAT_MAKE_PREFIX(INAT_PFX_CS): + case INAT_MAKE_PREFIX(INAT_PFX_DS): + case INAT_MAKE_PREFIX(INAT_PFX_SS): + case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index b034b1b14b9c..44685fb2a192 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,9 +26,6 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 -/* Flag below is initialized once during vSMP PCI initialization. */ -static int irq_routing_comply = 1; - #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -105,9 +102,6 @@ static void __init set_vsmp_pv_ops(void) if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); - /* Interrupt routing set to ignore */ - irq_routing_comply = 0; - #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -211,23 +205,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) return hard_smp_processor_id() >> index_msb; } -/* - * In vSMP, all cpus should be capable of handling interrupts, regardless of - * the APIC used. - */ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_setall(retmask); -} - static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - - if (!irq_routing_comply) - apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 5b2d10c1973a..1151ccd72ce9 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -57,6 +57,7 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .trap_init = x86_init_noop, + .intr_mode_init = apic_intr_mode_init }, .oem = { @@ -86,6 +87,7 @@ struct x86_init_ops x86_init __initdata = { .hyper = { .init_platform = x86_init_noop, + .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, }, diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index cdc70a3a6583..c2cea6651279 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -44,7 +44,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, - [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, + [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 453d8c990108..290ecf711aec 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1047,7 +1047,6 @@ static void fetch_register_operand(struct operand *op) static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1069,13 +1068,11 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) #endif default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1097,12 +1094,10 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, #endif default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1114,12 +1109,10 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { - ctxt->ops->get_fpu(ctxt); switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1131,7 +1124,6 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } - ctxt->ops->put_fpu(ctxt); } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1139,9 +1131,7 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fninit"); - ctxt->ops->put_fpu(ctxt); return X86EMUL_CONTINUE; } @@ -1152,9 +1142,7 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fnstcw %0": "+m"(fcw)); - ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fcw; @@ -1168,9 +1156,7 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - ctxt->ops->get_fpu(ctxt); asm volatile("fnstsw %0": "+m"(fsw)); - ctxt->ops->put_fpu(ctxt); ctxt->dst.val = fsw; @@ -2405,9 +2391,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) } static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr4) + u64 cr0, u64 cr3, u64 cr4) { int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; /* * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2426,6 +2424,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, bad = ctxt->ops->set_cr(ctxt, 4, cr4); if (bad) return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + } return X86EMUL_CONTINUE; @@ -2436,11 +2440,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) struct desc_struct desc; struct desc_ptr dt; u16 selector; - u32 val, cr0, cr4; + u32 val, cr0, cr3, cr4; int i; cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2482,14 +2486,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) { struct desc_struct desc; struct desc_ptr dt; - u64 val, cr0, cr4; + u64 val, cr0, cr3, cr4; u32 base3; u16 selector; int i, r; @@ -2506,7 +2510,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr3 = GET_SMSTATE(u64, smbase, 0x7f50); cr4 = GET_SMSTATE(u64, smbase, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); val = GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2534,7 +2538,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr4); + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; @@ -2592,6 +2596,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) ctxt->ops->set_msr(ctxt, MSR_EFER, efer); smbase = ctxt->ops->get_smbase(ctxt); + + /* + * Give pre_leave_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. enter guest mode) before loading state from the SMM + * state-save area. + */ + if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + return X86EMUL_UNHANDLEABLE; + if (emulator_has_longmode(ctxt)) ret = rsm_load_state_64(ctxt, smbase + 0x8000); else @@ -3993,12 +4006,8 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->get_fpu(ctxt); - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); - ctxt->ops->put_fpu(ctxt); - if (rc != X86EMUL_CONTINUE) return rc; @@ -4006,6 +4015,26 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) fxstate_size(ctxt)); } +/* + * FXRSTOR might restore XMM registers not provided by the guest. Fill + * in the host registers (via FXSAVE) instead, so they won't be modified. + * (preemption has to stay disabled until FXRSTOR). + * + * Use noinline to keep the stack for other functions called by callers small. + */ +static noinline int fxregs_fixup(struct fxregs_state *fx_state, + const size_t used_size) +{ + struct fxregs_state fx_tmp; + int rc; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp)); + memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size, + __fxstate_size(16) - used_size); + + return rc; +} + static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; @@ -4016,19 +4045,17 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - ctxt->ops->get_fpu(ctxt); - size = fxstate_size(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + return rc; + if (size < __fxstate_size(16)) { - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) goto out; } - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); - if (rc != X86EMUL_CONTINUE) - goto out; - if (fx_state.mxcsr >> 16) { rc = emulate_gp(ctxt, 0); goto out; @@ -4038,8 +4065,6 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: - ctxt->ops->put_fpu(ctxt); - return rc; } @@ -4992,6 +5017,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; + u16 dummy; + struct desc_struct desc; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; @@ -5010,6 +5037,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) switch (mode) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_VM86: + def_op_bytes = def_ad_bytes = 2; + ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); + if (desc.d) + def_op_bytes = def_ad_bytes = 4; + break; case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -5282,9 +5314,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; - ctxt->ops->get_fpu(ctxt); rc = asm_safe("fwait"); - ctxt->ops->put_fpu(ctxt); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index bdff437acbcb..4e822ad363f3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -209,12 +209,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, old_irr = ioapic->irr; ioapic->irr |= mask; - if (edge) + if (edge) { ioapic->irr_delivered &= ~mask; - if ((edge && old_irr == ioapic->irr) || - (!edge && entry.fields.remote_irr)) { - ret = 0; - goto out; + if (old_irr == ioapic->irr) { + ret = 0; + goto out; + } } ret = ioapic_service(ioapic, irq, line_status); @@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || - (e->fields.trig_mode == IOAPIC_EDGE_TRIG && - kvm_apic_pending_eoi(vcpu, e->fields.vector))) + kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); } @@ -277,6 +276,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { @@ -299,14 +299,28 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) return; e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; + /* Preserve read-only fields */ + old_remote_irr = e->fields.remote_irr; + old_delivery_status = e->fields.delivery_status; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; } else { e->bits &= ~0xffffffffULL; e->bits |= (u32) val; - e->fields.remote_irr = 0; } + e->fields.remote_irr = old_remote_irr; + e->fields.delivery_status = old_delivery_status; + + /* + * Some OSes (Linux, Xen) assume that Remote IRR bit will + * be cleared by IOAPIC hardware when the entry is configured + * as edge-triggered. This behavior is used to simulate an + * explicit EOI on IOAPICs that don't have the EOI register. + */ + if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) + e->fields.remote_irr = 0; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -324,7 +338,9 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) struct kvm_lapic_irq irqe; int ret; - if (entry->fields.mask) + if (entry->fields.mask || + (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG && + entry->fields.remote_irr)) return -1; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36c90d631096..e2c1fb8d35ce 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -266,9 +266,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) +{ + return ((id >> 4) << 16) | (1 << (id & 0xf)); +} + static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + u32 ldr = kvm_apic_calc_x2apic_ldr(id); WARN_ON_ONCE(id != apic->vcpu->vcpu_id); @@ -1301,14 +1306,42 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void limit_periodic_timer_frequency(struct kvm_lapic *apic) +{ + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { + if (apic_lvtt_tscdeadline(apic) != (timer_mode == + APIC_LVT_TIMER_TSCDEADLINE)) { + hrtimer_cancel(&apic->lapic_timer.timer); + kvm_lapic_set_reg(apic, APIC_TMICT, 0); + apic->lapic_timer.period = 0; + apic->lapic_timer.tscdeadline = 0; + } apic->lapic_timer.timer_mode = timer_mode; - hrtimer_cancel(&apic->lapic_timer.timer); + limit_periodic_timer_frequency(apic); } } @@ -1430,6 +1463,30 @@ static void start_sw_period(struct kvm_lapic *apic) HRTIMER_MODE_ABS_PINNED); } +static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) +{ + ktime_t now, remaining; + u64 ns_remaining_old, ns_remaining_new; + + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + limit_periodic_timer_frequency(apic); + + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); + if (ktime_to_ns(remaining) < 0) + remaining = 0; + + ns_remaining_old = ktime_to_ns(remaining); + ns_remaining_new = mul_u64_u32_div(ns_remaining_old, + apic->divide_count, old_divisor); + + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, ns_remaining_new) - + nsec_to_cycles(apic->vcpu, ns_remaining_old); + apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); +} + static bool set_target_expiration(struct kvm_lapic *apic) { ktime_t now; @@ -1439,27 +1496,13 @@ static bool set_target_expiration(struct kvm_lapic *apic) apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; - if (!apic->lapic_timer.period) + if (!apic->lapic_timer.period) { + apic->lapic_timer.tscdeadline = 0; return false; - - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } } + limit_periodic_timer_frequency(apic); + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " @@ -1515,6 +1558,9 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return false; + if (!ktimer->tscdeadline) + return false; + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); if (r < 0) return false; @@ -1738,13 +1784,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) start_apic_timer(apic); break; - case APIC_TDCR: + case APIC_TDCR: { + uint32_t old_divisor = apic->divide_count; + if (val & 4) apic_debug("KVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); + if (apic->divide_count != old_divisor && + apic->lapic_timer.period) { + hrtimer_cancel(&apic->lapic_timer.timer); + update_target_expiration(apic, old_divisor); + restart_apic_timer(apic); + } break; - + } case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { apic_debug("KVM_WRITE:ESR not zero %x\n", val); @@ -2196,6 +2250,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, { if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); + u32 *ldr = (u32 *)(s->regs + APIC_LDR); if (vcpu->kvm->arch.x2apic_format) { if (*id != vcpu->vcpu_id) @@ -2206,6 +2261,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, else *id <<= 24; } + + /* In x2APIC mode, the LDR is fixed and based on the id */ + if (set) + *ldr = kvm_apic_calc_x2apic_ldr(*id); } return 0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a69cf053711..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ module_param(dbg, bool, 0644); /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 +/* + * Return values of handle_mmio_page_fault and mmu.page_fault: + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * + * For handle_mmio_page_fault only: + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE = 1, + RET_PF_INVALID = 2, +}; + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -443,7 +457,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) static u64 __get_spte_lockless(u64 *sptep) { - return ACCESS_ONCE(*sptep); + return READ_ONCE(*sptep); } #else union split_spte { @@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { - return __shadow_walk_next(iterator, *iterator->sptep); + __shadow_walk_next(iterator, *iterator->sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, @@ -2794,13 +2808,13 @@ done: return ret; } -static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - bool emulate = false; + int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - emulate = true; + ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) - emulate = true; + ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, kvm_release_pfn_clean(pfn); - return emulate; + return ret; } static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. - * Return 1 to tell kvm to emulate it. */ if (pfn == KVM_PFN_ERR_RO_FAULT) - return 1; + return RET_PF_EMULATE; if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; + return RET_PF_RETRY; } return -EFAULT; @@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, } if (fast_page_fault(vcpu, v, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; @@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } @@ -3382,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); @@ -3397,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); @@ -3437,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); @@ -3474,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); if (make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); - return 1; + return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL); @@ -3659,54 +3672,38 @@ exit: return reserved; } -/* - * Return values of handle_mmio_page_fault: - * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction - * directly. - * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page - * fault path update the mmio spte. - * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). - */ -enum { - RET_MMIO_PF_EMULATE = 1, - RET_MMIO_PF_INVALID = 2, - RET_MMIO_PF_RETRY = 0, - RET_MMIO_PF_BUG = -1 -}; - static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) - return RET_MMIO_PF_BUG; + return -EINVAL; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) - return RET_MMIO_PF_INVALID; + return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return RET_MMIO_PF_RETRY; + return RET_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); @@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3784,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || - kvm_event_needs_reinjection(vcpu))) + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) return false; if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) @@ -3820,8 +3818,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect) + u64 fault_address, char *insn, int insn_len) { int r = 1; @@ -3829,7 +3826,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, default: trace_kvm_page_fault(fault_address, error_code); - if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); @@ -3876,7 +3873,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3893,13 +3890,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } if (fast_page_fault(vcpu, gpa, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; @@ -3919,7 +3916,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4819,7 +4816,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * If we don't have indirect shadow pages, it means no page is * write-protected, so we can exit simply. */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; remote_flush = local_flush = false; @@ -4918,25 +4915,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, vcpu->arch.gpa_val = cr2; } + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_MMIO_PF_EMULATE) { + if (r == RET_PF_EMULATE) { emulation_type = 0; goto emulate; } - if (r == RET_MMIO_PF_RETRY) - return 1; - if (r < 0) - return r; - /* Must be RET_MMIO_PF_INVALID. */ } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), - false); + if (r == RET_PF_INVALID) { + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); + WARN_ON(r == RET_PF_INVALID); + } + + if (r == RET_PF_RETRY) + return 1; if (r < 0) return r; - if (!r) - return 1; /* * Before emulating the instruction, check if the error code @@ -4993,8 +4990,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); - if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -5464,38 +5460,40 @@ static struct shrinker mmu_shrinker = { static void mmu_destroy_caches(void) { - if (pte_list_desc_cache) - kmem_cache_destroy(pte_list_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); + kmem_cache_destroy(pte_list_desc_cache); + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) { + int ret = -ENOMEM; + kvm_mmu_clear_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) - goto nomem; + goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) - goto nomem; + goto out; if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto nomem; + goto out; - register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker); + if (ret) + goto out; return 0; -nomem: +out: mmu_destroy_caches(); - return -ENOMEM; + return ret; } /* diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index efc857615d8e..5b408c0ad612 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect); + u64 fault_address, char *insn, int insn_len); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index ea67dc876316..01c1371f39f8 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -157,7 +157,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, return false; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); - return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } void kvm_page_track_cleanup(struct kvm *kvm) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f18d1f8d332b..5abae72266b7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, emulate; + int top_level, ret; direct_access = gw->pte_access; @@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } clear_sp_write_flooding_count(it.sptep); - emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return emulate; + return ret; out_gpte_changed: kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } /* @@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!prefault) inject_page_fault(vcpu, &walker.fault); - return 0; + return RET_PF_RETRY; } if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { shadow_page_table_clear_flood(vcpu, addr); - return 1; + return RET_PF_EMULATE; } vcpu->arch.write_fault_to_shadow_pgtable = false; @@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; @@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2744b97345b8..f40d0da1f1d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1035,15 +1035,12 @@ static int avic_ga_log_notifier(u32 ga_tag) } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - if (!vcpu) - return 0; - /* Note: * At this point, the IOMMU should have already set the pending * bit in the vAPIC backing page. So, we just need to schedule * in the vcpu. */ - if (vcpu->mode == OUTSIDE_GUEST_MODE) + if (vcpu) kvm_vcpu_wake_up(vcpu); return 0; @@ -2145,7 +2142,18 @@ static int pf_interception(struct vcpu_svm *svm) return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len, !npt_enabled); + svm->vmcb->control.insn_len); +} + +static int npf_interception(struct vcpu_svm *svm) +{ + u64 fault_address = svm->vmcb->control.exit_info_2; + u64 error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); } static int db_interception(struct vcpu_svm *svm) @@ -2190,6 +2198,8 @@ static int ud_interception(struct vcpu_svm *svm) int er; er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); + if (er == EMULATE_USER_EXIT) + return 0; if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -2917,70 +2927,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) return true; } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb, struct page *page) { - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct page *page; - u64 vmcb_gpa; - - vmcb_gpa = svm->vmcb->save.rax; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return false; - - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - nested_svm_unmap(page); - - return false; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - - /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(hsave, vmcb); - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else @@ -3073,6 +3022,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enable_gif(svm); mark_all_dirty(svm->vmcb); +} + +static bool nested_svm_vmrun(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return false; + + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, + nested_vmcb->control.intercept_cr >> 16, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + + /* Clear internal status */ + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); + hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + + copy_vmcb_control_area(hsave, vmcb); + + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); return true; } @@ -3174,7 +3190,7 @@ static int stgi_interception(struct vcpu_svm *svm) /* * If VGIF is enabled, the STGI intercept is only added to - * detect the opening of the NMI window; remove it now. + * detect the opening of the SMI/NMI window; remove it now. */ if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); @@ -3658,6 +3674,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_IA32_CR_PAT: + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vcpu->arch.pat = data; + svm->vmcb->save.g_pat = data; + mark_dirty(svm->vmcb, VMCB_NPT); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; @@ -4132,7 +4155,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = emulate_on_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -4957,6 +4980,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" #endif + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ + "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" + "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" + "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" + "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" +#ifdef CONFIG_X86_64 + "xor %%r8, %%r8 \n\t" + "xor %%r9, %%r9 \n\t" + "xor %%r10, %%r10 \n\t" + "xor %%r11, %%r11 \n\t" + "xor %%r12, %%r12 \n\t" + "xor %%r13, %%r13 \n\t" + "xor %%r14, %%r14 \n\t" + "xor %%r15, %%r15 \n\t" +#endif "pop %%" _ASM_BP : : [svm]"a"(svm), @@ -5397,6 +5439,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +static int svm_smi_allowed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* Per APM Vol.2 15.22.2 "Response to SMI" */ + if (!gif_set(svm)) + return 0; + + if (is_guest_mode(&svm->vcpu) && + svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { + /* TODO: Might need to set exit_info_1 and exit_info_2 here */ + svm->vmcb->control.exit_code = SVM_EXIT_SMI; + svm->nested.exit_required = true; + return 0; + } + + return 1; +} + +static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (is_guest_mode(vcpu)) { + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + } + return 0; +} + +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *nested_vmcb; + struct page *page; + struct { + u64 guest; + u64 vmcb; + } svm_state_save; + int ret; + + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, + sizeof(svm_state_save)); + if (ret) + return ret; + + if (svm_state_save.guest) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); + if (nested_vmcb) + enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); + else + ret = 1; + vcpu->arch.hflags |= HF_SMM_MASK; + } + return ret; +} + +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!gif_set(svm)) { + if (vgif_enabled(svm)) + set_intercept(svm, INTERCEPT_STGI); + /* STGI will cause a vm exit */ + return 1; + } + return 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5507,6 +5631,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + + .smi_allowed = svm_smi_allowed, + .pre_enter_smm = svm_pre_enter_smm, + .pre_leave_smm = svm_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 924589c53422..a8b96dc4cd83 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); +static bool __read_mostly enable_vnmi = 1; +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); + static bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); @@ -203,6 +206,10 @@ struct loaded_vmcs { bool nmi_known_unmasked; unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; struct list_head loaded_vmcss_on_cpu_link; }; @@ -487,6 +494,14 @@ struct nested_vmx { u64 nested_vmx_cr4_fixed1; u64 nested_vmx_vmcs_enum; u64 nested_vmx_vmfunc_controls; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; }; #define POSTED_INTR_ON 0 @@ -885,8 +900,16 @@ static inline short vmcs_field_to_offset(unsigned long field) { BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || - vmcs_field_to_offset_table[field] == 0) + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + return -ENOENT; + + /* + * FIXME: Mitigation for CVE-2017-5753. To be replaced with a + * generic mechanism. + */ + asm("lfence"); + + if (vmcs_field_to_offset_table[field] == 0) return -ENOENT; return vmcs_field_to_offset_table[field]; @@ -901,16 +924,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static int alloc_identity_pagetable(struct kvm *kvm); static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -1287,6 +1307,11 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } +static inline bool cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -1344,11 +1369,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12->secondary_vm_exec_control & bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & @@ -1599,18 +1619,15 @@ static inline void vpid_sync_context(int vpid) static inline void ept_sync_global(void) { - if (cpu_has_vmx_invept_global()) - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); } static inline void ept_sync_context(u64 eptp) { - if (enable_ept) { - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); - } + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); } static __always_inline void vmcs_check16(unsigned long field) @@ -2832,8 +2849,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_PML; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } - } else - vmx->nested.nested_vmx_ept_caps = 0; + } if (cpu_has_vmx_vmfunc()) { vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2842,8 +2858,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * Advertise EPTP switching unconditionally * since we emulate it */ - vmx->nested.nested_vmx_vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (enable_ept) + vmx->nested.nested_vmx_vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; } /* @@ -2857,8 +2874,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - } else - vmx->nested.nested_vmx_vpid_caps = 0; + } if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -3545,7 +3561,8 @@ static int hardware_enable(void) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (enable_ept) + ept_sync_global(); return 0; } @@ -3658,8 +3675,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED | - SECONDARY_EXEC_RDRAND | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_VMFUNC; @@ -3680,14 +3697,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, + &vmx_capability.ept, &vmx_capability.vpid); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, - vmx_capability.ept, vmx_capability.vpid); + } else if (vmx_capability.ept) { + vmx_capability.ept = 0; + pr_warn_once("EPT CAP should not exist if not support " + "1-setting enable EPT VM-execution control\n"); + } + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && + vmx_capability.vpid) { + vmx_capability.vpid = 0; + pr_warn_once("VPID CAP should not exist if not support " + "1-setting enable VPID VM-execution control\n"); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -3700,9 +3728,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4782,18 +4810,18 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm_pfn_t identity_map_pfn; u32 tmp; - if (!enable_ept) - return 0; - /* Protect kvm->arch.ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); if (likely(kvm->arch.ept_identity_pagetable_done)) goto out2; + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; - r = alloc_identity_pagetable(kvm); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); if (r < 0) goto out2; @@ -4865,20 +4893,6 @@ out: return r; } -static int alloc_identity_pagetable(struct kvm *kvm) -{ - /* Called with kvm->slots_lock held. */ - - int r = 0; - - BUG_ON(kvm->arch.ept_identity_pagetable_done); - - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm->arch.ept_identity_map_addr, PAGE_SIZE); - - return r; -} - static int allocate_vpid(void) { int vpid; @@ -5234,6 +5248,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + + if (!enable_vnmi) + pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; + /* Enable the preemption timer dynamically */ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; @@ -5283,13 +5301,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static bool vmx_rdrand_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; } static bool vmx_rdseed_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; } static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) @@ -5383,30 +5401,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_rdrand_supported()) { bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND; + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; if (nested) { if (rdrand_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND; + ~SECONDARY_EXEC_RDRAND_EXITING; } } if (vmx_rdseed_supported()) { bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED; + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; if (nested) { if (rdseed_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED; + ~SECONDARY_EXEC_RDSEED_EXITING; } } @@ -5427,7 +5445,7 @@ static void ept_set_mmio_spte_mask(void) /* * Sets up the vmcs for emulated real mode. */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 unsigned long a; @@ -5540,8 +5558,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - - return 0; } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5593,7 +5609,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } - vmcs_writel(GUEST_RFLAGS, 0x02); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); vmcs_writel(GUEST_GDTR_BASE, 0); @@ -5605,6 +5621,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + if (kvm_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, 0); setup_msrs(vmx); @@ -5668,7 +5686,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (!enable_vnmi || + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5708,6 +5727,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!enable_vnmi) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->loaded_vmcs->soft_vnmi_blocked = 1; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; @@ -5726,6 +5758,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); bool masked; + if (!enable_vnmi) + return vmx->loaded_vmcs->soft_vnmi_blocked; if (vmx->loaded_vmcs->nmi_known_unmasked) return false; masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5737,13 +5771,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + if (!enable_vnmi) { + if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { + vmx->loaded_vmcs->soft_vnmi_blocked = masked; + vmx->loaded_vmcs->vnmi_blocked_time = 0; + } + } else { + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5751,6 +5792,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; + if (!enable_vnmi && + to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) + return 0; + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -5879,11 +5924,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return 1; /* already handled by vmx_vcpu_run() */ if (is_invalid_opcode(intr_info)) { - if (is_guest_mode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); + if (er == EMULATE_USER_EXIT) + return 0; if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -5913,8 +5956,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) cr2 = vmcs_readl(EXIT_QUALIFICATION); /* EPT won't cause page fault directly */ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, - true); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -6479,6 +6521,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6538,6 +6581,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { + WARN_ON_ONCE(!enable_vnmi); vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_VIRTUAL_NMI_PENDING); ++vcpu->stat.nmi_window_exits; @@ -6565,7 +6609,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); + err = emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; @@ -6713,16 +6757,10 @@ static __init int hardware_setup(void) goto out; } - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); @@ -6748,21 +6786,22 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb()) { + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; - if (!cpu_has_vmx_unrestricted_guest()) + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; + if (!cpu_has_virtual_nmis()) + enable_vnmi = 0; + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not @@ -6777,8 +6816,13 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); - if (!cpu_has_vmx_ple()) + if (!cpu_has_vmx_ple()) { ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } if (!cpu_has_vmx_apicv()) { enable_apicv = 0; @@ -6962,7 +7006,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) } /* Create a new VMCS */ - item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); + item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); if (!item) return NULL; item->vmcs02.vmcs = alloc_vmcs(); @@ -7371,10 +7415,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) */ static void free_nested(struct vcpu_vmx *vmx) { - if (!vmx->nested.vmxon) + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; vmx->nested.vmxon = false; + vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -7979,6 +8024,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8416,9 +8462,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -8823,6 +8869,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) { + if (vmx_interrupt_allowed(vcpu)) { + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->loaded_vmcs->soft_vnmi_blocked = 0; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); @@ -9105,33 +9170,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->loaded_vmcs->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); + if (enable_vnmi) { + if (vmx->loaded_vmcs->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), + vmx->loaded_vmcs->entry_time)); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -9248,6 +9318,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr3, cr4; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!enable_vnmi && + vmx->loaded_vmcs->soft_vnmi_blocked)) + vmx->loaded_vmcs->entry_time = ktime_get(); + /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) @@ -9346,6 +9421,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" + "setbe %c[fail](%0)\n\t" "mov %%" _ASM_AX ", %c[rax](%0) \n\t" "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" __ASM_SIZE(pop) " %c[rcx](%0) \n\t" @@ -9362,12 +9438,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" #endif "mov %%cr2, %%" _ASM_AX " \n\t" "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + "xor %%eax, %%eax \n\t" + "xor %%ebx, %%ebx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" "vmx_return: " _ASM_PTR " 2b \n\t" @@ -9479,7 +9566,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; put_cpu(); } @@ -9560,11 +9646,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - err = vmx_vcpu_setup(vmx); + vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); - if (err) - goto free_vmcs; if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) @@ -9572,9 +9656,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) } if (enable_ept) { - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = - VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = init_rmode_identity_map(kvm); if (err) goto free_vmcs; @@ -9736,8 +9817,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ - cr4_fixed1_update(bit(11), ecx, bit(2)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); #undef cr4_fixed1_update } @@ -10811,6 +10891,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 1; } + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + return 0; } @@ -11035,13 +11120,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; - - if (kvm_event_needs_reinjection(vcpu)) - return -EBUSY; + bool block_nested_events = + vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); if (vcpu->arch.exception.pending && nested_vmx_check_exception(vcpu, &exit_qual)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); vcpu->arch.exception.pending = false; @@ -11050,14 +11134,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && vmx->nested.preemption_timer_expired) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0; } if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR | @@ -11073,7 +11157,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && nested_exit_on_intr(vcpu)) { - if (vmx->nested.nested_run_pending) + if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); return 0; @@ -11260,6 +11344,24 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_clear_interrupt_queue(vcpu); } +static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 entry_failure_code; + + nested_ept_uninit_mmu_context(vcpu); + + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. + */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + + if (!enable_ept) + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +} + /* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified @@ -11273,7 +11375,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; - u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -11300,17 +11401,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); vmx_set_cr4(vcpu, vmcs12->host_cr4); - nested_ept_uninit_mmu_context(vcpu); - - /* - * Only PDPTE load can fail as the value of cr3 was checked on entry and - * couldn't have changed. - */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) - nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - - if (!enable_ept) - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + load_vmcs12_mmu_host_state(vcpu, vmcs12); if (enable_vpid) { /* @@ -11329,6 +11420,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) @@ -11425,8 +11518,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); if (likely(!vmx->fail)) { - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (exit_reason == -1) + sync_vmcs12(vcpu, vmcs12); + else + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) @@ -11490,7 +11586,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (enable_shadow_vmcs) + if (enable_shadow_vmcs && exit_reason != -1) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ @@ -11514,12 +11610,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); + if (exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); load_vmcs12_host_state(vcpu, vmcs12); @@ -11534,6 +11631,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * accordingly. */ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + load_vmcs12_mmu_host_state(vcpu, vmcs12); + /* * The emulated instruction was already skipped in * nested_vmx_run, but the updated RIP was never @@ -11942,6 +12042,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +{ + /* we need a nested vmexit to enter SMM, postpone if run is pending */ + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; + return 1; +} + +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); + if (vmx->nested.smm.guest_mode) + nested_vmx_vmexit(vcpu, -1, 0, 0); + + vmx->nested.smm.vmxon = vmx->nested.vmxon; + vmx->nested.vmxon = false; + return 0; +} + +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int ret; + + if (vmx->nested.smm.vmxon) { + vmx->nested.vmxon = true; + vmx->nested.smm.vmxon = false; + } + + if (vmx->nested.smm.guest_mode) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + ret = enter_vmx_non_root_mode(vcpu, false); + vcpu->arch.hflags |= HF_SMM_MASK; + if (ret) + return ret; + + vmx->nested.smm.guest_mode = false; + } + return 0; +} + +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -12067,6 +12215,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { #endif .setup_mce = vmx_setup_mce, + + .smi_allowed = vmx_smi_allowed, + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03869eb7fcd6..c53298dfbf50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -107,6 +107,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool __read_mostly report_ignored_msrs = true; +module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); + unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -1795,10 +1798,13 @@ u64 get_kvmclock_ns(struct kvm *kvm) /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, - &hv_clock.tsc_shift, - &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + if (__this_cpu_read(cpu_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + } else + ret = ktime_get_boot_ns() + ka->kvmclock_offset; put_cpu(); @@ -1830,6 +1836,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) */ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + if (guest_hv_clock.version & 1) + ++guest_hv_clock.version; /* first time write, random junk */ + vcpu->hv_clock.version = guest_hv_clock.version + 1; kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, @@ -2006,10 +2015,12 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -2034,6 +2045,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; + if (!msr_info->host_initiated && + (offset & 0x3) == 1 && data != 0) + return -1; vcpu->arch.mce_banks[offset] = data; break; } @@ -2283,7 +2297,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr, data); + return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -2317,7 +2331,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", + msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -2354,8 +2370,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr, data); return 1; } else { - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + if (report_ignored_msrs) + vcpu_unimpl(vcpu, + "ignored wrmsr: 0x%x data 0x%llx\n", + msr, data); break; } } @@ -2573,7 +2591,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->index); return 1; } else { - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", + msr_info->index); msr_info->data = 0; } break; @@ -2917,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } @@ -4034,10 +4053,16 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; + mutex_lock(&kvm->lock); + r = -EINVAL; + if (kvm->created_vcpus) + goto set_identity_unlock; r = -EFAULT; if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) - goto out; + goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); +set_identity_unlock: + mutex_unlock(&kvm->lock); break; } case KVM_SET_NR_MMU_PAGES: @@ -4359,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; @@ -4618,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, val); vcpu->mmio_read_completed = 0; return 1; } @@ -4640,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); return X86EMUL_IO_NEEDED; } @@ -5226,17 +5251,6 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt) emul_to_vcpu(ctxt)->arch.halt_request = 1; } -static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_disable(); - kvm_load_guest_fpu(emul_to_vcpu(ctxt)); -} - -static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_enable(); -} - static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) @@ -5275,6 +5289,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); } +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5309,13 +5328,12 @@ static const struct x86_emulate_ops emulate_ops = { .halt = emulator_halt, .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, - .get_fpu = emulator_get_fpu, - .put_fpu = emulator_put_fpu, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, + .pre_leave_smm = emulator_pre_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5413,7 +5431,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; + r = EMULATE_USER_EXIT; } kvm_queue_exception(vcpu, UD_VECTOR); @@ -5705,6 +5723,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) return EMULATE_DONE; + if (ctxt->have_exception && inject_emulated_exception(vcpu)) + return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; return handle_emulation_failure(vcpu); @@ -6426,7 +6446,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } kvm_x86_ops->queue_exception(vcpu); - } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { @@ -6473,9 +6493,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; @@ -6641,13 +6658,20 @@ static void enter_smm(struct kvm_vcpu *vcpu) u32 cr0; trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); - vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else enter_smm_save_state_32(vcpu, buf); + /* + * Give pre_enter_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. leave guest mode) after we've saved the state into + * the SMM state-save area. + */ + kvm_x86_ops->pre_enter_smm(vcpu, buf); + + vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (kvm_x86_ops->get_nmi_mask(vcpu)) @@ -6740,6 +6764,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) kvm_x86_ops->tlb_flush(vcpu); } +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + unsigned long apic_address; + + /* + * The physical address of apic access page is stored in the VMCS. + * Update it when it becomes invalid. + */ + apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (start <= apic_address && apic_address < end) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} + void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; @@ -6876,17 +6914,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; else { - /* Enable NMI/IRQ window open exits if needed. + /* Enable SMI/NMI/IRQ window open exits if needed. * - * SMIs have two cases: 1) they can be nested, and - * then there is nothing to do here because RSM will - * cause a vmexit anyway; 2) or the SMI can be pending - * because inject_pending_event has completed the - * injection of an IRQ or NMI from the previous vmexit, - * and then we request an immediate exit to inject the SMI. + * SMIs have three cases: + * 1) They can be nested, and then there is nothing to + * do here because RSM will cause a vmexit anyway. + * 2) There is an ISA-specific reason why SMI cannot be + * injected, and the moment when this changes can be + * intercepted. + * 3) Or the SMI can be pending because + * inject_pending_event has completed the injection + * of an IRQ or NMI from the previous vmexit, and + * then we request an immediate exit to inject the + * SMI. */ if (vcpu->arch.smi_pending && !is_smm(vcpu)) - req_immediate_exit = true; + if (!kvm_x86_ops->enable_smi_window(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -6908,7 +6952,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -7221,14 +7264,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct fpu *fpu = ¤t->thread.fpu; int r; - sigset_t sigsaved; - fpu__initialize(fpu); + kvm_sigset_activate(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_load_guest_fpu(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { @@ -7270,9 +7310,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = vcpu_run(vcpu); out: + kvm_put_guest_fpu(vcpu); post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); return r; } @@ -7340,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) #endif kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); + kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; @@ -7454,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { + /* + * When EFER.LME and CR0.PG are set, the processor is in + * 64-bit mode (though maybe in a 32-bit code segment). + * CR4.PAE and EFER.LMA must be set. + */ + if (!(sregs->cr4 & X86_CR4_PAE) + || !(sregs->efer & EFER_LMA)) + return -EINVAL; + } else { + /* + * Not in 64-bit mode: EFER.LMA is clear and the code + * segment cannot be 64-bit. + */ + if (sregs->efer & EFER_LMA || sregs->cs.l) + return -EINVAL; + } + + return 0; +} + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -7466,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, (sregs->cr4 & X86_CR4_OSXSAVE)) return -EINVAL; + if (kvm_valid_sregs(vcpu, sregs)) + return -EINVAL; + apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) @@ -7663,32 +7729,25 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +/* Swap (qemu) user FPU context for the guest FPU context. */ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (vcpu->guest_fpu_loaded) - return; - - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. - */ - vcpu->guest_fpu_loaded = 1; - __kernel_fpu_begin(); + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, ~XFEATURE_MASK_PKRU); + preempt_enable(); trace_kvm_fpu(1); } +/* When vcpu_run ends, restore user space FPU context. */ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; + preempt_disable(); copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - __kernel_fpu_end(); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -7798,18 +7857,43 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; + if (kvm_mpx_supported()) { + void *mpx_state_buffer; + + /* + * To avoid have the INIT path from kvm_apic_has_events() that be + * called with loaded FPU and does not let userspace fix the state. + */ + if (init_event) + kvm_put_guest_fpu(vcpu); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDREGS); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDCSR); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + if (init_event) + kvm_load_guest_fpu(vcpu); + } + if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_misc_features_enables = 0; + + vcpu->arch.xcr0 = XFEATURE_MASK_FP; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + vcpu->arch.ia32_xss = 0; + kvm_x86_ops->vcpu_reset(vcpu, init_event); } @@ -7974,16 +8058,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; - struct kvm *kvm; int r; - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -8001,7 +8080,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (r < 0) goto fail_free_pio_data; - if (irqchip_in_kernel(kvm)) { + if (irqchip_in_kernel(vcpu->kvm)) { r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; @@ -8023,10 +8102,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) fx_init(vcpu); - vcpu->arch.ia32_tsc_adjust_msr = 0x0; - vcpu->arch.pv_time_enabled = false; - - vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index d0a3170e6804..69a473919260 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o -lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_RETPOLINE) += retpoline.o OBJECT_FILES_NON_STANDARD_retpoline.o :=y diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c new file mode 100644 index 000000000000..9119d8e41f1f --- /dev/null +++ b/arch/x86/lib/insn-eval.c @@ -0,0 +1,1364 @@ +/* + * Utility functions for x86 operand and address decoding + * + * Copyright (C) Intel Corporation 2017 + */ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/mmu_context.h> +#include <asm/desc_defs.h> +#include <asm/desc.h> +#include <asm/inat.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <asm/ldt.h> +#include <asm/vm86.h> + +#undef pr_fmt +#define pr_fmt(fmt) "insn: " fmt + +enum reg_type { + REG_TYPE_RM = 0, + REG_TYPE_INDEX, + REG_TYPE_BASE, +}; + +/** + * is_string_insn() - Determine if instruction is a string instruction + * @insn: Instruction containing the opcode to inspect + * + * Returns: + * + * true if the instruction, determined by the opcode, is any of the + * string instructions as defined in the Intel Software Development manual. + * False otherwise. + */ +static bool is_string_insn(struct insn *insn) +{ + insn_get_opcode(insn); + + /* All string instructions have a 1-byte opcode. */ + if (insn->opcode.nbytes != 1) + return false; + + switch (insn->opcode.bytes[0]) { + case 0x6c ... 0x6f: /* INS, OUTS */ + case 0xa4 ... 0xa7: /* MOVS, CMPS */ + case 0xaa ... 0xaf: /* STOS, LODS, SCAS */ + return true; + default: + return false; + } +} + +/** + * get_seg_reg_override_idx() - obtain segment register override index + * @insn: Valid instruction with segment override prefixes + * + * Inspect the instruction prefixes in @insn and find segment overrides, if any. + * + * Returns: + * + * A constant identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override + * prefixes were found. + * + * -EINVAL in case of error. + */ +static int get_seg_reg_override_idx(struct insn *insn) +{ + int idx = INAT_SEG_REG_DEFAULT; + int num_overrides = 0, i; + + insn_get_prefixes(insn); + + /* Look for any segment override prefixes. */ + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_CS): + idx = INAT_SEG_REG_CS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_SS): + idx = INAT_SEG_REG_SS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_DS): + idx = INAT_SEG_REG_DS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_ES): + idx = INAT_SEG_REG_ES; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_FS): + idx = INAT_SEG_REG_FS; + num_overrides++; + break; + case INAT_MAKE_PREFIX(INAT_PFX_GS): + idx = INAT_SEG_REG_GS; + num_overrides++; + break; + /* No default action needed. */ + } + } + + /* More than one segment override prefix leads to undefined behavior. */ + if (num_overrides > 1) + return -EINVAL; + + return idx; +} + +/** + * check_seg_overrides() - check if segment override prefixes are allowed + * @insn: Valid instruction with segment override prefixes + * @regoff: Operand offset, in pt_regs, for which the check is performed + * + * For a particular register used in register-indirect addressing, determine if + * segment override prefixes can be used. Specifically, no overrides are allowed + * for rDI if used with a string instruction. + * + * Returns: + * + * True if segment override prefixes can be used with the register indicated + * in @regoff. False if otherwise. + */ +static bool check_seg_overrides(struct insn *insn, int regoff) +{ + if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn)) + return false; + + return true; +} + +/** + * resolve_default_seg() - resolve default segment register index for an operand + * @insn: Instruction with opcode and address size. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @off: Operand offset, in pt_regs, for which resolution is needed + * + * Resolve the default segment register index associated with the instruction + * operand register indicated by @off. Such index is resolved based on defaults + * described in the Intel Software Development Manual. + * + * Returns: + * + * If in protected mode, a constant identifying the segment register to use, + * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE. + * + * -EINVAL in case of error. + */ +static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) +{ + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + /* + * Resolve the default segment register as described in Section 3.7.4 + * of the Intel Software Development Manual Vol. 1: + * + * + DS for all references involving r[ABCD]X, and rSI. + * + If used in a string instruction, ES for rDI. Otherwise, DS. + * + AX, CX and DX are not valid register operands in 16-bit address + * encodings but are valid for 32-bit and 64-bit encodings. + * + -EDOM is reserved to identify for cases in which no register + * is used (i.e., displacement-only addressing). Use DS. + * + SS for rSP or rBP. + * + CS for rIP. + */ + + switch (off) { + case offsetof(struct pt_regs, ax): + case offsetof(struct pt_regs, cx): + case offsetof(struct pt_regs, dx): + /* Need insn to verify address size. */ + if (insn->addr_bytes == 2) + return -EINVAL; + + case -EDOM: + case offsetof(struct pt_regs, bx): + case offsetof(struct pt_regs, si): + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, di): + if (is_string_insn(insn)) + return INAT_SEG_REG_ES; + return INAT_SEG_REG_DS; + + case offsetof(struct pt_regs, bp): + case offsetof(struct pt_regs, sp): + return INAT_SEG_REG_SS; + + case offsetof(struct pt_regs, ip): + return INAT_SEG_REG_CS; + + default: + return -EINVAL; + } +} + +/** + * resolve_seg_reg() - obtain segment register index + * @insn: Instruction with operands + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * + * Determine the segment register associated with the operands and, if + * applicable, prefixes and the instruction pointed by @insn. + * + * The segment register associated to an operand used in register-indirect + * addressing depends on: + * + * a) Whether running in long mode (in such a case segments are ignored, except + * if FS or GS are used). + * + * b) Whether segment override prefixes can be used. Certain instructions and + * registers do not allow override prefixes. + * + * c) Whether segment overrides prefixes are found in the instruction prefixes. + * + * d) If there are not segment override prefixes or they cannot be used, the + * default segment register associated with the operand register is used. + * + * The function checks first if segment override prefixes can be used with the + * operand indicated by @regoff. If allowed, obtain such overridden segment + * register index. Lastly, if not prefixes were found or cannot be used, resolve + * the segment register index to use based on the defaults described in the + * Intel documentation. In long mode, all segment register indexes will be + * ignored, except if overrides were found for FS or GS. All these operations + * are done using helper functions. + * + * The operand register, @regoff, is represented as the offset from the base of + * pt_regs. + * + * As stated, the main use of this function is to determine the segment register + * index based on the instruction, its operands and prefixes. Hence, @insn + * must be valid. However, if @regoff indicates rIP, we don't need to inspect + * @insn at all as in this case CS is used in all cases. This case is checked + * before proceeding further. + * + * Please note that this function does not return the value in the segment + * register (i.e., the segment selector) but our defined index. The segment + * selector needs to be obtained using get_segment_selector() and passing the + * segment register index resolved by this function. + * + * Returns: + * + * An index identifying the segment register to use, among CS, SS, DS, + * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode. + * + * -EINVAL in case of error. + */ +static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) +{ + int idx; + + /* + * In the unlikely event of having to resolve the segment register + * index for rIP, do it first. Segment override prefixes should not + * be used. Hence, it is not necessary to inspect the instruction, + * which may be invalid at this point. + */ + if (regoff == offsetof(struct pt_regs, ip)) { + if (user_64bit_mode(regs)) + return INAT_SEG_REG_IGNORE; + else + return INAT_SEG_REG_CS; + } + + if (!insn) + return -EINVAL; + + if (!check_seg_overrides(insn, regoff)) + return resolve_default_seg(insn, regs, regoff); + + idx = get_seg_reg_override_idx(insn); + if (idx < 0) + return idx; + + if (idx == INAT_SEG_REG_DEFAULT) + return resolve_default_seg(insn, regs, regoff); + + /* + * In long mode, segment override prefixes are ignored, except for + * overrides for FS and GS. + */ + if (user_64bit_mode(regs)) { + if (idx != INAT_SEG_REG_FS && + idx != INAT_SEG_REG_GS) + idx = INAT_SEG_REG_IGNORE; + } + + return idx; +} + +/** + * get_segment_selector() - obtain segment selector + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Segment register index to use + * + * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment + * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or + * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained + * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU + * registers. This done for only for completeness as in CONFIG_X86_64 segment + * registers are ignored. + * + * Returns: + * + * Value of the segment selector, including null when running in + * long mode. + * + * -EINVAL on error. + */ +static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + switch (seg_reg_idx) { + case INAT_SEG_REG_IGNORE: + return 0; + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + savesegment(ds, sel); + return sel; + case INAT_SEG_REG_ES: + savesegment(es, sel); + return sel; + case INAT_SEG_REG_FS: + savesegment(fs, sel); + return sel; + case INAT_SEG_REG_GS: + savesegment(gs, sel); + return sel; + default: + return -EINVAL; + } +#else /* CONFIG_X86_32 */ + struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs; + + if (v8086_mode(regs)) { + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return vm86regs->ds; + case INAT_SEG_REG_ES: + return vm86regs->es; + case INAT_SEG_REG_FS: + return vm86regs->fs; + case INAT_SEG_REG_GS: + return vm86regs->gs; + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } + } + + switch (seg_reg_idx) { + case INAT_SEG_REG_CS: + return (unsigned short)(regs->cs & 0xffff); + case INAT_SEG_REG_SS: + return (unsigned short)(regs->ss & 0xffff); + case INAT_SEG_REG_DS: + return (unsigned short)(regs->ds & 0xffff); + case INAT_SEG_REG_ES: + return (unsigned short)(regs->es & 0xffff); + case INAT_SEG_REG_FS: + return (unsigned short)(regs->fs & 0xffff); + case INAT_SEG_REG_GS: + /* + * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS. + * The macro below takes care of both cases. + */ + return get_user_gs(regs); + case INAT_SEG_REG_IGNORE: + /* fall through */ + default: + return -EINVAL; + } +#endif /* CONFIG_X86_64 */ +} + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = 0; + + static const int regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif + }; + int nr_registers = ARRAY_SIZE(regoff); + /* + * Don't possibly decode a 32-bit instructions as + * reading a 64-bit-only register. + */ + if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) + nr_registers -= 8; + + switch (type) { + case REG_TYPE_RM: + regno = X86_MODRM_RM(insn->modrm.value); + + /* + * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement + * follows the ModRM byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + case REG_TYPE_INDEX: + regno = X86_SIB_INDEX(insn->sib.value); + if (X86_REX_X(insn->rex_prefix.value)) + regno += 8; + + /* + * If ModRM.mod != 3 and SIB.index = 4 the scale*index + * portion of the address computation is null. This is + * true only if REX.X is 0. In such a case, the SIB index + * is used in the address computation. + */ + if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4) + return -EDOM; + break; + + case REG_TYPE_BASE: + regno = X86_SIB_BASE(insn->sib.value); + /* + * If ModRM.mod is 0 and SIB.base == 5, the base of the + * register-indirect addressing is 0. In this case, a + * 32-bit displacement follows the SIB byte. + */ + if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5) + return -EDOM; + + if (X86_REX_B(insn->rex_prefix.value)) + regno += 8; + break; + + default: + pr_err_ratelimited("invalid register type: %d\n", type); + return -EINVAL; + } + + if (regno >= nr_registers) { + WARN_ONCE(1, "decoded an instruction with an invalid register"); + return -EINVAL; + } + return regoff[regno]; +} + +/** + * get_reg_offset_16() - Obtain offset of register indicated by instruction + * @insn: Instruction containing ModRM byte + * @regs: Register values as seen when entering kernel mode + * @offs1: Offset of the first operand register + * @offs2: Offset of the second opeand register, if applicable + * + * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte + * in @insn. This function is to be used with 16-bit address encodings. The + * @offs1 and @offs2 will be written with the offset of the two registers + * indicated by the instruction. In cases where any of the registers is not + * referenced by the instruction, the value will be set to -EDOM. + * + * Returns: + * + * 0 on success, -EINVAL on error. + */ +static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, + int *offs1, int *offs2) +{ + /* + * 16-bit addressing can use one or two registers. Specifics of + * encodings are given in Table 2-1. "16-Bit Addressing Forms with the + * ModR/M Byte" of the Intel Software Development Manual. + */ + static const int regoff1[] = { + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, bx), + }; + + static const int regoff2[] = { + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), + -EDOM, + -EDOM, + -EDOM, + -EDOM, + }; + + if (!offs1 || !offs2) + return -EINVAL; + + /* Operand is a register, use the generic function. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + *offs1 = insn_get_modrm_rm_off(insn, regs); + *offs2 = -EDOM; + return 0; + } + + *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)]; + *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)]; + + /* + * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- + * only addressing. This means that no registers are involved in + * computing the effective address. Thus, ensure that the first + * register offset is invalild. The second register offset is already + * invalid under the aforementioned conditions. + */ + if ((X86_MODRM_MOD(insn->modrm.value) == 0) && + (X86_MODRM_RM(insn->modrm.value) == 6)) + *offs1 = -EDOM; + + return 0; +} + +/** + * get_desc() - Obtain pointer to a segment descriptor + * @sel: Segment selector + * + * Given a segment selector, obtain a pointer to the segment descriptor. + * Both global and local descriptor tables are supported. + * + * Returns: + * + * Pointer to segment descriptor on success. + * + * NULL on error. + */ +static struct desc_struct *get_desc(unsigned short sel) +{ + struct desc_ptr gdt_desc = {0, 0}; + unsigned long desc_base; + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc = NULL; + struct ldt_struct *ldt; + + /* Bits [15:3] contain the index of the desired entry. */ + sel >>= 3; + + mutex_lock(¤t->active_mm->context.lock); + ldt = current->active_mm->context.ldt; + if (ldt && sel < ldt->nr_entries) + desc = &ldt->entries[sel]; + + mutex_unlock(¤t->active_mm->context.lock); + + return desc; + } +#endif + native_store_gdt(&gdt_desc); + + /* + * Segment descriptors have a size of 8 bytes. Thus, the index is + * multiplied by 8 to obtain the memory offset of the desired descriptor + * from the base of the GDT. As bits [15:3] of the segment selector + * contain the index, it can be regarded as multiplied by 8 already. + * All that remains is to clear bits [2:0]. + */ + desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK); + + if (desc_base > gdt_desc.size) + return NULL; + + return (struct desc_struct *)(gdt_desc.address + desc_base); +} + +/** + * insn_get_seg_base() - Obtain base address of segment descriptor. + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the base address of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, base address of the segment. Zero in long mode, + * except when FS or GS are used. In virtual-8086 mode, the segment + * selector shifted 4 bits to the right. + * + * -1L in case of error. + */ +unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return -1L; + + if (v8086_mode(regs)) + /* + * Base is simply the segment selector shifted 4 + * bits to the right. + */ + return (unsigned long)(sel << 4); + + if (user_64bit_mode(regs)) { + /* + * Only FS or GS will have a base address, the rest of + * the segments' bases are forced to 0. + */ + unsigned long base; + + if (seg_reg_idx == INAT_SEG_REG_FS) + rdmsrl(MSR_FS_BASE, base); + else if (seg_reg_idx == INAT_SEG_REG_GS) + /* + * swapgs was called at the kernel entry point. Thus, + * MSR_KERNEL_GS_BASE will have the user-space GS base. + */ + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = 0; + return base; + } + + /* In protected mode the segment selector cannot be null. */ + if (!sel) + return -1L; + + desc = get_desc(sel); + if (!desc) + return -1L; + + return get_desc_base(desc); +} + +/** + * get_seg_limit() - Obtain the limit of a segment descriptor + * @regs: Register values as seen when entering kernel mode + * @seg_reg_idx: Index of the segment register pointing to seg descriptor + * + * Obtain the limit of the segment as indicated by the segment descriptor + * pointed by the segment selector. The segment selector is obtained from the + * input segment register index @seg_reg_idx. + * + * Returns: + * + * In protected mode, the limit of the segment descriptor in bytes. + * In long mode and virtual-8086 mode, segment limits are not enforced. Thus, + * limit is returned as -1L to imply a limit-less segment. + * + * Zero is returned on error. + */ +static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx) +{ + struct desc_struct *desc; + unsigned long limit; + short sel; + + sel = get_segment_selector(regs, seg_reg_idx); + if (sel < 0) + return 0; + + if (user_64bit_mode(regs) || v8086_mode(regs)) + return -1L; + + if (!sel) + return 0; + + desc = get_desc(sel); + if (!desc) + return 0; + + /* + * If the granularity bit is set, the limit is given in multiples + * of 4096. This also means that the 12 least significant bits are + * not tested when checking the segment limits. In practice, + * this means that the segment ends in (limit << 12) + 0xfff. + */ + limit = get_desc_limit(desc); + if (desc->g) + limit = (limit << 12) + 0xfff; + + return limit; +} + +/** + * insn_get_code_seg_params() - Obtain code segment parameters + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain address and operand sizes of the code segment. It is obtained from the + * selector contained in the CS register in regs. In protected mode, the default + * address is determined by inspecting the L and D bits of the segment + * descriptor. In virtual-8086 mode, the default is always two bytes for both + * address and operand sizes. + * + * Returns: + * + * An int containing ORed-in default parameters on success. + * + * -EINVAL on error. + */ +int insn_get_code_seg_params(struct pt_regs *regs) +{ + struct desc_struct *desc; + short sel; + + if (v8086_mode(regs)) + /* Address and operand size are both 16-bit. */ + return INSN_CODE_SEG_PARAMS(2, 2); + + sel = get_segment_selector(regs, INAT_SEG_REG_CS); + if (sel < 0) + return sel; + + desc = get_desc(sel); + if (!desc) + return -EINVAL; + + /* + * The most significant byte of the Type field of the segment descriptor + * determines whether a segment contains data or code. If this is a data + * segment, return error. + */ + if (!(desc->type & BIT(3))) + return -EINVAL; + + switch ((desc->l << 1) | desc->d) { + case 0: /* + * Legacy mode. CS.L=0, CS.D=0. Address and operand size are + * both 16-bit. + */ + return INSN_CODE_SEG_PARAMS(2, 2); + case 1: /* + * Legacy mode. CS.L=0, CS.D=1. Address and operand size are + * both 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 4); + case 2: /* + * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit; + * operand size is 32-bit. + */ + return INSN_CODE_SEG_PARAMS(4, 8); + case 3: /* Invalid setting. CS.L=1, CS.D=1 */ + /* fall through */ + default: + return -EINVAL; + } +} + +/** + * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the r/m part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. In specific + * cases, the returned value can be -EDOM to indicate that the particular value + * of ModRM does not refer to a register and shall be ignored. + */ +int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) +{ + return get_reg_offset(insn, regs, REG_TYPE_RM); +} + +/** + * get_seg_base_limit() - obtain base address and limit of a segment + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor + * @base: Obtained segment base + * @limit: Obtained segment limit + * + * Obtain the base address and limit of the segment associated with the operand + * @regoff and, if any or allowed, override prefixes in @insn. This function is + * different from insn_get_seg_base() as the latter does not resolve the segment + * associated with the instruction operand. If a limit is not needed (e.g., + * when running in long mode), @limit can be NULL. + * + * Returns: + * + * 0 on success. @base and @limit will contain the base address and of the + * resolved segment, respectively. + * + * -EINVAL on error. + */ +static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, + int regoff, unsigned long *base, + unsigned long *limit) +{ + int seg_reg_idx; + + if (!base) + return -EINVAL; + + seg_reg_idx = resolve_seg_reg(insn, regs, regoff); + if (seg_reg_idx < 0) + return seg_reg_idx; + + *base = insn_get_seg_base(regs, seg_reg_idx); + if (*base == -1L) + return -EINVAL; + + if (!limit) + return 0; + + *limit = get_seg_limit(regs, seg_reg_idx); + if (!(*limit)) + return -EINVAL; + + return 0; +} + +/** + * get_eff_addr_reg() - Obtain effective address from register operand + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, with the effective address + * @eff_addr: Obtained effective address + * + * Obtain the effective address stored in the register operand as indicated by + * the ModRM byte. This function is to be used only with register addressing + * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The + * register operand, as an offset from the base of pt_regs, is saved in @regoff; + * such offset can then be used to resolve the segment associated with the + * operand. This function can be used with any of the supported address sizes + * in x86. + * + * Returns: + * + * 0 on success. @eff_addr will have the effective address stored in the + * operand indicated by ModRM. @regoff will have such operand as an offset from + * the base of pt_regs. + * + * -EINVAL on error. + */ +static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) != 3) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + if (*regoff < 0) + return -EINVAL; + + /* Ignore bytes that are outside the address size. */ + if (insn->addr_bytes == 2) + *eff_addr = regs_get_register(regs, *regoff) & 0xffff; + else if (insn->addr_bytes == 4) + *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff; + else /* 64-bit address */ + *eff_addr = regs_get_register(regs, *regoff); + + return 0; +} + +/** + * get_eff_addr_modrm() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the ModRM byte of @insn. After + * identifying the registers involved in the register-indirect memory reference, + * its value is obtained from the operands in @regs. The computed address is + * stored @eff_addr. Also, the register operand that indicates the associated + * segment is stored in @regoff, this parameter can later be used to determine + * such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, + int *regoff, long *eff_addr) +{ + long tmp; + + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + *regoff = get_reg_offset(insn, regs, REG_TYPE_RM); + + /* + * -EDOM means that we must ignore the address_offset. In such a case, + * in 64-bit mode the effective address relative to the rIP of the + * following instruction. + */ + if (*regoff == -EDOM) { + if (user_64bit_mode(regs)) + tmp = regs->ip + insn->length; + else + tmp = 0; + } else if (*regoff < 0) { + return -EINVAL; + } else { + tmp = regs_get_register(regs, *regoff); + } + + if (insn->addr_bytes == 4) { + int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = tmp + insn->displacement.value; + } + + return 0; +} + +/** + * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the 16-bit effective address referenced by the ModRM byte of @insn. + * After identifying the registers involved in the register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates + * the associated segment is stored in @regoff, this parameter can later be used + * to determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. @regoff + * will have a register, as an offset from the base of pt_regs, that can be used + * to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs, + int *regoff, short *eff_addr) +{ + int addr_offset1, addr_offset2, ret; + short addr1 = 0, addr2 = 0, displacement; + + if (insn->addr_bytes != 2) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2); + if (ret < 0) + return -EINVAL; + + /* + * Don't fail on invalid offset values. They might be invalid because + * they cannot be used for this particular value of ModRM. Instead, use + * them in the computation only if they contain a valid value. + */ + if (addr_offset1 != -EDOM) + addr1 = regs_get_register(regs, addr_offset1) & 0xffff; + + if (addr_offset2 != -EDOM) + addr2 = regs_get_register(regs, addr_offset2) & 0xffff; + + displacement = insn->displacement.value & 0xffff; + *eff_addr = addr1 + addr2 + displacement; + + /* + * The first operand register could indicate to use of either SS or DS + * registers to obtain the segment selector. The second operand + * register can only indicate the use of DS. Thus, the first operand + * will be used to obtain the segment selector. + */ + *regoff = addr_offset1; + + return 0; +} + +/** + * get_eff_addr_sib() - Obtain referenced effective address via SIB + * @insn: Instruction. Must be valid. + * @regs: Register values as seen when entering kernel mode + * @regoff: Obtained operand offset, in pt_regs, associated with segment + * @eff_addr: Obtained effective address + * + * Obtain the effective address referenced by the SIB byte of @insn. After + * identifying the registers involved in the indexed, register-indirect memory + * reference, its value is obtained from the operands in @regs. The computed + * address is stored @eff_addr. Also, the register operand that indicates the + * associated segment is stored in @regoff, this parameter can later be used to + * determine such segment. + * + * Returns: + * + * 0 on success. @eff_addr will have the referenced effective address. + * @base_offset will have a register, as an offset from the base of pt_regs, + * that can be used to resolve the associated segment. + * + * -EINVAL on error. + */ +static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, + int *base_offset, long *eff_addr) +{ + long base, indx; + int indx_offset; + + if (insn->addr_bytes != 8 && insn->addr_bytes != 4) + return -EINVAL; + + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + if (X86_MODRM_MOD(insn->modrm.value) > 2) + return -EINVAL; + + insn_get_sib(insn); + + if (!insn->sib.nbytes) + return -EINVAL; + + *base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); + indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); + + /* + * Negative values in the base and index offset means an error when + * decoding the SIB byte. Except -EDOM, which means that the registers + * should not be used in the address computation. + */ + if (*base_offset == -EDOM) + base = 0; + else if (*base_offset < 0) + return -EINVAL; + else + base = regs_get_register(regs, *base_offset); + + if (indx_offset == -EDOM) + indx = 0; + else if (indx_offset < 0) + return -EINVAL; + else + indx = regs_get_register(regs, indx_offset); + + if (insn->addr_bytes == 4) { + int addr32, base32, idx32; + + base32 = base & 0xffffffff; + idx32 = indx & 0xffffffff; + + addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value)); + addr32 += insn->displacement.value; + + *eff_addr = addr32 & 0xffffffff; + } else { + *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value)); + *eff_addr += insn->displacement.value; + } + + return 0; +} + +/** + * get_addr_ref_16() - Obtain the 16-bit address referred by instruction + * @insn: Instruction containing ModRM byte and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 16-bit address encodings. Obtain the memory + * address referred by the instruction's ModRM and displacement bytes. Also, the + * segment used as base is determined by either any segment override prefixes in + * @insn or the default segment of the registers involved in the address + * computation. In protected mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by the instruction operands on success. + * + * -1L on error. + */ +static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int ret, regoff; + short eff_addr; + long tmp; + + insn_get_modrm(insn); + insn_get_displacement(insn); + + if (insn->addr_bytes != 2) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm_16(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * Before computing the linear address, make sure the effective address + * is within the limits of the segment. In virtual-8086 mode, segment + * limits are not enforced. In such a case, the segment limit is -1L to + * reflect this fact. + */ + if ((unsigned long)(eff_addr & 0xffff) > seg_limit) + goto out; + + linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base; + + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + +out: + return (void __user *)linear_addr; +} + +/** + * get_addr_ref_32() - Obtain a 32-bit linear address + * @insn: Instruction with ModRM, SIB bytes and displacement + * @regs: Register values as seen when entering kernel mode + * + * This function is to be used with 32-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. If in protected + * mode, segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base, seg_limit; + int eff_addr, regoff; + long tmp; + int ret; + + if (insn->addr_bytes != 4) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + + } else { + if (insn->sib.nbytes) { + ret = get_eff_addr_sib(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } else { + ret = get_eff_addr_modrm(insn, regs, ®off, &tmp); + if (ret) + goto out; + + eff_addr = tmp; + } + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit); + if (ret) + goto out; + + /* + * In protected mode, before computing the linear address, make sure + * the effective address is within the limits of the segment. + * 32-bit addresses can be used in long and virtual-8086 modes if an + * address override prefix is used. In such cases, segment limits are + * not enforced. When in virtual-8086 mode, the segment limit is -1L + * to reflect this situation. + * + * After computed, the effective address is treated as an unsigned + * quantity. + */ + if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit)) + goto out; + + /* + * Even though 32-bit address encodings are allowed in virtual-8086 + * mode, the address range is still limited to [0x-0xffff]. + */ + if (v8086_mode(regs) && (eff_addr & ~0xffff)) + goto out; + + /* + * Data type long could be 64 bits in size. Ensure that our 32-bit + * effective address is not sign-extended when computing the linear + * address. + */ + linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base; + + /* Limit linear address to 20 bits */ + if (v8086_mode(regs)) + linear_addr &= 0xfffff; + +out: + return (void __user *)linear_addr; +} + +/** + * get_addr_ref_64() - Obtain a 64-bit linear address + * @insn: Instruction struct with ModRM and SIB bytes and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * This function is to be used with 64-bit address encodings to obtain the + * linear memory address referred by the instruction's ModRM, SIB, + * displacement bytes and segment base address, as applicable. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +#ifndef CONFIG_X86_64 +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) +{ + return (void __user *)-1L; +} +#else +static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs) +{ + unsigned long linear_addr = -1L, seg_base; + int regoff, ret; + long eff_addr; + + if (insn->addr_bytes != 8) + goto out; + + if (X86_MODRM_MOD(insn->modrm.value) == 3) { + ret = get_eff_addr_reg(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + + } else { + if (insn->sib.nbytes) { + ret = get_eff_addr_sib(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } else { + ret = get_eff_addr_modrm(insn, regs, ®off, &eff_addr); + if (ret) + goto out; + } + + } + + ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL); + if (ret) + goto out; + + linear_addr = (unsigned long)eff_addr + seg_base; + +out: + return (void __user *)linear_addr; +} +#endif /* CONFIG_X86_64 */ + +/** + * insn_get_addr_ref() - Obtain the linear address referred by instruction + * @insn: Instruction structure containing ModRM byte and displacement + * @regs: Structure with register values as seen when entering kernel mode + * + * Obtain the linear address referred by the instruction's ModRM, SIB and + * displacement bytes, and segment base, as applicable. In protected mode, + * segment limits are enforced. + * + * Returns: + * + * Linear address referenced by instruction and registers on success. + * + * -1L on error. + */ +void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) +{ + if (!insn || !regs) + return (void __user *)-1L; + + switch (insn->addr_bytes) { + case 2: + return get_addr_ref_16(insn, regs); + case 4: + return get_addr_ref_32(insn, regs); + case 8: + return get_addr_ref_64(insn, regs); + default: + return (void __user *)-1L; + } +} diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index bf2c6074efd2..dc2ab6ea6768 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -98,6 +98,18 @@ ENTRY(call_rwsem_down_read_failed) ret ENDPROC(call_rwsem_down_read_failed) +ENTRY(call_rwsem_down_read_failed_killable) + FRAME_BEGIN + save_common_regs + __ASM_SIZE(push,) %__ASM_REG(dx) + movq %rax,%rdi + call rwsem_down_read_failed_killable + __ASM_SIZE(pop,) %__ASM_REG(dx) + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_read_failed_killable) + ENTRY(call_rwsem_down_write_failed) FRAME_BEGIN save_common_regs diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 12e377184ee4..e0b85930dd77 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) -ff: +ff: UD0 EndTable Table: 3-byte opcode 1 (0x0f 0x38) @@ -717,7 +717,7 @@ AVXcode: 2 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) 80: INVEPT Gy,Mdq (66) -81: INVPID Gy,Mdq (66) +81: INVVPID Gy,Mdq (66) 82: INVPCID Gy,Mdq (66) 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) 88: vexpandps/d Vpd,Wpd (66),(ev) @@ -896,7 +896,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb @@ -970,6 +970,15 @@ GrpTable: Grp9 EndTable GrpTable: Grp10 +# all are UD1 +0: UD1 +1: UD1 +2: UD1 +3: UD1 +4: UD1 +5: UD1 +6: UD1 +7: UD1 EndTable # Grp11A and Grp11B are expressed as Grp11 in Intel SDM diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 52906808e277..27e9e90a8d35 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck/ - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index c3521e2be396..9fe656c42aa5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,6 +1,7 @@ #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <xen/xen.h> #include <asm/fpu/internal.h> #include <asm/traps.h> @@ -67,17 +68,22 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup, * wrapped around) will be set. Additionally, seeing the refcount * reach 0 will set ZF (Zero Flag: result was zero). In each of * these cases we want a report, since it's a boundary condition. - * + * The SF case is not reported since it indicates post-boundary + * manipulations below zero or above INT_MAX. And if none of the + * flags are set, something has gone very wrong, so report it. */ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { bool zero = regs->flags & X86_EFLAGS_ZF; refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } else if ((regs->flags & X86_EFLAGS_SF) == 0) { + /* Report if none of OF, ZF, nor SF are set. */ + refcount_error_report(regs, "unexpected saturation"); } return true; } -EXPORT_SYMBOL_GPL(ex_handler_refcount); +EXPORT_SYMBOL(ex_handler_refcount); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -207,8 +213,9 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. + * Xen pv domains are not using the default __KERNEL_CS. */ - if (regs->cs != __KERNEL_CS) + if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3109ba6c6ede..800de815519c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,7 +20,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ -#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ @@ -173,14 +172,15 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) +static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, + u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return; /* Fault not from Protection Keys: nothing to do */ - if (si_code != SEGV_PKUERR) + if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) return; /* * force_sig_info_fault() is called from a number of @@ -219,7 +219,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, pkey); + fill_sig_info_pkey(si_signo, si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -439,18 +439,13 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd_ref)) return -1; - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_ref); - arch_flush_lazy_mmu_mode(); - } else if (CONFIG_PGTABLE_LEVELS > 4) { - /* - * With folded p4d, pgd_none() is always false, so the pgd may - * point to an empty page table entry and pgd_page_vaddr() - * will return garbage. - * - * We will do the correct sanity check on the p4d level. - */ - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (CONFIG_PGTABLE_LEVELS > 4) { + if (pgd_none(*pgd)) { + set_pgd(pgd, *pgd_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } } /* With 4-level paging, copying happens on the p4d level. */ @@ -459,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address) if (p4d_none(*p4d_ref)) return -1; - if (p4d_none(*p4d)) { + if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) { set_p4d(p4d, *p4d_ref); arch_flush_lazy_mmu_mode(); } else { @@ -470,6 +465,7 @@ static noinline int vmalloc_fault(unsigned long address) * Below here mismatches are bugs because these lower tables * are shared: */ + BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); pud = pud_offset(p4d, address); pud_ref = pud_offset(p4d_ref, address); @@ -702,7 +698,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, else printk(KERN_CONT "paging request"); - printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_CONT " at %px\n", (void *) address); printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); @@ -861,7 +857,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -1256,8 +1252,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. */ - if (kmemcheck_active(regs)) - kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) @@ -1280,9 +1274,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; - - if (kmemcheck_fault(regs, address, error_code)) - return; } /* Can handle a stale RO->RW TLB: */ diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8ae0000cbdb3..00b296617ca4 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -158,6 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len > TASK_SIZE) return -ENOMEM; + /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; @@ -165,12 +166,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } if (addr) { - addr = ALIGN(addr, huge_page_size(h)); + addr &= huge_page_mask(h); + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) + if (!vma || addr + len <= vm_start_gap(vma)) return addr; } + +get_unmapped_area: if (mm->get_unmapped_area == arch_get_unmapped_area) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6b462a472a7b..82f5252c723a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -93,8 +93,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { @@ -171,12 +170,11 @@ static void enable_global_pages(void) static void __init probe_page_size_mask(void) { /* - * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will - * use small pages. + * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..4a837289f2ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); @@ -1173,12 +1173,18 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - register_page_bootmem_info(); - /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_OTHER); @@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, vmemmap_verify((pte_t *)pmd, node, addr, next); continue; } - pr_warn_once("vmemmap: falling back to regular page backing\n"); if (vmemmap_populate_basepages(addr, next, node)) return -ENOMEM; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 34f0e1847dd6..c45b6ec5357b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -27,6 +27,11 @@ #include "physaddr.h" +struct ioremap_mem_flags { + bool system_ram; + bool desc_other; +}; + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, - void *arg) +static bool __ioremap_check_ram(struct resource *res) { + unsigned long start_pfn, stop_pfn; unsigned long i; - for (i = 0; i < nr_pages; ++i) - if (pfn_valid(start_pfn + i) && - !PageReserved(pfn_to_page(start_pfn + i))) - return 1; + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return false; - return 0; + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return true; + } + + return false; +} + +static int __ioremap_check_desc_other(struct resource *res) +{ + return (res->desc != IORES_DESC_NONE); +} + +static int __ioremap_res_check(struct resource *res, void *arg) +{ + struct ioremap_mem_flags *flags = arg; + + if (!flags->system_ram) + flags->system_ram = __ioremap_check_ram(res); + + if (!flags->desc_other) + flags->desc_other = __ioremap_check_desc_other(res); + + return flags->system_ram && flags->desc_other; +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_mem_flags *flags) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(flags, 0, sizeof(*flags)); + + walk_mem_res(start, end, flags, __ioremap_res_check); } /* @@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, enum page_cache_mode pcm, void *caller) { unsigned long offset, vaddr; - resource_size_t pfn, last_pfn, last_addr; + resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; + struct ioremap_mem_flags mem_flags; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } + __ioremap_check_mem(phys_addr, size, &mem_flags); + /* * Don't allow anybody to remap normal RAM that we're using.. */ - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) { + if (mem_flags.system_ram) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pcm = new_pcm; } + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + */ prot = PAGE_KERNEL_IO; + if (sev_active() && mem_flags.desc_other) + prot = pgprot_encrypted(prot); + switch (pcm) { case _PAGE_CACHE_MODE_UC: default: @@ -349,11 +404,11 @@ void iounmap(volatile void __iomem *addr) return; } + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by @@ -422,6 +477,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) * areas should be mapped decrypted. And since the encryption key can * change across reboots, persistent memory should also be mapped * decrypted. + * + * If SEV is active, that implies that BIOS/UEFI also ran encrypted so + * only persistent memory should be mapped decrypted. */ static bool memremap_should_map_decrypted(resource_size_t phys_addr, unsigned long size) @@ -458,6 +516,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, case E820_TYPE_ACPI: case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: + /* For SEV, these areas are encrypted */ + if (sev_active()) + break; + /* Fallthrough */ + case E820_TYPE_PRAM: return true; default: @@ -581,7 +644,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, unsigned long flags) { - if (!sme_active()) + if (!mem_encrypt_active()) return true; if (flags & MEMREMAP_ENC) @@ -590,12 +653,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, if (flags & MEMREMAP_DEC) return false; - if (memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - return false; + if (sme_active()) { + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + return false; + } - return true; + return !memremap_should_map_decrypted(phys_addr, size); } /* @@ -608,17 +672,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, unsigned long size, pgprot_t prot) { - if (!sme_active()) + bool encrypted_prot; + + if (!mem_encrypt_active()) return prot; - if (early_memremap_is_setup_data(phys_addr, size) || - memremap_is_efi_data(phys_addr, size) || - memremap_should_map_decrypted(phys_addr, size)) - prot = pgprot_decrypted(prot); - else - prot = pgprot_encrypted(prot); + encrypted_prot = true; + + if (sme_active()) { + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + encrypted_prot = false; + } + + if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size)) + encrypted_prot = false; - return prot; + return encrypted_prot ? pgprot_encrypted(prot) + : pgprot_decrypted(prot); } bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile deleted file mode 100644 index 520b3bce4095..000000000000 --- a/arch/x86/mm/kmemcheck/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c deleted file mode 100644 index 872ec4159a68..000000000000 --- a/arch/x86/mm/kmemcheck/error.c +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/interrupt.h> -#include <linux/kdebug.h> -#include <linux/kmemcheck.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/stacktrace.h> -#include <linux/string.h> - -#include "error.h" -#include "shadow.h" - -enum kmemcheck_error_type { - KMEMCHECK_ERROR_INVALID_ACCESS, - KMEMCHECK_ERROR_BUG, -}; - -#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) - -struct kmemcheck_error { - enum kmemcheck_error_type type; - - union { - /* KMEMCHECK_ERROR_INVALID_ACCESS */ - struct { - /* Kind of access that caused the error */ - enum kmemcheck_shadow state; - /* Address and size of the erroneous read */ - unsigned long address; - unsigned int size; - }; - }; - - struct pt_regs regs; - struct stack_trace trace; - unsigned long trace_entries[32]; - - /* We compress it to a char. */ - unsigned char shadow_copy[SHADOW_COPY_SIZE]; - unsigned char memory_copy[SHADOW_COPY_SIZE]; -}; - -/* - * Create a ring queue of errors to output. We can't call printk() directly - * from the kmemcheck traps, since this may call the console drivers and - * result in a recursive fault. - */ -static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; -static unsigned int error_count; -static unsigned int error_rd; -static unsigned int error_wr; -static unsigned int error_missed_count; - -static struct kmemcheck_error *error_next_wr(void) -{ - struct kmemcheck_error *e; - - if (error_count == ARRAY_SIZE(error_fifo)) { - ++error_missed_count; - return NULL; - } - - e = &error_fifo[error_wr]; - if (++error_wr == ARRAY_SIZE(error_fifo)) - error_wr = 0; - ++error_count; - return e; -} - -static struct kmemcheck_error *error_next_rd(void) -{ - struct kmemcheck_error *e; - - if (error_count == 0) - return NULL; - - e = &error_fifo[error_rd]; - if (++error_rd == ARRAY_SIZE(error_fifo)) - error_rd = 0; - --error_count; - return e; -} - -void kmemcheck_error_recall(void) -{ - static const char *desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", - [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", - [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", - [KMEMCHECK_SHADOW_FREED] = "freed", - }; - - static const char short_desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', - [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', - [KMEMCHECK_SHADOW_INITIALIZED] = 'i', - [KMEMCHECK_SHADOW_FREED] = 'f', - }; - - struct kmemcheck_error *e; - unsigned int i; - - e = error_next_rd(); - if (!e) - return; - - switch (e->type) { - case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", - 8 * e->size, e->state < ARRAY_SIZE(desc) ? - desc[e->state] : "(invalid shadow state)", - (void *) e->address); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk(KERN_CONT "%02x", e->memory_copy[i]); - printk(KERN_CONT "\n"); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) { - if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); - else - printk(KERN_CONT " ?"); - } - printk(KERN_CONT "\n"); - printk(KERN_WARNING "%*c\n", 2 + 2 - * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); - break; - case KMEMCHECK_ERROR_BUG: - printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); - break; - } - - __show_regs(&e->regs, 1); - print_stack_trace(&e->trace, 0); -} - -static void do_wakeup(unsigned long data) -{ - while (error_count > 0) - kmemcheck_error_recall(); - - if (error_missed_count > 0) { - printk(KERN_WARNING "kmemcheck: Lost %d error reports because " - "the queue was too small\n", error_missed_count); - error_missed_count = 0; - } -} - -static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); - -/* - * Save the context of an error report. - */ -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs) -{ - static unsigned long prev_ip; - - struct kmemcheck_error *e; - void *shadow_copy; - void *memory_copy; - - /* Don't report several adjacent errors from the same EIP. */ - if (regs->ip == prev_ip) - return; - prev_ip = regs->ip; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_INVALID_ACCESS; - - e->state = state; - e->address = address; - e->size = size; - - /* Save regs */ - memcpy(&e->regs, regs, sizeof(*regs)); - - /* Save stack trace */ - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 0; - save_stack_trace_regs(regs, &e->trace); - - /* Round address down to nearest 16 bytes */ - shadow_copy = kmemcheck_shadow_lookup(address - & ~(SHADOW_COPY_SIZE - 1)); - BUG_ON(!shadow_copy); - - memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); - - kmemcheck_show_addr(address); - memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); - memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); - kmemcheck_hide_addr(address); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} - -/* - * Save the context of a kmemcheck bug. - */ -void kmemcheck_error_save_bug(struct pt_regs *regs) -{ - struct kmemcheck_error *e; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_BUG; - - memcpy(&e->regs, regs, sizeof(*regs)); - - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 1; - save_stack_trace(&e->trace); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h deleted file mode 100644 index 39f80d7a874d..000000000000 --- a/arch/x86/mm/kmemcheck/error.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H -#define ARCH__X86__MM__KMEMCHECK__ERROR_H - -#include <linux/ptrace.h> - -#include "shadow.h" - -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs); - -void kmemcheck_error_save_bug(struct pt_regs *regs); - -void kmemcheck_error_recall(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c deleted file mode 100644 index 4515bae36bbe..000000000000 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ /dev/null @@ -1,658 +0,0 @@ -/** - * kmemcheck - a heavyweight memory checker for the linux kernel - * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> - * (With a lot of help from Ingo Molnar and Pekka Enberg.) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2) as - * published by the Free Software Foundation. - */ - -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/kallsyms.h> -#include <linux/kernel.h> -#include <linux/kmemcheck.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/percpu.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/types.h> - -#include <asm/cacheflush.h> -#include <asm/kmemcheck.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> - -#include "error.h" -#include "opcode.h" -#include "pte.h" -#include "selftest.h" -#include "shadow.h" - - -#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 0 -#endif - -#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 1 -#endif - -#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT -# define KMEMCHECK_ENABLED 2 -#endif - -int kmemcheck_enabled = KMEMCHECK_ENABLED; - -int __init kmemcheck_init(void) -{ -#ifdef CONFIG_SMP - /* - * Limit SMP to use a single CPU. We rely on the fact that this code - * runs before SMP is set up. - */ - if (setup_max_cpus > 1) { - printk(KERN_INFO - "kmemcheck: Limiting number of CPUs to 1.\n"); - setup_max_cpus = 1; - } -#endif - - if (!kmemcheck_selftest()) { - printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); - kmemcheck_enabled = 0; - return -EINVAL; - } - - printk(KERN_INFO "kmemcheck: Initialized\n"); - return 0; -} - -early_initcall(kmemcheck_init); - -/* - * We need to parse the kmemcheck= option before any memory is allocated. - */ -static int __init param_kmemcheck(char *str) -{ - int val; - int ret; - - if (!str) - return -EINVAL; - - ret = kstrtoint(str, 0, &val); - if (ret) - return ret; - kmemcheck_enabled = val; - return 0; -} - -early_param("kmemcheck", param_kmemcheck); - -int kmemcheck_show_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -int kmemcheck_hide_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -struct kmemcheck_context { - bool busy; - int balance; - - /* - * There can be at most two memory operands to an instruction, but - * each address can cross a page boundary -- so we may need up to - * four addresses that must be hidden/revealed for each fault. - */ - unsigned long addr[4]; - unsigned long n_addrs; - unsigned long flags; - - /* Data size of the instruction that caused a fault. */ - unsigned int size; -}; - -static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); - -bool kmemcheck_active(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - return data->balance > 0; -} - -/* Save an address that needs to be shown/hidden */ -static void kmemcheck_save_addr(unsigned long addr) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); - data->addr[data->n_addrs++] = addr; -} - -static unsigned int kmemcheck_show_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_show_addr(data->addr[i]); - - return n; -} - -static unsigned int kmemcheck_hide_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_hide_addr(data->addr[i]); - - return n; -} - -/* - * Called from the #PF handler. - */ -void kmemcheck_show(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 0)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->balance = 0; - return; - } - - /* - * None of the addresses actually belonged to kmemcheck. Note that - * this is not an error. - */ - if (kmemcheck_show_all() == 0) - return; - - ++data->balance; - - /* - * The IF needs to be cleared as well, so that the faulting - * instruction can run "uninterrupted". Otherwise, we might take - * an interrupt and start executing that before we've had a chance - * to hide the page again. - * - * NOTE: In the rare case of multiple faults, we must not override - * the original flags: - */ - if (!(regs->flags & X86_EFLAGS_TF)) - data->flags = regs->flags; - - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; -} - -/* - * Called from the #DB handler. - */ -void kmemcheck_hide(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - int n; - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 1)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->n_addrs = 0; - data->balance = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; - return; - } - - if (kmemcheck_enabled) - n = kmemcheck_hide_all(); - else - n = kmemcheck_show_all(); - - if (n == 0) - return; - - --data->balance; - - data->n_addrs = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; -} - -void kmemcheck_show_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -bool kmemcheck_page_is_tracked(struct page *p) -{ - /* This will also check the "hidden" flag of the PTE. */ - return kmemcheck_pte_lookup((unsigned long) page_address(p)); -} - -void kmemcheck_hide_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -/* Access may NOT cross page boundary */ -static void kmemcheck_read_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - enum kmemcheck_shadow status; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; - - /* Don't warn about it again. */ - kmemcheck_shadow_set(shadow, size); -} - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - enum kmemcheck_shadow status; - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return true; - - status = kmemcheck_shadow_test_all(shadow, size); - - return status == KMEMCHECK_SHADOW_INITIALIZED; -} - -/* Access may cross page boundary */ -static void kmemcheck_read(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_read_strict(regs, addr, size); - return; - } - - /* - * What we do is basically to split the access across the - * two pages and handle each part separately. Yes, this means - * that we may now see reads that are 3 + 5 bytes, for - * example (and if both are uninitialized, there will be two - * reports), but it makes the code a lot simpler. - */ - kmemcheck_read_strict(regs, addr, next_page - addr); - kmemcheck_read_strict(regs, next_page, next_addr - next_page); -} - -static void kmemcheck_write_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - kmemcheck_shadow_set(shadow, size); -} - -static void kmemcheck_write(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_write_strict(regs, addr, size); - return; - } - - /* See comment in kmemcheck_read(). */ - kmemcheck_write_strict(regs, addr, next_page - addr); - kmemcheck_write_strict(regs, next_page, next_addr - next_page); -} - -/* - * Copying is hard. We have two addresses, each of which may be split across - * a page (and each page will have different shadow addresses). - */ -static void kmemcheck_copy(struct pt_regs *regs, - unsigned long src_addr, unsigned long dst_addr, unsigned int size) -{ - uint8_t shadow[8]; - enum kmemcheck_shadow status; - - unsigned long page; - unsigned long next_addr; - unsigned long next_page; - - uint8_t *x; - unsigned int i; - unsigned int n; - - BUG_ON(size > sizeof(shadow)); - - page = src_addr & PAGE_MASK; - next_addr = src_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < size; ++i) - shadow[i] = x[i]; - } else { - for (i = 0; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } else { - n = next_page - src_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < n; ++i) - shadow[i] = x[i]; - } else { - /* Not tracked */ - for (i = 0; i < n; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) - shadow[i] = x[i - n]; - } else { - /* Not tracked */ - for (i = n; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - page = dst_addr & PAGE_MASK; - next_addr = dst_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < size; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } else { - n = next_page - dst_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < n; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) { - x[i - n] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } - - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, src_addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; -} - -enum kmemcheck_method { - KMEMCHECK_READ, - KMEMCHECK_WRITE, -}; - -static void kmemcheck_access(struct pt_regs *regs, - unsigned long fallback_address, enum kmemcheck_method fallback_method) -{ - const uint8_t *insn; - const uint8_t *insn_primary; - unsigned int size; - - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - /* Recursive fault -- ouch. */ - if (data->busy) { - kmemcheck_show_addr(fallback_address); - kmemcheck_error_save_bug(regs); - return; - } - - data->busy = true; - - insn = (const uint8_t *) regs->ip; - insn_primary = kmemcheck_opcode_get_primary(insn); - - kmemcheck_opcode_decode(insn, &size); - - switch (insn_primary[0]) { -#ifdef CONFIG_KMEMCHECK_BITOPS_OK - /* AND, OR, XOR */ - /* - * Unfortunately, these instructions have to be excluded from - * our regular checking since they access only some (and not - * all) bits. This clears out "bogus" bitfield-access warnings. - */ - case 0x80: - case 0x81: - case 0x82: - case 0x83: - switch ((insn_primary[1] >> 3) & 7) { - /* OR */ - case 1: - /* AND */ - case 4: - /* XOR */ - case 6: - kmemcheck_write(regs, fallback_address, size); - goto out; - - /* ADD */ - case 0: - /* ADC */ - case 2: - /* SBB */ - case 3: - /* SUB */ - case 5: - /* CMP */ - case 7: - break; - } - break; -#endif - - /* MOVS, MOVSB, MOVSW, MOVSD */ - case 0xa4: - case 0xa5: - /* - * These instructions are special because they take two - * addresses, but we only get one page fault. - */ - kmemcheck_copy(regs, regs->si, regs->di, size); - goto out; - - /* CMPS, CMPSB, CMPSW, CMPSD */ - case 0xa6: - case 0xa7: - kmemcheck_read(regs, regs->si, size); - kmemcheck_read(regs, regs->di, size); - goto out; - } - - /* - * If the opcode isn't special in any way, we use the data from the - * page fault handler to determine the address and type of memory - * access. - */ - switch (fallback_method) { - case KMEMCHECK_READ: - kmemcheck_read(regs, fallback_address, size); - goto out; - case KMEMCHECK_WRITE: - kmemcheck_write(regs, fallback_address, size); - goto out; - } - -out: - data->busy = false; -} - -bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - pte_t *pte; - - /* - * XXX: Is it safe to assume that memory accesses from virtual 86 - * mode or non-kernel code segments will _never_ access kernel - * memory (e.g. tracked pages)? For now, we need this to avoid - * invoking kmemcheck for PnP BIOS calls. - */ - if (regs->flags & X86_VM_MASK) - return false; - if (regs->cs != __KERNEL_CS) - return false; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return false; - - WARN_ON_ONCE(in_nmi()); - - if (error_code & 2) - kmemcheck_access(regs, address, KMEMCHECK_WRITE); - else - kmemcheck_access(regs, address, KMEMCHECK_READ); - - kmemcheck_show(regs); - return true; -} - -bool kmemcheck_trap(struct pt_regs *regs) -{ - if (!kmemcheck_active(regs)) - return false; - - /* We're done. */ - kmemcheck_hide(regs); - return true; -} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c deleted file mode 100644 index df8109ddf7fe..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/types.h> - -#include "opcode.h" - -static bool opcode_is_prefix(uint8_t b) -{ - return - /* Group 1 */ - b == 0xf0 || b == 0xf2 || b == 0xf3 - /* Group 2 */ - || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 - /* Group 3 */ - || b == 0x66 - /* Group 4 */ - || b == 0x67; -} - -#ifdef CONFIG_X86_64 -static bool opcode_is_rex_prefix(uint8_t b) -{ - return (b & 0xf0) == 0x40; -} -#else -static bool opcode_is_rex_prefix(uint8_t b) -{ - return false; -} -#endif - -#define REX_W (1 << 3) - -/* - * This is a VERY crude opcode decoder. We only need to find the size of the - * load/store that caused our #PF and this should work for all the opcodes - * that we care about. Moreover, the ones who invented this instruction set - * should be shot. - */ -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) -{ - /* Default operand size */ - int operand_size_override = 4; - - /* prefixes */ - for (; opcode_is_prefix(*op); ++op) { - if (*op == 0x66) - operand_size_override = 2; - } - - /* REX prefix */ - if (opcode_is_rex_prefix(*op)) { - uint8_t rex = *op; - - ++op; - if (rex & REX_W) { - switch (*op) { - case 0x63: - *size = 4; - return; - case 0x0f: - ++op; - - switch (*op) { - case 0xb6: - case 0xbe: - *size = 1; - return; - case 0xb7: - case 0xbf: - *size = 2; - return; - } - - break; - } - - *size = 8; - return; - } - } - - /* escape opcode */ - if (*op == 0x0f) { - ++op; - - /* - * This is move with zero-extend and sign-extend, respectively; - * we don't have to think about 0xb6/0xbe, because this is - * already handled in the conditional below. - */ - if (*op == 0xb7 || *op == 0xbf) - operand_size_override = 2; - } - - *size = (*op & 1) ? operand_size_override : 1; -} - -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) -{ - /* skip prefixes */ - while (opcode_is_prefix(*op)) - ++op; - if (opcode_is_rex_prefix(*op)) - ++op; - return op; -} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h deleted file mode 100644 index 51a1ce94c24a..000000000000 --- a/arch/x86/mm/kmemcheck/opcode.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H -#define ARCH__X86__MM__KMEMCHECK__OPCODE_H - -#include <linux/types.h> - -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); - -#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c deleted file mode 100644 index 8a03be90272a..000000000000 --- a/arch/x86/mm/kmemcheck/pte.c +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/mm.h> - -#include <asm/pgtable.h> - -#include "pte.h" - -pte_t *kmemcheck_pte_lookup(unsigned long address) -{ - pte_t *pte; - unsigned int level; - - pte = lookup_address(address, &level); - if (!pte) - return NULL; - if (level != PG_LEVEL_4K) - return NULL; - if (!pte_hidden(*pte)) - return NULL; - - return pte; -} - diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h deleted file mode 100644 index b595612382c2..000000000000 --- a/arch/x86/mm/kmemcheck/pte.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H -#define ARCH__X86__MM__KMEMCHECK__PTE_H - -#include <linux/mm.h> - -#include <asm/pgtable.h> - -pte_t *kmemcheck_pte_lookup(unsigned long address); - -#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c deleted file mode 100644 index 7ce0be1f99eb..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/bug.h> -#include <linux/kernel.h> - -#include "opcode.h" -#include "selftest.h" - -struct selftest_opcode { - unsigned int expected_size; - const uint8_t *insn; - const char *desc; -}; - -static const struct selftest_opcode selftest_opcodes[] = { - /* REP MOVS */ - {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, - {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, - - /* MOVZX / MOVZXD */ - {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, - {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, - - /* MOVSX / MOVSXD */ - {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, - {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, - -#ifdef CONFIG_X86_64 - /* MOVZX / MOVZXD */ - {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, - - /* MOVSX / MOVSXD */ - {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, - {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, - {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, -#endif -}; - -static bool selftest_opcode_one(const struct selftest_opcode *op) -{ - unsigned size; - - kmemcheck_opcode_decode(op->insn, &size); - - if (size == op->expected_size) - return true; - - printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", - op->desc, op->expected_size, size); - return false; -} - -static bool selftest_opcodes_all(void) -{ - bool pass = true; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) - pass = pass && selftest_opcode_one(&selftest_opcodes[i]); - - return pass; -} - -bool kmemcheck_selftest(void) -{ - bool pass = true; - - pass = pass && selftest_opcodes_all(); - - return pass; -} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h deleted file mode 100644 index 8d759aae453d..000000000000 --- a/arch/x86/mm/kmemcheck/selftest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H -#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H - -bool kmemcheck_selftest(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c deleted file mode 100644 index c2638a7d2c10..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.c +++ /dev/null @@ -1,173 +0,0 @@ -#include <linux/kmemcheck.h> -#include <linux/export.h> -#include <linux/mm.h> - -#include <asm/page.h> -#include <asm/pgtable.h> - -#include "pte.h" -#include "shadow.h" - -/* - * Return the shadow address for the given address. Returns NULL if the - * address is not tracked. - * - * We need to be extremely careful not to follow any invalid pointers, - * because this function can be called for *any* possible address. - */ -void *kmemcheck_shadow_lookup(unsigned long address) -{ - pte_t *pte; - struct page *page; - - if (!virt_addr_valid(address)) - return NULL; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return NULL; - - page = virt_to_page(address); - if (!page->shadow) - return NULL; - return page->shadow + (address & (PAGE_SIZE - 1)); -} - -static void mark_shadow(void *address, unsigned int n, - enum kmemcheck_shadow status) -{ - unsigned long addr = (unsigned long) address; - unsigned long last_addr = addr + n - 1; - unsigned long page = addr & PAGE_MASK; - unsigned long last_page = last_addr & PAGE_MASK; - unsigned int first_n; - void *shadow; - - /* If the memory range crosses a page boundary, stop there. */ - if (page == last_page) - first_n = n; - else - first_n = page + PAGE_SIZE - addr; - - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, first_n); - - addr += first_n; - n -= first_n; - - /* Do full-page memset()s. */ - while (n >= PAGE_SIZE) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, PAGE_SIZE); - - addr += PAGE_SIZE; - n -= PAGE_SIZE; - } - - /* Do the remaining page, if any. */ - if (n > 0) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, n); - } -} - -void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); -} - -void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); -} - -/* - * Fill the shadow memory of the given address such that the memory at that - * address is marked as being initialized. - */ -void kmemcheck_mark_initialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); -} -EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); - -void kmemcheck_mark_freed(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); -} - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); -} - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) -{ -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK - uint8_t *x; - unsigned int i; - - x = shadow; - - /* - * Make sure _some_ bytes are initialized. Gcc frequently generates - * code to access neighboring bytes. - */ - for (i = 0; i < size; ++i) { - if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -#else - return kmemcheck_shadow_test_all(shadow, size); -#endif -} - -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - - /* All bytes must be initialized. */ - for (i = 0; i < size; ++i) { - if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -} - -void kmemcheck_shadow_set(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - for (i = 0; i < size; ++i) - x[i] = KMEMCHECK_SHADOW_INITIALIZED; -} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h deleted file mode 100644 index 49768dc18664..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H -#define ARCH__X86__MM__KMEMCHECK__SHADOW_H - -enum kmemcheck_shadow { - KMEMCHECK_SHADOW_UNALLOCATED, - KMEMCHECK_SHADOW_UNINITIALIZED, - KMEMCHECK_SHADOW_INITIALIZED, - KMEMCHECK_SHADOW_FREED, -}; - -void *kmemcheck_shadow_lookup(unsigned long address); - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, - unsigned int size); -void kmemcheck_shadow_set(void *shadow, unsigned int size); - -#endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index c21c2ed04612..58477ec3d66d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -435,17 +435,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -454,7 +455,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -528,19 +529,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0286327e65fa..e1d61e8500f9 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -30,6 +30,8 @@ #include <asm/msr.h> #include <asm/cmdline.h> +#include "mm_internal.h" + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -41,6 +43,10 @@ static char sme_cmdline_off[] __initdata = "off"; */ u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL(sme_me_mask); +DEFINE_STATIC_KEY_FALSE(sev_enable_key); +EXPORT_SYMBOL_GPL(sev_enable_key); + +static bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -63,7 +69,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, if (!sme_me_mask) return; - local_flush_tlb(); wbinvd(); /* @@ -190,8 +195,238 @@ void __init sme_early_init(void) /* Update the protection map with memory encryption mask */ for (i = 0; i < ARRAY_SIZE(protection_map); i++) protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (sev_active()) + swiotlb_force = SWIOTLB_FORCE; +} + +static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned long dma_mask; + unsigned int order; + struct page *page; + void *vaddr = NULL; + + dma_mask = dma_alloc_coherent_mask(dev, gfp); + order = get_order(size); + + /* + * Memory will be memset to zero after marking decrypted, so don't + * bother clearing it before. + */ + gfp &= ~__GFP_ZERO; + + page = alloc_pages_node(dev_to_node(dev), gfp, order); + if (page) { + dma_addr_t addr; + + /* + * Since we will be clearing the encryption bit, check the + * mask with it already cleared. + */ + addr = __sme_clr(phys_to_dma(dev, page_to_phys(page))); + if ((addr + size) > dma_mask) { + __free_pages(page, get_order(size)); + } else { + vaddr = page_address(page); + *dma_handle = addr; + } + } + + if (!vaddr) + vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp); + + if (!vaddr) + return NULL; + + /* Clear the SME encryption bit for DMA use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) { + set_memory_decrypted((unsigned long)vaddr, 1 << order); + memset(vaddr, 0, PAGE_SIZE << order); + *dma_handle = __sme_clr(*dma_handle); + } + + return vaddr; +} + +static void sev_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) +{ + /* Set the SME encryption bit for re-use if not swiotlb area */ + if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle))) + set_memory_encrypted((unsigned long)vaddr, + 1 << get_order(size)); + + swiotlb_free_coherent(dev, size, vaddr, dma_handle); +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + old_prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + old_prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + old_prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + return; + } + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << page_level_shift(level); + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); } +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + kernel_physical_mapping_init(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * sme_active() and sev_active() functions are used for this. When a + * distinction isn't needed, the mem_encrypt_active() function can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +bool sme_active(void) +{ + return sme_me_mask && !sev_enabled; +} +EXPORT_SYMBOL(sme_active); + +bool sev_active(void) +{ + return sme_me_mask && sev_enabled; +} +EXPORT_SYMBOL(sev_active); + +static const struct dma_map_ops sev_dma_ops = { + .alloc = sev_alloc, + .free = sev_free, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, +}; + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -201,7 +436,23 @@ void __init mem_encrypt_init(void) /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); - pr_info("AMD Secure Memory Encryption (SME) active\n"); + /* + * With SEV, DMA operations cannot use encryption. New DMA ops + * are required in order to mark the DMA areas as decrypted or + * to use bounce buffers. + */ + if (sev_active()) + dma_ops = &sev_dma_ops; + + /* + * With SEV, we need to unroll the rep string I/O instructions. + */ + if (sev_active()) + static_branch_enable(&sev_enable_key); + + pr_info("AMD %s active\n", + sev_active() ? "Secure Encrypted Virtualization (SEV)" + : "Secure Memory Encryption (SME)"); } void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) @@ -213,37 +464,62 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); } -static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, - unsigned long end) +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; pgd_t *pgd_p; - pgd_start = start & PGDIR_MASK; - pgd_end = end & PGDIR_MASK; + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; - pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); - pgd_size *= sizeof(pgd_t); + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); - pgd_p = pgd_base + pgd_index(start); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); memset(pgd_p, 0, pgd_size); } -#define PGD_FLAGS _KERNPG_TABLE_NOENC -#define P4D_FLAGS _KERNPG_TABLE_NOENC -#define PUD_FLAGS _KERNPG_TABLE_NOENC -#define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) -static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, - unsigned long vaddr, pmdval_t pmd_val) +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +static pmd_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) { pgd_t *pgd_p; p4d_t *p4d_p; pud_t *pud_p; pmd_t *pmd_p; - pgd_p = pgd_base + pgd_index(vaddr); + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); if (native_pgd_val(*pgd_p)) { if (IS_ENABLED(CONFIG_X86_5LEVEL)) p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); @@ -253,15 +529,15 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, pgd_t pgd; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p = pgtable_area; + p4d_p = ppd->pgtable_area; memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); - pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + ppd->pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); } else { - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); } @@ -269,58 +545,160 @@ static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, } if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - p4d_p += p4d_index(vaddr); + p4d_p += p4d_index(ppd->vaddr); if (native_p4d_val(*p4d_p)) { pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); } else { p4d_t p4d; - pud_p = pgtable_area; + pud_p = ppd->pgtable_area; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + ppd->pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); native_set_p4d(p4d_p, p4d); } } - pud_p += pud_index(vaddr); + pud_p += pud_index(ppd->vaddr); if (native_pud_val(*pud_p)) { if (native_pud_val(*pud_p) & _PAGE_PSE) - goto out; + return NULL; pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); } else { pud_t pud; - pmd_p = pgtable_area; + pmd_p = ppd->pgtable_area; memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); - pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + ppd->pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); native_set_pud(pud_p, pud); } - pmd_p += pmd_index(vaddr); + return pmd_p; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) - native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + native_set_pmd(pmd_p, native_make_pmd(ppd->paddr | ppd->pmd_flags)); +} -out: - return pgtable_area; +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pmd_t *pmd_p; + pte_t *pte_p; + + pmd_p = sme_prepare_pgd(ppd); + if (!pmd_p) + return; + + pmd_p += pmd_index(ppd->vaddr); + if (native_pmd_val(*pmd_p)) { + if (native_pmd_val(*pmd_p) & _PAGE_PSE) + return; + + pte_p = (pte_t *)(native_pmd_val(*pmd_p) & ~PTE_FLAGS_MASK); + } else { + pmd_t pmd; + + pte_p = ppd->pgtable_area; + memset(pte_p, 0, sizeof(*pte_p) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte_p) * PTRS_PER_PTE; + + pmd = native_make_pmd((pteval_t)pte_p + PMD_FLAGS); + native_set_pmd(pmd_p, pmd); + } + + pte_p += pte_index(ppd->vaddr); + if (!native_pte_val(*pte_p)) + native_set_pte(pte_p, native_make_pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); } static unsigned long __init sme_pgtable_calc(unsigned long len) { - unsigned long p4d_size, pud_size, pmd_size; + unsigned long p4d_size, pud_size, pmd_size, pte_size; unsigned long total; /* * Perform a relatively simplistic calculation of the pagetable - * entries that are needed. That mappings will be covered by 2MB - * PMD entries so we can conservatively calculate the required + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required * number of P4D, PUD and PMD structures needed to perform the - * mappings. Incrementing the count for each covers the case where - * the addresses cross entries. + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. */ if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; @@ -334,8 +712,9 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) } pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + pte_size = 2 * sizeof(pte_t) * PTRS_PER_PTE; - total = p4d_size + pud_size + pmd_size; + total = p4d_size + pud_size + pmd_size + pte_size; /* * Now calculate the added pagetable structures needed to populate @@ -359,29 +738,29 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return total; } -void __init sme_encrypt_kernel(void) +void __init __nostackprotector sme_encrypt_kernel(struct boot_params *bp) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; - unsigned long paddr, pmd_flags; unsigned long decrypted_base; - void *pgtable_area; - pgd_t *pgd; if (!sme_active()) return; /* - * Prepare for encrypting the kernel by building new pagetables with - * the necessary attributes needed to encrypt the kernel in place. + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. * * One range of virtual addresses will map the memory occupied - * by the kernel as encrypted. + * by the kernel and initrd as encrypted. * * Another range of virtual addresses will map the memory occupied - * by the kernel as decrypted and write-protected. + * by the kernel and initrd as decrypted and write-protected. * * The use of write-protect attribute will prevent any of the * memory from being cached. @@ -392,6 +771,20 @@ void __init sme_encrypt_kernel(void) kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); kernel_len = kernel_end - kernel_start; + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + /* Set the encryption workarea to be immediately after the kernel */ workarea_start = kernel_end; @@ -414,16 +807,21 @@ void __init sme_encrypt_kernel(void) */ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; /* PUDs and PMDs needed in the current pagetables for the workarea */ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); /* * The total workarea includes the executable encryption area and - * the pagetable area. + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. */ workarea_len = execute_len + pgtable_area_len; - workarea_end = workarea_start + workarea_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); /* * Set the address to the start of where newly created pagetable @@ -432,45 +830,30 @@ void __init sme_encrypt_kernel(void) * pagetables and when the new encrypted and decrypted kernel * mappings are populated. */ - pgtable_area = (void *)execute_end; + ppd.pgtable_area = (void *)execute_end; /* * Make sure the current pagetable structure has entries for * addressing the workarea. */ - pgd = (pgd_t *)native_read_cr3_pa(); - paddr = workarea_start; - while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); - - paddr += PMD_PAGE_SIZE; - } + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); /* * A new pagetable structure is being built to allow for the kernel - * to be encrypted. It starts with an empty PGD that will then be - * populated with new PUDs and PMDs as the encrypted and decrypted - * kernel mappings are created. + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. */ - pgd = pgtable_area; - memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); - pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; - - /* Add encrypted kernel (identity) mappings */ - pmd_flags = PMD_FLAGS | _PAGE_ENC; - paddr = kernel_start; - while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + pmd_flags); - - paddr += PMD_PAGE_SIZE; - } + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; /* * A different PGD index/entry must be used to get different @@ -479,47 +862,79 @@ void __init sme_encrypt_kernel(void) * the base of the mapping. */ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } decrypted_base <<= PGDIR_SHIFT; + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + /* Add decrypted, write-protected kernel (non-identity) mappings */ - pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); - paddr = kernel_start; - while (paddr < kernel_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + pmd_flags); - - paddr += PMD_PAGE_SIZE; + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); } /* Add decrypted workarea mappings to both kernel mappings */ - paddr = workarea_start; - while (paddr < workarea_end) { - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr, - paddr + PMD_FLAGS); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); - pgtable_area = sme_populate_pgd(pgd, pgtable_area, - paddr + decrypted_base, - paddr + PMD_FLAGS); - - paddr += PMD_PAGE_SIZE; - } + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); /* Perform the encryption */ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)pgd); + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); /* * At this point we are running encrypted. Remove the mappings for * the decrypted areas - all that is needed for this is to remove * the PGD entry/entries. */ - sme_clear_pgd(pgd, kernel_start + decrypted_base, - kernel_end + decrypted_base); + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } - sme_clear_pgd(pgd, workarea_start + decrypted_base, - workarea_end + decrypted_base); + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); /* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); @@ -529,37 +944,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp) { const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; bool active_by_default; unsigned long me_mask; char buffer[16]; u64 msr; - /* Check for the SME support leaf */ + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); if (eax < 0x8000001f) return; +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + /* + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + /* - * Check for the SME feature: - * CPUID Fn8000_001F[EAX] - Bit 0 - * Secure Memory Encryption support - * CPUID Fn8000_001F[EBX] - Bits 5:0 - * Pagetable bit position used to indicate encryption + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption */ eax = 0x8000001f; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & 1)) + if (!(eax & feature_mask)) return; me_mask = 1UL << (ebx & 0x3f); - /* Check if SME is enabled */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; return; + } /* * Fixups have not been applied to phys_base yet and we're running diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 730e6d541df1..01f682cf77a8 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -22,9 +22,9 @@ ENTRY(sme_encrypt_execute) /* * Entry parameters: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping - * RDX - length of kernel + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - length to encrypt * RCX - virtual address of the encryption workarea, including: * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) @@ -41,9 +41,9 @@ ENTRY(sme_encrypt_execute) addq $PAGE_SIZE, %rax /* Workarea encryption routine */ push %r12 - movq %rdi, %r10 /* Encrypted kernel */ - movq %rsi, %r11 /* Decrypted kernel */ - movq %rdx, %r12 /* Kernel length */ + movq %rdi, %r10 /* Encrypted area */ + movq %rsi, %r11 /* Decrypted area */ + movq %rdx, %r12 /* Area length */ /* Copy encryption routine into the workarea */ movq %rax, %rdi /* Workarea encryption routine */ @@ -52,10 +52,10 @@ ENTRY(sme_encrypt_execute) rep movsb /* Setup registers for call */ - movq %r10, %rdi /* Encrypted kernel */ - movq %r11, %rsi /* Decrypted kernel */ + movq %r10, %rdi /* Encrypted area */ + movq %r11, %rsi /* Decrypted area */ movq %r8, %rdx /* Pagetables used for encryption */ - movq %r12, %rcx /* Kernel length */ + movq %r12, %rcx /* Area length */ movq %rax, %r8 /* Workarea encryption routine */ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ @@ -71,7 +71,7 @@ ENDPROC(sme_encrypt_execute) ENTRY(__enc_copy) /* - * Routine used to encrypt kernel. + * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since * the kernel will be encrypted during the process. So this * routine is defined here and then copied to an area outside @@ -79,19 +79,19 @@ ENTRY(__enc_copy) * during execution. * * On entry the registers must be: - * RDI - virtual address for the encrypted kernel mapping - * RSI - virtual address for the decrypted kernel mapping + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping * RDX - address of the pagetables to use for encryption - * RCX - length of kernel + * RCX - length of area * R8 - intermediate copy buffer * * RAX - points to this routine * - * The kernel will be encrypted by copying from the non-encrypted - * kernel space to an intermediate buffer and then copying from the - * intermediate buffer back to the encrypted kernel space. The physical - * addresses of the two kernel space mappings are the same which - * results in the kernel being encrypted "in place". + * The area will be encrypted by copying from the non-encrypted + * memory space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted memory space. The physical + * addresses of the two mappings are the same which results in the area + * being encrypted "in place". */ /* Enable the new page tables */ mov %rdx, %cr3 @@ -103,47 +103,55 @@ ENTRY(__enc_copy) orq $X86_CR4_PGE, %rdx mov %rdx, %cr4 + push %r15 + push %r12 + + movq %rcx, %r9 /* Save area length */ + movq %rdi, %r10 /* Save encrypted area address */ + movq %rsi, %r11 /* Save decrypted area address */ + /* Set the PAT register PA5 entry to write-protect */ - push %rcx movl $MSR_IA32_CR_PAT, %ecx rdmsr - push %rdx /* Save original PAT value */ + mov %rdx, %r15 /* Save original PAT value */ andl $0xffff00ff, %edx /* Clear PA5 */ orl $0x00000500, %edx /* Set PA5 to WP */ wrmsr - pop %rdx /* RDX contains original PAT value */ - pop %rcx - - movq %rcx, %r9 /* Save kernel length */ - movq %rdi, %r10 /* Save encrypted kernel address */ - movq %rsi, %r11 /* Save decrypted kernel address */ wbinvd /* Invalidate any cache entries */ - /* Copy/encrypt 2MB at a time */ + /* Copy/encrypt up to 2MB at a time */ + movq $PMD_PAGE_SIZE, %r12 1: - movq %r11, %rsi /* Source - decrypted kernel */ + cmpq %r12, %r9 + jnb 2f + movq %r9, %r12 + +2: + movq %r11, %rsi /* Source - decrypted area */ movq %r8, %rdi /* Dest - intermediate copy buffer */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r12, %rcx rep movsb movq %r8, %rsi /* Source - intermediate copy buffer */ - movq %r10, %rdi /* Dest - encrypted kernel */ - movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + movq %r10, %rdi /* Dest - encrypted area */ + movq %r12, %rcx rep movsb - addq $PMD_PAGE_SIZE, %r11 - addq $PMD_PAGE_SIZE, %r10 - subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + addq %r12, %r11 + addq %r12, %r10 + subq %r12, %r9 /* Kernel length decrement */ jnz 1b /* Kernel length not zero? */ /* Restore PAT register */ - push %rdx /* Save original PAT value */ movl $MSR_IA32_CR_PAT, %ecx rdmsr - pop %rdx /* Restore original PAT value */ + mov %r15, %rdx /* Restore original PAT value */ wrmsr + pop %r12 + pop %r15 + ret .L__enc_copy_end: ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index a99679826846..155ecbac9e28 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -33,6 +33,8 @@ #include <linux/compat.h> #include <asm/elf.h> +#include "physaddr.h" + struct va_alignment __read_mostly va_align = { .flags = -1, }; @@ -174,3 +176,63 @@ const char *arch_vma_name(struct vm_area_struct *vma) return "[mpx]"; return NULL; } + +/** + * mmap_address_hint_valid - Validate the address hint of mmap + * @addr: Address hint + * @len: Mapping length + * + * Check whether @addr and @addr + @len result in a valid mapping. + * + * On 32bit this only checks whether @addr + @len is <= TASK_SIZE. + * + * On 64bit with 5-level page tables another sanity check is required + * because mappings requested by mmap(@addr, 0) which cross the 47-bit + * virtual address boundary can cause the following theoretical issue: + * + * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr + * is below the border of the 47-bit address space and @addr + @len is + * above the border. + * + * With 4-level paging this request succeeds, but the resulting mapping + * address will always be within the 47-bit virtual address space, because + * the hint address does not result in a valid mapping and is + * ignored. Hence applications which are not prepared to handle virtual + * addresses above 47-bit work correctly. + * + * With 5-level paging this request would be granted and result in a + * mapping which crosses the border of the 47-bit virtual address + * space. If the application cannot handle addresses above 47-bit this + * will lead to misbehaviour and hard to diagnose failures. + * + * Therefore ignore address hints which would result in a mapping crossing + * the 47-bit virtual address boundary. + * + * Note, that in the same scenario with MAP_FIXED the behaviour is + * different. The request with @addr < 47-bit and @addr + @len > 47-bit + * fails on a 4-level paging machine but succeeds on a 5-level paging + * machine. It is reasonable to expect that an application does not rely on + * the failure of such a fixed mapping request, so the restriction is not + * applied. + */ +bool mmap_address_hint_valid(unsigned long addr, unsigned long len) +{ + if (TASK_SIZE - len < addr) + return false; + + return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW); +} + +/* Can we access it for direct reading/writing? Must be RAM: */ +int valid_phys_addr_range(phys_addr_t addr, size_t count) +{ + return addr + count <= __pa(high_memory); +} + +/* Can we access it through mmap? Must be a valid physical address: */ +int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) +{ + phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT; + + return phys_addr_valid(addr + count - 1); +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 7eb06701a935..e500949bae24 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -13,6 +13,7 @@ #include <linux/sched/sysctl.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/mman.h> #include <asm/mmu_context.h> #include <asm/mpx.h> @@ -61,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len) return addr; } -enum reg_type { - REG_TYPE_RM = 0, - REG_TYPE_INDEX, - REG_TYPE_BASE, -}; - -static int get_reg_offset(struct insn *insn, struct pt_regs *regs, - enum reg_type type) -{ - int regno = 0; - - static const int regoff[] = { - offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), - offsetof(struct pt_regs, dx), - offsetof(struct pt_regs, bx), - offsetof(struct pt_regs, sp), - offsetof(struct pt_regs, bp), - offsetof(struct pt_regs, si), - offsetof(struct pt_regs, di), -#ifdef CONFIG_X86_64 - offsetof(struct pt_regs, r8), - offsetof(struct pt_regs, r9), - offsetof(struct pt_regs, r10), - offsetof(struct pt_regs, r11), - offsetof(struct pt_regs, r12), - offsetof(struct pt_regs, r13), - offsetof(struct pt_regs, r14), - offsetof(struct pt_regs, r15), -#endif - }; - int nr_registers = ARRAY_SIZE(regoff); - /* - * Don't possibly decode a 32-bit instructions as - * reading a 64-bit-only register. - */ - if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64) - nr_registers -= 8; - - switch (type) { - case REG_TYPE_RM: - regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - case REG_TYPE_INDEX: - regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value)) - regno += 8; - break; - - case REG_TYPE_BASE: - regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value)) - regno += 8; - break; - - default: - pr_err("invalid register type"); - BUG(); - break; - } - - if (regno >= nr_registers) { - WARN_ONCE(1, "decoded an instruction with an invalid register"); - return -EINVAL; - } - return regoff[regno]; -} - -/* - * return the address being referenced be instruction - * for rm=3 returning the content of the rm reg - * for rm!=3 calculates the address using SIB and Disp - */ -static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs) -{ - unsigned long addr, base, indx; - int addr_offset, base_offset, indx_offset; - insn_byte_t sib; - - insn_get_modrm(insn); - insn_get_sib(insn); - sib = insn->sib.value; - - if (X86_MODRM_MOD(insn->modrm.value) == 3) { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out_err; - addr = regs_get_register(regs, addr_offset); - } else { - if (insn->sib.nbytes) { - base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE); - if (base_offset < 0) - goto out_err; - - indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX); - if (indx_offset < 0) - goto out_err; - - base = regs_get_register(regs, base_offset); - indx = regs_get_register(regs, indx_offset); - addr = base + indx * (1 << X86_SIB_SCALE(sib)); - } else { - addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM); - if (addr_offset < 0) - goto out_err; - addr = regs_get_register(regs, addr_offset); - } - addr += insn->displacement.value; - } - return (void __user *)addr; -out_err: - return (void __user *)-1; -} - static int mpx_insn_decode(struct insn *insn, struct pt_regs *regs) { @@ -290,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) info->si_signo = SIGSEGV; info->si_errno = 0; info->si_code = SEGV_BNDERR; - info->si_addr = mpx_get_addr_ref(&insn, regs); + info->si_addr = insn_get_addr_ref(&insn, regs); /* * We were not able to extract an address from the instruction, * probably because there was something invalid in it. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dfb7d657cf43..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; @@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) unsigned long start; int ret; - /* Nothing to do if the SME is not active */ - if (!sme_active()) + /* Nothing to do if memory encryption is not active */ + if (!mem_encrypt_active()) return 0; /* Should not be working on unaligned addresses */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 9b7bcbd33cc2..004abf9ebf12 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include <asm/fixmap.h> #include <asm/mtrr.h> -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a1561957dccb..5bfe61a5e8e3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -151,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } +static void sync_current_stack_to_mm(struct mm_struct *mm) +{ + unsigned long sp = current_stack_pointer; + pgd_t *pgd = pgd_offset(mm, sp); + + if (CONFIG_PGTABLE_LEVELS > 4) { + if (unlikely(pgd_none(*pgd))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + + set_pgd(pgd, *pgd_ref); + } + } else { + /* + * "pgd" is faked. The top level entries are "p4d"s, so sync + * the p4d. This compiles to approximately the same code as + * the 5-level case. + */ + p4d_t *p4d = p4d_offset(pgd, sp); + + if (unlikely(p4d_none(*p4d))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + + set_p4d(p4d, *p4d_ref); + } + } +} + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -226,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int index = pgd_index(current_stack_pointer); - pgd_t *pgd = next->pgd + index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[index]); + sync_current_stack_to_mm(next); } /* Stop remote flushes for the previous mm */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index ffdbc4836b4f..174c59774cc9 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -592,7 +592,7 @@ enum __force_cpu_type { static int force_cpu_type; -static int set_cpu_type(const char *str, struct kernel_param *kp) +static int set_cpu_type(const char *str, const struct kernel_param *kp) { if (!strcmp(str, "timer")) { force_cpu_type = timer; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index bb461cfd01ab..526536c81ddc 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -97,7 +97,7 @@ static int __init broadcom_postcore_init(void) * We should get host bridge information from ACPI unless the BIOS * doesn't support it. */ - if (acpi_os_get_root_pointer()) + if (!acpi_disabled && acpi_os_get_root_pointer()) return 0; #endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7a5350d08cef..563049c483a1 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; +#ifdef CONFIG_PHYS_ADDR_T_64BIT + } else if (!strcmp(str, "big_root_window")) { + pci_probe |= PCI_BIG_ROOT_WINDOW; + return NULL; +#endif } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4210da7b44de..54ef19e90705 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -636,3 +636,118 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + +#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8) +#define AMD_141b_MMIO_BASE_RE_MASK BIT(0) +#define AMD_141b_MMIO_BASE_WE_MASK BIT(1) +#define AMD_141b_MMIO_BASE_MMIOBASE_MASK GENMASK(31,8) + +#define AMD_141b_MMIO_LIMIT(x) (0x84 + (x) * 0x8) +#define AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK GENMASK(31,8) + +#define AMD_141b_MMIO_HIGH(x) (0x180 + (x) * 0x4) +#define AMD_141b_MMIO_HIGH_MMIOBASE_MASK GENMASK(7,0) +#define AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT 16 +#define AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK GENMASK(23,16) + +/* + * The PCI Firmware Spec, rev 3.2, notes that ACPI should optionally allow + * configuring host bridge windows using the _PRS and _SRS methods. + * + * But this is rarely implemented, so we manually enable a large 64bit BAR for + * PCIe device on AMD Family 15h (Models 00h-1fh, 30h-3fh, 60h-7fh) Processors + * here. + */ +static void pci_amd_enable_64bit_bar(struct pci_dev *dev) +{ + static const char *name = "PCI Bus 0000:00"; + struct resource *res, *conflict; + u32 base, limit, high; + struct pci_dev *other; + unsigned i; + + if (!(pci_probe & PCI_BIG_ROOT_WINDOW)) + return; + + /* Check that we are the only device of that type */ + other = pci_get_device(dev->vendor, dev->device, NULL); + if (other != dev || + (other = pci_get_device(dev->vendor, dev->device, other))) { + /* This is a multi-socket system, don't touch it for now */ + pci_dev_put(other); + return; + } + + for (i = 0; i < 8; i++) { + pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base); + pci_read_config_dword(dev, AMD_141b_MMIO_HIGH(i), &high); + + /* Is this slot free? */ + if (!(base & (AMD_141b_MMIO_BASE_RE_MASK | + AMD_141b_MMIO_BASE_WE_MASK))) + break; + + base >>= 8; + base |= high << 24; + + /* Abort if a slot already configures a 64bit BAR. */ + if (base > 0x10000) + return; + } + if (i == 8) + return; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return; + + /* + * Allocate a 256GB window directly below the 0xfd00000000 hardware + * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6). + */ + res->name = name; + res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM | + IORESOURCE_MEM_64 | IORESOURCE_WINDOW; + res->start = 0xbd00000000ull; + res->end = 0xfd00000000ull - 1; + + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { + kfree(res); + if (conflict->name != name) + return; + + /* We are resuming from suspend; just reenable the window */ + res = conflict; + } else { + dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n", + res); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + pci_bus_add_resource(dev->bus, res, 0); + } + + base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) | + AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK; + limit = ((res->end + 1) >> 8) & AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK; + high = ((res->start >> 40) & AMD_141b_MMIO_HIGH_MMIOBASE_MASK) | + ((((res->end + 1) >> 40) << AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT) + & AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK); + + pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high); + pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit); + pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); + +#endif diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 1012a5f0f98d..511921045312 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -280,7 +280,7 @@ static void intel_mid_pci_irq_disable(struct pci_dev *dev) } } -static struct pci_ops intel_mid_pci_ops = { +static const struct pci_ops intel_mid_pci_ops __initconst = { .read = pci_read, .write = pci_write, }; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 61975b6bcb1a..2dd15e967c3f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -33,6 +33,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/ucs2_string.h> +#include <linux/mem_encrypt.h> #include <asm/setup.h> #include <asm/page.h> @@ -211,7 +212,7 @@ int __init efi_alloc_page_tables(void) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); if (!efi_pgd) return -ENOMEM; @@ -375,7 +376,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * as trim_bios_range() will reserve the first page and isolate it away * from memory allocators anyway. */ - if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) { + pf = _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + + if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) { pr_err("Failed to create 1:1 mapping for the first page!\n"); return 1; } @@ -418,6 +423,9 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; + if (sev_active()) + flags |= _PAGE_ENC; + pfn = md->phys_addr >> PAGE_SHIFT; if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags)) pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n", @@ -544,6 +552,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m if (!(md->attribute & EFI_MEMORY_RO)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + return efi_update_mappings(md, pf); } @@ -595,6 +606,9 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; + if (sev_active()) + pf |= _PAGE_ENC; + efi_update_mappings(md, pf); } } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 8a99a2e96537..5b513ccffde4 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, /* * Update the first page pointer to skip over the CSH header. */ - cap_info->pages[0] += csh->headersize; + cap_info->phys[0] += csh->headersize; + + /* + * cap_info->capsule should point at a virtual mapping of the entire + * capsule, starting at the capsule header. Our image has the Quark + * security header prepended, so we cannot rely on the default vmap() + * mapping created by the generic capsule code. + * Given that the Quark firmware does not appear to care about the + * virtual mapping, let's just point cap_info->capsule at our copy + * of the capsule header. + */ + cap_info->capsule = &cap_info->header; return 1; } diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index dc036e511f48..5a0483e7bf66 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static const struct bt_sfi_data tng_bt_sfi_data __initdata = { +static struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c index 74283875c7e8..e639e3116acf 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c @@ -62,10 +62,9 @@ static struct platform_device pb_device = { static int __init pb_keys_init(void) { struct gpio_keys_button *gb = gpio_button; - int i, num, good = 0; + int i, good = 0; - num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); - for (i = 0; i < num; i++) { + for (i = 0; i < ARRAY_SIZE(gpio_button); i++) { gb[i].gpio = get_gpio_by_name(gb[i].desc); pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 03fc397335b7..e4cb9f4cde8a 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -127,10 +127,11 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq, * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. */ -static void uv_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +static int uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) { uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); + return 0; } /* diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index c34bd8233f7c..5f64f30873e2 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -905,7 +905,7 @@ static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) /* * UV NMI handler */ -int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) +static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) { struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; int cpu = smp_processor_id(); @@ -1013,7 +1013,7 @@ void uv_nmi_init(void) } /* Setup HUB NMI info */ -void __init uv_nmi_setup_common(bool hubbed) +static void __init uv_nmi_setup_common(bool hubbed) { int size = sizeof(void *) * (1 << NODES_SHIFT); int cpu; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 04d5157fe7f8..a7d966964c6f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -82,12 +82,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ -#ifdef CONFIG_X86_32 store_idt(&ctxt->idt); -#else -/* CONFIG_X86_64 */ - store_idt((struct desc_ptr *)&ctxt->idt_limit); -#endif + /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit @@ -103,22 +99,18 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32 - savesegment(es, ctxt->es); - savesegment(fs, ctxt->fs); +#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); - savesegment(ss, ctxt->ss); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds)); - asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); - asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); - asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); - asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); +#endif +#ifdef CONFIG_X86_64 + savesegment(gs, ctxt->gs); + savesegment(fs, ctxt->fs); + savesegment(ds, ctxt->ds); + savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); - rdmsrl(MSR_GS_BASE, ctxt->gs_base); - rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); @@ -180,6 +172,9 @@ static void fix_processor_context(void) write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ +#else + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ @@ -192,9 +187,12 @@ static void fix_processor_context(void) } /** - * __restore_processor_state - restore the contents of CPU registers saved - * by __save_processor_state() - * @ctxt - structure to load the registers contents from + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + * + * The asm code that gets us here will have restored a usable GDT, although + * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { @@ -217,46 +215,52 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); + /* Restore the IDT. */ + load_idt(&ctxt->idt); + /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). + * Just in case the asm code got us here with the SS, DS, or ES + * out of sync with the GDT, update them. */ -#ifdef CONFIG_X86_32 - load_idt(&ctxt->idt); + loadsegment(ss, __KERNEL_DS); + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + /* + * Restore percpu access. Percpu access can happen in exception + * handlers or in complicated helpers like load_gs_index(). + */ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); #else -/* CONFIG_X86_64 */ - load_idt((const struct desc_ptr *)&ctxt->idt_limit); + loadsegment(fs, __KERNEL_PERCPU); + loadsegment(gs, __KERNEL_STACK_CANARY); #endif + /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ + fix_processor_context(); + /* - * segment registers + * Now that we have descriptor tables fully restored and working + * exception handling, restore the usermode segments. */ -#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_64 + loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); - loadsegment(gs, ctxt->gs); - loadsegment(ss, ctxt->ss); + load_gs_index(ctxt->gs); /* - * sysenter MSRs + * Restore FSBASE and GSBASE after restoring the selectors, since + * restoring the selectors clobbers the bases. Keep in mind + * that MSR_KERNEL_GS_BASE is horribly misnamed. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - enable_sep_cpu(); -#else -/* CONFIG_X86_64 */ - asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); - asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); - asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); - load_gs_index(ctxt->gs); - asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); - wrmsrl(MSR_FS_BASE, ctxt->fs_base); - wrmsrl(MSR_GS_BASE, ctxt->gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); +#elif defined(CONFIG_X86_32_LAZY_GS) + loadsegment(gs, ctxt->gs); #endif - fix_processor_context(); - do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index ed84d3917a59..d10105825d57 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -64,9 +64,10 @@ static void __init setup_real_mode(void) /* * If SME is active, the trampoline area will need to be in * decrypted memory in order to bring up other processors - * successfully. + * successfully. This is not needed for SEV. */ - set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + if (sme_active()) + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); memcpy(base, real_mode_blob, size); diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 30434b8708f2..de58533d3664 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -31,7 +31,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static unsigned long xen_set_apic_id(unsigned int x) +static u32 xen_set_apic_id(unsigned int x) { WARN_ON(1); return x; @@ -57,7 +57,7 @@ static u32 xen_apic_read(u32 reg) return 0; if (reg == APIC_LVR) - return 0x10; + return 0x14; #ifdef CONFIG_X86_32 if (reg == APIC_LDR) return SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); @@ -161,12 +161,10 @@ static struct apic xen_pv_apic = { /* .irq_delivery_mode - used in native_compose_msi_msg only */ /* .irq_dest_mode - used in native_compose_msi_msg only */ - .target_cpus = default_target_cpus, .disable_esr = 0, /* .dest_logical - default_send_IPI_ use it but we use our own. */ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ @@ -179,7 +177,7 @@ static struct apic xen_pv_apic = { .get_apic_id = xen_get_apic_id, .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP .send_IPI_mask = xen_send_IPI_mask, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d669e9d89001..c9081c6671f0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,8 +1,12 @@ +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +#include <linux/bootmem.h> +#endif #include <linux/cpu.h> #include <linux/kexec.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/interface/memory.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +void __init arch_xen_balloon_init(struct resource *hostmem_resource) +{ + struct xen_memory_map memmap; + int rc; + unsigned int i, last_guest_ram; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + struct e820_table *xen_e820_table; + const struct e820_entry *entry; + struct resource *res; + + if (!xen_initial_domain()) + return; + + xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL); + if (!xen_e820_table) + return; + + memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table->entries); + rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap); + if (rc) { + pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc); + goto out; + } + + last_guest_ram = 0; + for (i = 0; i < memmap.nr_entries; i++) { + if (xen_e820_table->entries[i].addr >= max_addr) + break; + if (xen_e820_table->entries[i].type == E820_TYPE_RAM) + last_guest_ram = i; + } + + entry = &xen_e820_table->entries[last_guest_ram]; + if (max_addr >= entry->addr + entry->size) + goto out; /* No unallocated host RAM. */ + + hostmem_resource->start = max_addr; + hostmem_resource->end = entry->addr + entry->size; + + /* + * Mark non-RAM regions between the end of dom0 RAM and end of host RAM + * as unavailable. The rest of that region can be used for hotplug-based + * ballooning. + */ + for (; i < memmap.nr_entries; i++) { + entry = &xen_e820_table->entries[i]; + + if (entry->type == E820_TYPE_RAM) + continue; + + if (entry->addr >= hostmem_resource->end) + break; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + goto out; + + res->name = "Unavailable host RAM"; + res->start = entry->addr; + res->end = (entry->addr + entry->size < hostmem_resource->end) ? + entry->addr + entry->size : hostmem_resource->end; + rc = insert_resource(hostmem_resource, res); + if (rc) { + pr_warn("%s: Can't insert [%llx - %llx) (%d)\n", + __func__, res->start, res->end, rc); + kfree(res); + goto out; + } + } + + out: + kfree(xen_e820_table); +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 754d5391d9fa..826898701045 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,3 +1,4 @@ +#include <linux/acpi.h> #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/memblock.h> @@ -188,8 +189,6 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); - if (xen_pvh_domain()) - machine_ops.emergency_restart = xen_emergency_restart; #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; machine_ops.crash_shutdown = xen_hvm_crash_shutdown; @@ -226,6 +225,26 @@ static uint32_t __init xen_platform_hvm(void) return xen_cpuid_base(); } +static __init void xen_hvm_guest_late_init(void) +{ +#ifdef CONFIG_XEN_PVH + /* Test for PVH domain (PVH boot path taken overrides ACPI flags). */ + if (!xen_pvh && + (x86_platform.legacy.rtc || !x86_platform.legacy.no_vga)) + return; + + /* PVH detected. */ + xen_pvh = true; + + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (!nr_ioapics && acpi_irq_model == ACPI_IRQ_MODEL_PIC) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; + + machine_ops.emergency_restart = xen_emergency_restart; + pv_info.name = "Xen PVH"; +#endif +} + const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { .name = "Xen HVM", .detect = xen_platform_hvm, @@ -233,5 +252,6 @@ const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = { .init.init_platform = xen_hvm_guest_init, .init.x2apic_available = xen_x2apic_para_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, + .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, }; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ae3a071e1d0f..c047f42552e1 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -88,6 +88,8 @@ #include "multicalls.h" #include "pmu.h" +#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ + void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); @@ -622,7 +624,7 @@ static struct trap_array_entry trap_array[] = { { simd_coprocessor_error, xen_simd_coprocessor_error, false }, }; -static bool get_trap_addr(void **addr, unsigned int ist) +static bool __ref get_trap_addr(void **addr, unsigned int ist) { unsigned int nr; bool ist_okay = false; @@ -644,6 +646,14 @@ static bool get_trap_addr(void **addr, unsigned int ist) } } + if (nr == ARRAY_SIZE(trap_array) && + *addr >= (void *)early_idt_handler_array[0] && + *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) { + nr = (*addr - (void *)early_idt_handler_array[0]) / + EARLY_IDT_HANDLER_SIZE; + *addr = (void *)xen_early_idt_handler_array[nr]; + } + if (WARN_ON(ist != 0 && !ist_okay)) return false; @@ -1230,6 +1240,7 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_platform.get_nmi_reason = xen_get_nmi_reason; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.irqs.intr_mode_init = x86_init_noop; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; @@ -1249,6 +1260,7 @@ asmlinkage __visible void __init xen_start_kernel(void) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); x86_configure_nx(); /* Get mfn list */ @@ -1261,6 +1273,21 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_setup_gdt(0); xen_init_irq_ops(); + + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because idt_setup_early_handler needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); + + idt_setup_early_handler(); + xen_init_capabilities(); #ifdef CONFIG_X86_LOCAL_APIC @@ -1294,18 +1321,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Let's presume PV guests always boot on vCPU with id 0. */ - per_cpu(xen_vcpu_id, 0) = 0; - - /* - * Setup xen_vcpu early because start_kernel needs it for - * local_irq_disable(), irqs_disabled(). - * - * Don't do the full vcpu_info placement stuff until we have - * the cpu_possible_mask and a non-dummy shared_info. - */ - xen_vcpu_info_reset(0); - WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 7bd3ee08393e..436c4f003e17 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -25,13 +25,6 @@ struct boot_params pvh_bootparams __attribute__((section(".data"))); struct hvm_start_info pvh_start_info; unsigned int pvh_start_info_sz = sizeof(pvh_start_info); -static void xen_pvh_arch_setup(void) -{ - /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ - if (nr_ioapics == 0) - acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; -} - static void __init init_pvh_bootparams(void) { struct xen_memory_map memmap; @@ -102,6 +95,4 @@ void __init xen_prepare_pvh(void) wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); init_pvh_bootparams(); - - x86_init.oem.arch_setup = xen_pvh_arch_setup; } diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 809b6c812654..92ccc718152d 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -49,7 +49,7 @@ static struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area; +} gnttab_shared_vm_area, gnttab_status_vm_area; int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, @@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return 0; } +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + grant_status_t *shared = *__shared; + unsigned long addr; + unsigned long i; + + if (shared == NULL) + *__shared = shared = gnttab_status_vm_area.area->addr; + + addr = (unsigned long)shared; + + for (i = 0; i < nr_gframes; i++) { + set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); + addr += PAGE_SIZE; + } + + return 0; +} + void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { + pte_t **ptes; unsigned long addr; unsigned long i; + if (shared == gnttab_status_vm_area.area->addr) + ptes = gnttab_status_vm_area.ptes; + else + ptes = gnttab_shared_vm_area.ptes; + addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], - __pte(0)); + set_pte_at(&init_mm, addr, ptes[i], __pte(0)); addr += PAGE_SIZE; } } @@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) return 0; } -int arch_gnttab_init(unsigned long nr_shared) +static void arch_gnttab_vfree(struct gnttab_vm_area *area) { + free_vm_area(area->area); + kfree(area->ptes); +} + +int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +{ + int ret; + if (!xen_pv_domain()) return 0; - return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + if (ret < 0) + return ret; + + /* + * Always allocate the space for the status frames in case + * we're migrated to a host with V2 support. + */ + ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status); + if (ret < 0) + goto err; + + return 0; +err: + arch_gnttab_vfree(&gnttab_shared_vm_area); + return -ENOMEM; } #ifdef CONFIG_XEN_PVH diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3e15345abfe7..d33e7dbe3129 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -EOPNOTSUPP; + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); @@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, + prot, domid, pages); + /* We BUG_ON because it's a programmer error to pass a NULL err_ptr, * and the consequences later is quite hard to detect what the actual * cause of "wrong memory was mapped in". @@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, - int numpgs, struct page **pages) + int nr, struct page **pages) { - if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return xen_xlate_unmap_gfn_range(vma, nr, pages); + + if (!pages) return 0; return -EINVAL; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index a0e2b8c6e5c7..d85076223a69 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); pteval_t flags = val & PTE_FLAGS_MASK; @@ -1325,20 +1325,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, { struct { struct mmuext_op op; -#ifdef CONFIG_SMP - DECLARE_BITMAP(mask, num_processors); -#else DECLARE_BITMAP(mask, NR_CPUS); -#endif } *args; struct multicall_space mcs; + const size_t mc_entry_size = sizeof(args->op) + + sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ - mcs = xen_mc_entry(sizeof(*args)); + mcs = xen_mc_entry(mc_entry_size); args = mcs.args; args->op.arg2.vcpumask = to_cpumask(args->mask); @@ -1721,7 +1719,7 @@ static unsigned long __init m2p(phys_addr_t maddr) { phys_addr_t paddr; - maddr &= PTE_PFN_MASK; + maddr &= XEN_PTE_MFN_MASK; paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; return paddr; @@ -1902,6 +1900,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* + * Zap execute permission from the ident map. Due to the sharing of + * L1 entries we need to do this in the L2. + */ + if (__supported_pte_mask & _PAGE_NX) { + for (i = 0; i < PTRS_PER_PMD; ++i) { + if (pmd_none(level2_ident_pgt[i])) + continue; + level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX); + } + } + /* Copy the initial P->M table mappings if necessary. */ i = pgd_index(xen_start_info->mfn_list); if (i && i < pgd_index(__START_KERNEL_map)) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 6083ba462f35..13b4f19b9131 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -547,7 +547,7 @@ int xen_alloc_p2m_entry(unsigned long pfn) if (p2m_top_mfn && pfn < MAX_P2M_PFN) { topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); + mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c114ca767b3b..6e0d2086eacb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -808,7 +808,6 @@ char * __init xen_memory_setup(void) addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; while (i < xen_e820_table.nr_entries) { - bool discard = false; chunk_size = size; type = xen_e820_table.entries[i].type; @@ -824,11 +823,10 @@ char * __init xen_memory_setup(void) xen_add_extra_mem(pfn_s, n_pfns); xen_max_p2m_pfn = pfn_s + n_pfns; } else - discard = true; + type = E820_TYPE_UNUSABLE; } - if (!discard) - xen_align_and_add_e820_region(addr, chunk_size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); addr += chunk_size; size -= chunk_size; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 08324c64005d..02f3445a2b5f 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <asm/paravirt.h> +#include <asm/qspinlock.h> #include <xen/interface/xen.h> #include <xen/events.h> @@ -81,8 +82,11 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; - if (!xen_pvspin) + if (!xen_pvspin) { + if (cpu == 0) + static_branch_disable(&virt_spin_lock_key); return; + } WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 92bf5ecb6baf..d9f96cc5d743 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -17,6 +17,8 @@ void xen_arch_pre_suspend(void) { + xen_save_time_memory_area(); + if (xen_pv_domain()) xen_pv_pre_suspend(); } @@ -27,6 +29,8 @@ void xen_arch_post_suspend(int cancelled) xen_pv_post_suspend(cancelled); else xen_hvm_post_suspend(cancelled); + + xen_restore_time_memory_area(); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 80c2a4bdf230..29163c43ebbd 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -75,7 +75,7 @@ static void xen_get_wallclock(struct timespec *now) static int xen_set_wallclock(const struct timespec *now) { - return -1; + return -ENODEV; } static int xen_pvclock_gtod_notify(struct notifier_block *nb, @@ -371,8 +371,95 @@ static const struct pv_time_ops xen_time_ops __initconst = { .steal_clock = xen_steal_clock, }; +static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; + +void xen_save_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = NULL; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret != 0) + pr_notice("Cannot save secondary vcpu_time_info (err %d)", + ret); + else + clear_page(xen_clock); +} + +void xen_restore_time_memory_area(void) +{ + struct vcpu_register_time_memory_area t; + int ret; + + if (!xen_clock) + return; + + t.addr.v = &xen_clock->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + + /* + * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the + * secondary time info with Xen or if we migrated to a host without the + * necessary flags. On both of these cases what happens is either + * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT + * bit set. Userspace checks the latter and if 0, it discards the data + * in pvti and fallbacks to a system call for a reliable timestamp. + */ + if (ret != 0) + pr_notice("Cannot restore secondary vcpu_time_info (err %d)", + ret); +} + +static void xen_setup_vsyscall_time_info(void) +{ + struct vcpu_register_time_memory_area t; + struct pvclock_vsyscall_time_info *ti; + int ret; + + ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL); + if (!ti) + return; + + t.addr.v = &ti->pvti; + + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t); + if (ret) { + pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret); + free_page((unsigned long)ti); + return; + } + + /* + * If primary time info had this bit set, secondary should too since + * it's the same data on both just different memory regions. But we + * still check it in case hypervisor is buggy. + */ + if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) { + t.addr.v = NULL; + ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, + 0, &t); + if (!ret) + free_page((unsigned long)ti); + + pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n"); + return; + } + + xen_clock = ti; + pvclock_set_pvti_cpu0_va(xen_clock); + + xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK; +} + static void __init xen_time_init(void) { + struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); struct timespec tp; @@ -396,6 +483,16 @@ static void __init xen_time_init(void) setup_force_cpu_cap(X86_FEATURE_TSC); + /* + * We check ahead on the primary time info if this + * bit is supported hence speeding up Xen clocksource. + */ + pvti = &__this_cpu_read(xen_vcpu)->time; + if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) { + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + xen_setup_vsyscall_time_info(); + } + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 8a10c9a9e2b5..417b339e5c8e 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,7 @@ #include <xen/interface/xen.h> +#include <linux/init.h> #include <linux/linkage.h> .macro xen_pv_trap name @@ -54,6 +55,19 @@ xen_pv_trap entry_INT80_compat #endif xen_pv_trap hypervisor_callback + __INIT +ENTRY(xen_early_idt_handler_array) + i = 0 + .rept NUM_EXCEPTION_VECTORS + pop %rcx + pop %r11 + jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE + i = i + 1 + .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc + .endr +END(xen_early_idt_handler_array) + __FINIT + hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f377e1820c6c..3b34745d0a52 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -70,7 +70,9 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); u64 xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -void __init xen_init_time_ops(void); +void xen_save_time_memory_area(void); +void xen_restore_time_memory_area(void); +void __ref xen_init_time_ops(void); void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); |