Diffstat (limited to 'arch/x86/include')
203 files changed, 6709 insertions, 3944 deletions
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4f1ce5fc4e19..6c23d1661b17 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,5 +10,5 @@ generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h generic-y += early_ioremap.h -generic-y += export.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2888c0ee4df0..5ab1a4598d00 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -6,7 +6,7 @@ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> */ -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <asm/numa.h> #include <asm/fixmap.h> @@ -16,6 +16,9 @@ #include <asm/x86_init.h> #include <asm/cpufeature.h> #include <asm/irq_vectors.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -53,6 +56,8 @@ static inline void disable_acpi(void) extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); +extern int acpi_blacklisted(void); + static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline void acpi_disable_pci(void) { @@ -73,6 +78,13 @@ static inline bool acpi_skip_set_wakeup_address(void) #define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address +union acpi_subtable_headers; + +int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end); + +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa); + /* * Check if the CPU can handle C2 and deeper */ @@ -102,23 +114,42 @@ static inline bool arch_has_acpi_pdc(void) c->x86_vendor == X86_VENDOR_CENTAUR); } -static inline void arch_acpi_set_pdc_bits(u32 *buf) +static inline void arch_acpi_set_proc_cap_bits(u32 *cap) { struct cpuinfo_x86 *c = &cpu_data(0); - buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; + *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP; + + /* Enable coordination with firmware's _TSD info */ + *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; + *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP; if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; + *cap |= ACPI_PROC_CAP_T_FFH; + + if (cpu_has(c, X86_FEATURE_HWP)) + *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF; /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + * If mwait/monitor is unsupported, C_C1_FFH and + * C2/C3_FFH will be disabled. */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + if (!cpu_has(c, X86_FEATURE_MWAIT) || + boot_option_idle_override == IDLE_NOMWAIT) + *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. + */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) @@ -143,6 +174,14 @@ void acpi_generic_reduced_hw_init(void); void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); +#ifdef CONFIG_XEN_PV +/* A Xen PV domain needs a special acpi_os_ioremap() handling. 
*/ +extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, + acpi_size size); +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +#define acpi_os_ioremap acpi_os_ioremap +#endif + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9c4da699e11a..ca9ae606aab9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -10,6 +10,9 @@ #define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) #ifndef __ASSEMBLY__ @@ -58,7 +61,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE \ "999:\n\t" \ ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" /* @@ -86,6 +89,8 @@ struct alt_instr { u8 replacementlen; /* length of new instruction */ } __packed; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -101,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); struct module; -struct paravirt_patch_site; struct callthunk_sites { s32 *call_start, *call_end; - struct paravirt_patch_site *pv_start, *pv_end; + struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS @@ -113,7 +117,7 @@ extern void callthunks_patch_builtin_calls(void); extern void callthunks_patch_module_calls(struct callthunk_sites *sites, struct module *mod); extern void *callthunks_translate_call_dest(void *dest); -extern int x86_call_depth_emit_accounting(u8 **pprog, void *func); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip); #else static __always_inline void callthunks_patch_builtin_calls(void) {} static __always_inline void @@ -124,7 +128,7 @@ static __always_inline void *callthunks_translate_call_dest(void *dest) return dest; } static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, - void *func) + void *func, void *ip) { return 0; } @@ -150,102 +154,52 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define b_replacement(num) "664"#num -#define e_replacement(num) "665"#num +#define ALT_CALL_INSTR "call BUG_func" -#define alt_end_marker "663" -#define alt_slen "662b-661b" -#define alt_total_slen alt_end_marker"b-661b" -#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" +#define alt_slen "772b-771b" +#define alt_total_slen "773b-771b" +#define alt_rlen "775f-774f" -#define OLDINSTR(oldinstr, num) \ - "# ALT: oldnstr\n" \ - "661:\n\t" oldinstr "\n662:\n" \ +#define OLDINSTR(oldinstr) \ + "# ALT: oldinstr\n" \ + "771:\n\t" oldinstr "\n772:\n" \ "# ALT: padding\n" \ - ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ - "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" \ - alt_end_marker ":\n" - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. 
- */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" - -/* - * Pad the second replacement alternative with additional NOPs if it is - * additionally longer than the first replacement alternative. - */ -#define OLDINSTR_2(oldinstr, num1, num2) \ - "# ALT: oldinstr2\n" \ - "661:\n\t" oldinstr "\n662:\n" \ - "# ALT: padding2\n" \ - ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ - "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ - alt_end_marker ":\n" - -#define OLDINSTR_3(oldinsn, n1, n2, n3) \ - "# ALT: oldinstr3\n" \ - "661:\n\t" oldinsn "\n662:\n" \ - "# ALT: padding3\n" \ - ".skip -((" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \ - " - (" alt_slen ")) > 0) * " \ - "(" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \ - " - (" alt_slen ")), 0x90\n" \ - alt_end_marker ":\n" - -#define ALTINSTR_ENTRY(ft_flags, num) \ - " .long 661b - .\n" /* label */ \ - " .long " b_replacement(num)"f - .\n" /* new instruction */ \ + ".skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen ")-(" alt_slen ")),0x90\n" \ + "773:\n" + +#define ALTINSTR_ENTRY(ft_flags) \ + ".pushsection .altinstructions,\"a\"\n" \ + " .long 771b - .\n" /* label */ \ + " .long 774f - .\n" /* new instruction */ \ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \ " .byte " alt_total_slen "\n" /* source len */ \ - " .byte " alt_rlen(num) "\n" /* replacement len */ + " .byte " alt_rlen "\n" /* replacement len */ \ + ".popsection\n" -#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ - "# ALT: replacement " #num "\n" \ - b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n" +#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + "# ALT: replacement\n" \ + "774:\n\t" newinstr "\n775:\n" \ + ".popsection\n" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, ft_flags) \ - OLDINSTR(oldinstr, 1) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(ft_flags, 1) \ - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, 1) \ - ".popsection\n" + OLDINSTR(oldinstr) \ + ALTINSTR_ENTRY(ft_flags) \ + ALTINSTR_REPLACEMENT(newinstr) #define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \ - OLDINSTR_2(oldinstr, 1, 2) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(ft_flags1, 1) \ - ALTINSTR_ENTRY(ft_flags2, 2) \ - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, 2) \ - ".popsection\n" + ALTERNATIVE(ALTERNATIVE(oldinstr, newinstr1, ft_flags1), newinstr2, ft_flags2) /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. 
*/ #define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \ - ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ - newinstr_yes, ft_flags) - -#define ALTERNATIVE_3(oldinsn, newinsn1, ft_flags1, newinsn2, ft_flags2, \ - newinsn3, ft_flags3) \ - OLDINSTR_3(oldinsn, 1, 2, 3) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(ft_flags1, 1) \ - ALTINSTR_ENTRY(ft_flags2, 2) \ - ALTINSTR_ENTRY(ft_flags3, 3) \ - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinsn1, 1) \ - ALTINSTR_REPLACEMENT(newinsn2, 2) \ - ALTINSTR_REPLACEMENT(newinsn3, 3) \ - ".popsection\n" + ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, newinstr_yes, ft_flags) + +#define ALTERNATIVE_3(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, \ + newinstr3, ft_flags3) \ + ALTERNATIVE(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2), \ + newinstr3, ft_flags3) /* * Alternative instructions for different CPU types or capabilities. @@ -260,14 +214,11 @@ static inline int alternatives_text_reserved(void *start, void *end) * without volatile and memory clobber. */ #define alternative(oldinstr, newinstr, ft_flags) \ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory") + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory") #define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory") -#define alternative_ternary(oldinstr, ft_flags, newinstr_yes, newinstr_no) \ - asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) ::: "memory") - /* * Alternative inline assembly with input. * @@ -277,32 +228,28 @@ static inline int alternatives_text_reserved(void *start, void *end) * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, ft_flags, input...) \ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \ - : : "i" (0), ## input) - -/* - * This is similar to alternative_input. But it has two features and - * respective instructions. - * - * If CPU has feature2, newinstr2 is used. - * Otherwise, if CPU has feature1, newinstr1 is used. - * Otherwise, oldinstr is used. - */ -#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \ - ft_flags2, input...) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \ - newinstr2, ft_flags2) \ + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \ : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \ + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \ : output : "i" (0), ## input) -/* Like alternative_io, but for replacing a direct call with another one. */ -#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ - asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \ - : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) +/* + * Like alternative_io, but for replacing a direct call with another one. + * + * Use the %c operand modifier which is the generic way to print a bare + * constant expression with all syntax-specific punctuation omitted. 
%P + * is the x86-specific variant which can handle constants too, for + * historical reasons, but it should be used primarily for PIC + * references: i.e., if used for a function, it would add the PLT + * suffix. + */ +#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ + asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ + : ALT_OUTPUT_SP(output) \ + : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) /* * Like alternative_call, but there are two features and respective functions. @@ -310,12 +257,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, if CPU has feature1, function1 is used. * Otherwise, old function is used. */ -#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ - output, input...) \ - asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\ - "call %P[new2]", ft_flags2) \ - : output, ASM_CALL_CONSTRAINT \ - : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ +#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ + output, input...) \ + asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ + "call %c[new2]", ft_flags2) \ + : ALT_OUTPUT_SP(output) \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input) /* @@ -330,6 +277,24 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +#define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__ + +/* Macro for creating assembler functions avoiding any C magic. */ +#define DEFINE_ASM_FUNC(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + +void BUG_func(void); +void nop_func(void); + #else /* __ASSEMBLY__ */ #ifdef CONFIG_SMP @@ -352,7 +317,7 @@ static inline int alternatives_text_reserved(void *start, void *end) .macro ANNOTATE_IGNORE_ALTERNATIVE .Lannotate_\@: .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . + .long .Lannotate_\@ .popsection .endm @@ -370,28 +335,33 @@ static inline int alternatives_text_reserved(void *start, void *end) .byte \alt_len .endm +.macro ALT_CALL_INSTR + call BUG_func +.endm + /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with * @newinstr. ".skip" directive takes care of proper instruction padding * in case @newinstr is longer than @oldinstr. 
*/ -.macro ALTERNATIVE oldinstr, newinstr, ft_flags -140: - \oldinstr -141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstr_entry 140b,143f,\ft_flags,142b-140b,144f-143f - .popsection +#define __ALTERNATIVE(oldinst, newinst, flag) \ +740: \ + oldinst ; \ +741: \ + .skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\ +742: \ + .pushsection .altinstructions,"a" ; \ + altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \ + .popsection ; \ + .pushsection .altinstr_replacement,"ax" ; \ +743: \ + newinst ; \ +744: \ + .popsection ; - .pushsection .altinstr_replacement,"ax" -143: - \newinstr -144: - .popsection +.macro ALTERNATIVE oldinstr, newinstr, ft_flags + __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm #define old_len 141b-140b @@ -400,65 +370,18 @@ static inline int alternatives_text_reserved(void *start, void *end) #define new_len3 146f-145f /* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) -#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c)) - - -/* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has * @feature2, it replaces @oldinstr with @feature2. */ .macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2 -140: - \oldinstr -141: - .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_2(new_len1, new_len2) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f - altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - .popsection + __ALTERNATIVE(__ALTERNATIVE(\oldinstr, \newinstr1, \ft_flags1), + \newinstr2, \ft_flags2) .endm .macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3 -140: - \oldinstr -141: - .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \ - (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f - altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f - altinstr_entry 140b,145f,\ft_flags3,142b-140b,146f-145f - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - \newinstr3 -146: - .popsection + __ALTERNATIVE(ALTERNATIVE_2(\oldinstr, \newinstr1, \ft_flags1, \newinstr2, \ft_flags2), + \newinstr3, \ft_flags3) .endm /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. 
*/ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index ed0eaf65c437..6f3b6aef47ba 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -21,8 +21,8 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, unsigned long); -extern int amd_smn_read(u16 node, u32 address, u32 *value); -extern int amd_smn_write(u16 node, u32 address, u32 value); +int __must_check amd_smn_read(u16 node, u32 address, u32 *value); +int __must_check amd_smn_write(u16 node, u32 address, u32 value); struct amd_l3_cache { unsigned indices; @@ -104,7 +104,7 @@ static inline bool amd_gart_present(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return false; - /* GART present only on Fam15h, upto model 0fh */ + /* GART present only on Fam15h, up to model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) return true; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98c32aa5963a..f21ff1932699 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -3,6 +3,7 @@ #define _ASM_X86_APIC_H #include <linux/cpumask.h> +#include <linux/static_call.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -12,9 +13,16 @@ #include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> +#include <asm/io.h> +#include <asm/posted_intr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Debugging macros */ @@ -22,41 +30,41 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 -/* Macros for apic_extnmi which controls external NMI masking */ -#define APIC_EXTNMI_BSP 0 /* Default */ -#define APIC_EXTNMI_ALL 1 -#define APIC_EXTNMI_NONE 2 - /* - * Define the default level of output to be very little - * This can be turned up by using apic=verbose for more - * information and apic=debug for _lots_ of information. - * apic_verbosity is defined in apic.c + * Define the default level of output to be very little This can be turned + * up by using apic=verbose for more information and apic=debug for _lots_ + * of information. apic_verbosity is defined in apic.c */ -#define apic_printk(v, s, a...) do { \ - if ((v) <= apic_verbosity) \ - printk(s, ##a); \ - } while (0) - +#define apic_printk(v, s, a...) \ +do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ +} while (0) + +#define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a) +#define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a) +#define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a) +/* Unconditional debug prints for code which is guarded by apic_verbosity already */ +#define apic_dbg(s, a...) 
printk(KERN_DEBUG s, ##a) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -extern void generic_apic_probe(void); +extern void x86_32_probe_apic(void); #else -static inline void generic_apic_probe(void) -{ -} +static inline void x86_32_probe_apic(void) { } #endif +extern u32 cpuid_to_apicid[]; + +#define CPU_ACPIID_INVALID U32_MAX + #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; extern int local_apic_timer_c2_ok; -extern int disable_apic; +extern bool apic_is_disabled; extern unsigned int lapic_timer_period; -extern int cpuid_to_apicid[]; - extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { APIC_PIC, @@ -66,20 +74,6 @@ enum apic_intr_mode_id { APIC_SYMMETRIC_IO_NO_ROUTING }; -#ifdef CONFIG_SMP -extern void __inquire_remote_apic(int apicid); -#else /* CONFIG_SMP */ -static inline void __inquire_remote_apic(int apicid) -{ -} -#endif /* CONFIG_SMP */ - -static inline void default_inquire_remote_apic(int apicid) -{ - if (apic_verbosity >= APIC_DEBUG) - __inquire_remote_apic(apicid); -} - /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with @@ -90,7 +84,7 @@ static inline void default_inquire_remote_apic(int apicid) */ static inline bool apic_from_smp_config(void) { - return smp_found_config && !disable_apic; + return smp_found_config && !apic_is_disabled; } /* @@ -104,18 +98,21 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, + alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); +} + +static inline void native_apic_mem_eoi(void) +{ + native_apic_mem_write(APIC_EOI, APIC_EOI_ACK); } -extern void native_apic_wait_icr_idle(void); -extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); @@ -130,8 +127,6 @@ static inline bool apic_is_x2apic_enabled(void) extern void enable_IR_x2apic(void); -extern int get_physical_broadcast(void); - extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); @@ -149,12 +144,12 @@ extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) +static inline bool apic_force_enable(unsigned long addr) { - return -1; + return false; } #else -extern int apic_force_enable(unsigned long addr); +extern bool apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); @@ -181,6 +176,14 @@ extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); +extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); +extern void topology_register_boot_apic(u32 apic_id); +extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); +extern void topology_hotunplug_apic(unsigned int cpu); +extern void topology_apply_cmdline_limits_early(void); +extern void topology_init_possible_cpus(void); +extern void topology_reset_possible_cpus_up(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -195,6 +198,8 @@ static inline void apic_intr_mode_init(void) { } static 
inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool apic_needs_pit(void) { return true; } +static inline void topology_apply_cmdline_limits_early(void) { } +static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -207,7 +212,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } -static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +static inline void native_apic_msr_eoi(void) { __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } @@ -223,18 +228,6 @@ static inline u32 native_apic_msr_read(u32 reg) return (u32)msr; } -static inline void native_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static inline u32 native_safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - static inline void native_x2apic_icr_write(u32 low, u32 id) { wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); @@ -261,7 +254,7 @@ static inline int x2apic_enabled(void) #else /* !CONFIG_X86_X2APIC */ static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } - +static inline u32 native_apic_msr_read(u32 reg) { BUG(); } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ @@ -280,8 +273,8 @@ struct irq_data; */ struct apic { /* Hotpath functions first */ - void (*eoi_write)(u32 reg, u32 v); - void (*native_eoi_write)(u32 reg, u32 v); + void (*eoi)(void); + void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); @@ -296,10 +289,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - u32 disable_esr; - - enum apic_delivery_modes delivery_mode; - bool dest_mode_logical; + u32 disable_esr : 1, + dest_mode_logical : 1, + x2apic_set_max_apicid : 1, + nmi_to_offline_cpu : 1; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -307,47 +300,43 @@ struct apic { u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); + /* The limit of the APIC ID space. */ + u32 max_apic_id; + /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(u32 apicid); - int (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, int apicid); void (*init_apic_ldr)(void); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - void (*setup_apic_routing)(void); - int (*cpu_present_to_apicid)(int mps_cpu); - void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - int (*check_phys_apicid_present)(int phys_apicid); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); + u32 (*cpu_present_to_apicid)(int mps_cpu); - u32 (*get_apic_id)(unsigned long x); - u32 (*set_apic_id)(unsigned int id); + u32 (*get_apic_id)(u32 id); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); /* wakeup secondary CPU using 64-bit wakeup point */ - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); - - void (*inquire_remote_apic)(int apicid); - -#ifdef CONFIG_X86_32 - /* - * Called very early during boot from get_smp_config(). It should - * return the logical apicid. x86_[bios]_cpu_to_apicid is - * initialized before this function is called. 
- * - * If logical apicid can't be determined that early, the function - * may return BAD_APICID. Logical apicid will be configured after - * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity - * won't be applied properly during early boot in this case. - */ - int (*x86_32_early_logical_apicid)(int cpu); -#endif + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); + char *name; }; +struct apic_override { + void (*eoi)(void); + void (*native_eoi)(void); + void (*write)(u32 reg, u32 v); + u32 (*read)(u32 reg); + void (*send_IPI)(int cpu, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip); +}; + /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an @@ -359,20 +348,12 @@ extern struct apic *apic; * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. - * - * For the files having two apic drivers, we use apic_drivers() - * to enforce the order with in them. */ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } -#define apic_drivers(sym1, sym2) \ - static struct apic *__apicdrivers_##sym1##sym2[2] __used \ - __aligned(sizeof(struct apic *)) \ - __section(".apicdrivers") = { &sym1, &sym2 } - extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* @@ -383,43 +364,111 @@ extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC +extern struct apic_override __x86_apic_override; + +void __init apic_setup_apic_calls(void); +void __init apic_install_driver(struct apic *driver); + +#define apic_update_callback(_callback, _fn) { \ + __x86_apic_override._callback = _fn; \ + apic->_callback = _fn; \ + static_call_update(apic_call_##_callback, _fn); \ + pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \ +} -static inline u32 apic_read(u32 reg) +#define DECLARE_APIC_CALL(__cb) \ + DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb) + +DECLARE_APIC_CALL(eoi); +DECLARE_APIC_CALL(native_eoi); +DECLARE_APIC_CALL(icr_read); +DECLARE_APIC_CALL(icr_write); +DECLARE_APIC_CALL(read); +DECLARE_APIC_CALL(send_IPI); +DECLARE_APIC_CALL(send_IPI_mask); +DECLARE_APIC_CALL(send_IPI_mask_allbutself); +DECLARE_APIC_CALL(send_IPI_allbutself); +DECLARE_APIC_CALL(send_IPI_all); +DECLARE_APIC_CALL(send_IPI_self); +DECLARE_APIC_CALL(wait_icr_idle); +DECLARE_APIC_CALL(wakeup_secondary_cpu); +DECLARE_APIC_CALL(wakeup_secondary_cpu_64); +DECLARE_APIC_CALL(write); + +static __always_inline u32 apic_read(u32 reg) { - return apic->read(reg); + return static_call(apic_call_read)(reg); } -static inline void apic_write(u32 reg, u32 val) +static __always_inline void apic_write(u32 reg, u32 val) { - apic->write(reg, val); + static_call(apic_call_write)(reg, val); } -static inline void apic_eoi(void) +static __always_inline void apic_eoi(void) { - apic->eoi_write(APIC_EOI, APIC_EOI_ACK); + static_call(apic_call_eoi)(); } -static inline u64 
apic_icr_read(void) +static __always_inline void apic_native_eoi(void) { - return apic->icr_read(); + static_call(apic_call_native_eoi)(); } -static inline void apic_icr_write(u32 low, u32 high) +static __always_inline u64 apic_icr_read(void) { - apic->icr_write(low, high); + return static_call(apic_call_icr_read)(); } -static inline void apic_wait_icr_idle(void) +static __always_inline void apic_icr_write(u32 low, u32 high) { - apic->wait_icr_idle(); + static_call(apic_call_icr_write)(low, high); } -static inline u32 safe_apic_wait_icr_idle(void) +static __always_inline void __apic_send_IPI(int cpu, int vector) { - return apic->safe_wait_icr_idle(); + static_call(apic_call_send_IPI)(cpu, vector); } -extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); +static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + static_call_mod(apic_call_send_IPI_mask)(mask, vector); +} + +static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + static_call(apic_call_send_IPI_mask_allbutself)(mask, vector); +} + +static __always_inline void __apic_send_IPI_allbutself(int vector) +{ + static_call(apic_call_send_IPI_allbutself)(vector); +} + +static __always_inline void __apic_send_IPI_all(int vector) +{ + static_call(apic_call_send_IPI_all)(vector); +} + +static __always_inline void __apic_send_IPI_self(int vector) +{ + static_call_mod(apic_call_send_IPI_self)(vector); +} + +static __always_inline void apic_wait_icr_idle(void) +{ + static_call_cond(apic_call_wait_icr_idle)(); +} + +static __always_inline u32 safe_apic_wait_icr_idle(void) +{ + return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0; +} + +static __always_inline bool apic_id_valid(u32 apic_id) +{ + return apic_id <= apic->max_apic_id; +} #else /* CONFIG_X86_LOCAL_APIC */ @@ -430,22 +479,15 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } -static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} +static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } +static inline void apic_setup_apic_calls(void) { } + +#define apic_update_callback(_callback, _fn) do { } while (0) #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); -static inline void ack_APIC_irq(void) -{ - /* - * ack_APIC_irq() actually gets compiled as a single instruction - * ... yummie. 
- */ - apic_eoi(); -} - - static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -453,14 +495,9 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } -static inline unsigned default_get_apic_id(unsigned long x) +static inline bool is_vector_pending(unsigned int vector) { - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - - if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) - return (x >> 24) & 0xFF; - else - return (x >> 24) & 0x0F; + return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } /* @@ -469,43 +506,40 @@ static inline unsigned default_get_apic_id(unsigned long x) #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 -extern void generic_bigsmp_probe(void); - #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> -#define APIC_DFR_VALUE (APIC_DFR_FLAT) - -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); - extern struct apic apic_noop; -static inline unsigned int read_apic_id(void) +static inline u32 read_apic_id(void) { - unsigned int reg = apic_read(APIC_ID); + u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } #ifdef CONFIG_X86_64 typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); -extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); -#endif - -extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); -extern void default_setup_apic_routing(void); +extern void x86_64_probe_apic(void); +#else +static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } +static inline void x86_64_probe_apic(void) { } +#endif extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern bool default_check_apicid_used(physid_mask_t *map, int apicid); -extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); -extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int phys_apicid); +extern u32 default_cpu_present_to_apicid(int mps_cpu); -#endif /* CONFIG_X86_LOCAL_APIC */ +void apic_send_nmi_to_offline_cpu(unsigned int cpu); + +#else /* CONFIG_X86_LOCAL_APIC */ + +static inline u32 read_apic_id(void) { return 0; } + +#endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP void apic_smt_update(void); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 4b125e5b3187..094106b6a538 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -20,6 +20,13 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#define APIC_DELIVERY_MODE_FIXED 0 +#define APIC_DELIVERY_MODE_LOWESTPRIO 1 +#define APIC_DELIVERY_MODE_SMI 2 +#define APIC_DELIVERY_MODE_NMI 4 +#define APIC_DELIVERY_MODE_INIT 5 +#define APIC_DELIVERY_MODE_EXTINT 7 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -165,279 +172,10 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) -#ifndef __ASSEMBLY__ -/* - * the local APIC register structure, memory mapped. Not terribly well - * tested, but we might eventually use this one in the future - the - * problem why we cannot use it right now is the P5 APIC, it has an - * errata which cannot take 8-bit reads and writes, only 32-bit ones ... 
- */ -#define u32 unsigned int - -struct local_apic { - -/*000*/ struct { u32 __reserved[4]; } __reserved_01; - -/*010*/ struct { u32 __reserved[4]; } __reserved_02; - -/*020*/ struct { /* APIC ID Register */ - u32 __reserved_1 : 24, - phys_apic_id : 4, - __reserved_2 : 4; - u32 __reserved[3]; - } id; - -/*030*/ const - struct { /* APIC Version Register */ - u32 version : 8, - __reserved_1 : 8, - max_lvt : 8, - __reserved_2 : 8; - u32 __reserved[3]; - } version; - -/*040*/ struct { u32 __reserved[4]; } __reserved_03; - -/*050*/ struct { u32 __reserved[4]; } __reserved_04; - -/*060*/ struct { u32 __reserved[4]; } __reserved_05; - -/*070*/ struct { u32 __reserved[4]; } __reserved_06; - -/*080*/ struct { /* Task Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } tpr; - -/*090*/ const - struct { /* Arbitration Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } apr; - -/*0A0*/ const - struct { /* Processor Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } ppr; - -/*0B0*/ struct { /* End Of Interrupt Register */ - u32 eoi; - u32 __reserved[3]; - } eoi; - -/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; - -/*0D0*/ struct { /* Logical Destination Register */ - u32 __reserved_1 : 24, - logical_dest : 8; - u32 __reserved_2[3]; - } ldr; - -/*0E0*/ struct { /* Destination Format Register */ - u32 __reserved_1 : 28, - model : 4; - u32 __reserved_2[3]; - } dfr; - -/*0F0*/ struct { /* Spurious Interrupt Vector Register */ - u32 spurious_vector : 8, - apic_enabled : 1, - focus_cpu : 1, - __reserved_2 : 22; - u32 __reserved_3[3]; - } svr; - -/*100*/ struct { /* In Service Register */ -/*170*/ u32 bitfield; - u32 __reserved[3]; - } isr [8]; - -/*180*/ struct { /* Trigger Mode Register */ -/*1F0*/ u32 bitfield; - u32 __reserved[3]; - } tmr [8]; - -/*200*/ struct { /* Interrupt Request Register */ -/*270*/ u32 bitfield; - u32 __reserved[3]; - } irr [8]; - -/*280*/ union { /* Error Status Register */ - struct { - u32 send_cs_error : 1, - receive_cs_error : 1, - send_accept_error : 1, - receive_accept_error : 1, - __reserved_1 : 1, - send_illegal_vector : 1, - receive_illegal_vector : 1, - illegal_register_address : 1, - __reserved_2 : 24; - u32 __reserved_3[3]; - } error_bits; - struct { - u32 errors; - u32 __reserved_3[3]; - } all_errors; - } esr; - -/*290*/ struct { u32 __reserved[4]; } __reserved_08; - -/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; - -/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; - -/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; - -/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; - -/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; - -/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; - -/*300*/ struct { /* Interrupt Command Register 1 */ - u32 vector : 8, - delivery_mode : 3, - destination_mode : 1, - delivery_status : 1, - __reserved_1 : 1, - level : 1, - trigger : 1, - __reserved_2 : 2, - shorthand : 2, - __reserved_3 : 12; - u32 __reserved_4[3]; - } icr1; - -/*310*/ struct { /* Interrupt Command Register 2 */ - union { - u32 __reserved_1 : 24, - phys_dest : 4, - __reserved_2 : 4; - u32 __reserved_3 : 24, - logical_dest : 8; - } dest; - u32 __reserved_4[3]; - } icr2; - -/*320*/ struct { /* LVT - Timer */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - timer_mode : 1, - __reserved_3 : 14; - u32 __reserved_4[3]; - } lvt_timer; - -/*330*/ struct { /* LVT - Thermal Sensor */ - u32 vector : 8, - 
delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_thermal; - -/*340*/ struct { /* LVT - Performance Counter */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_pc; - -/*350*/ struct { /* LVT - LINT0 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint0; - -/*360*/ struct { /* LVT - LINT1 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint1; - -/*370*/ struct { /* LVT - Error */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_error; - -/*380*/ struct { /* Timer Initial Count Register */ - u32 initial_count; - u32 __reserved_2[3]; - } timer_icr; - -/*390*/ const - struct { /* Timer Current Count Register */ - u32 curr_count; - u32 __reserved_2[3]; - } timer_ccr; - -/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; - -/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; - -/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; - -/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; - -/*3E0*/ struct { /* Timer Divide Configuration Register */ - u32 divisor : 4, - __reserved_1 : 28; - u32 __reserved_2[3]; - } timer_dcr; - -/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; - -} __attribute__ ((packed)); - -#undef u32 - #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else #define BAD_APICID 0xFFFFu #endif -enum apic_delivery_modes { - APIC_DELIVERY_MODE_FIXED = 0, - APIC_DELIVERY_MODE_LOWESTPRIO = 1, - APIC_DELIVERY_MODE_SMI = 2, - APIC_DELIVERY_MODE_NMI = 4, - APIC_DELIVERY_MODE_INIT = 5, - APIC_DELIVERY_MODE_EXTINT = 7, -}; - -#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b1a98fa38828..25466c4d2134 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,7 +12,9 @@ #include <asm/special_insns.h> #include <asm/preempt.h> #include <asm/asm.h> +#include <asm/fred.h> #include <asm/gsseg.h> +#include <asm/nospec-branch.h> #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index fbcfec4dc4cc..2bec0c89a95c 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -113,6 +113,20 @@ #endif +#ifndef __ASSEMBLY__ +#ifndef __pic__ +static __always_inline __pure void *rip_rel_ptr(void *p) +{ + asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p)); + + return p; +} +#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var))) +#else +#define RIP_REL_REF(var) (var) +#endif +#endif + /* * Macros to generate condition code outputs from inline assembly, * The output operand must be type "bool". 
@@ -215,9 +229,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) -#define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) - #define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 55a55ec04350..55b4d24356ea 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) } #define arch_atomic_add_return arch_atomic_add_return -static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) -{ - return arch_atomic_add_return(-i, v); -} -#define arch_atomic_sub_return arch_atomic_sub_return +#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { @@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) } #define arch_atomic_fetch_add arch_atomic_fetch_add -static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic_fetch_sub arch_atomic_fetch_sub +#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 3486d91b8595..8db2ec4d6cda 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,6 +14,32 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +/* + * Read an atomic64_t non-atomically. + * + * This is intended to be used in cases where a subsequent atomic operation + * will handle the torn value, and can be used to prime the first iteration + * of unconditional try_cmpxchg() loops, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + * + * This is NOT safe to use where the value is not always checked by a + * subsequent atomic operation, such as in conditional try_cmpxchg() loops + * that can break before the atomic operation, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { + * if (condition(val)) + * break; + * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + */ +static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) +{ + /* See comment in arch_atomic_read(). */ + return __READ_ONCE(v->counter); +} + #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) #ifndef ATOMIC64_EXPORT #define ATOMIC64_DECL_ONE __ATOMIC64_DECL @@ -24,7 +50,7 @@ typedef struct { #ifdef CONFIG_X86_CMPXCHG64 #define __alternative_atomic64(f, g, out, in...) 
\ - asm volatile("call %P[func]" \ + asm volatile("call %c[func]" \ : out : [func] "i" (atomic64_##g##_cx8), ## in) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) @@ -61,12 +87,18 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return arch_cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg64(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; @@ -195,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); - return old; + return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); - return old; + return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); - return old; + return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i)); - return old; + return val; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 3165c0feedf7..ae12acae5b06 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) } #define arch_atomic64_add_return arch_atomic64_add_return -static 
__always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) -{ - return arch_atomic64_add_return(-i, v); -} -#define arch_atomic64_sub_return arch_atomic64_sub_return +#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { @@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h index 36aec57ea7a3..fa918f01333e 100644 --- a/arch/x86/include/asm/audit.h +++ b/arch/x86/include/asm/audit.h @@ -4,4 +4,11 @@ int ia32_classify_syscall(unsigned int syscall); +extern unsigned ia32_dir_class[]; +extern unsigned ia32_write_class[]; +extern unsigned ia32_read_class[]; +extern unsigned ia32_chattr_class[]; +extern unsigned ia32_signal_class[]; + + #endif /* _ASM_X86_AUDIT_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 35389b2af88e..7b44b3c4cce1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -33,20 +33,16 @@ * Returns: * 0 - (index < size) */ -static inline unsigned long array_index_mask_nospec(unsigned long index, - unsigned long size) -{ - unsigned long mask; - - asm volatile ("cmp %1,%2; sbb %0,%0;" - :"=r" (mask) - :"g"(size),"r" (index) - :"cc"); - return mask; -} - -/* Override the default implementation from linux/nospec.h. */ -#define array_index_mask_nospec array_index_mask_nospec +#define array_index_mask_nospec(idx,sz) ({ \ + typeof((idx)+(sz)) __idx = (idx); \ + typeof(__idx) __sz = (sz); \ + unsigned long __mask; \ + asm volatile ("cmp %1,%2; sbb %0,%0" \ + :"=r" (__mask) \ + :ASM_INPUT_G (__sz), \ + "r" (__idx) \ + :"cc"); \ + __mask; }) /* Prevent speculative execution past this barrier. */ #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) @@ -79,24 +75,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) -#include <asm-generic/barrier.h> +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. 
- */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} +#include <asm-generic/barrier.h> #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2edf68475fec..b96d45944c59 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr) asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) @@ -251,7 +250,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) - : "rm" (word)); + : ASM_INPUT_RM (word)); return word; } @@ -293,9 +292,12 @@ static __always_inline unsigned long variable_ffz(unsigned long word) */ static __always_inline unsigned long __fls(unsigned long word) { + if (__builtin_constant_p(word)) + return BITS_PER_LONG - 1 - __builtin_clzl(word); + asm("bsr %1,%0" : "=r" (word) - : "rm" (word)); + : ASM_INPUT_RM (word)); return word; } @@ -318,7 +320,7 @@ static __always_inline int variable_ffs(int x) */ asm("bsfl %1,%0" : "=r" (r) - : "rm" (x), "0" (-1)); + : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" @@ -360,6 +362,9 @@ static __always_inline int fls(unsigned int x) { int r; + if (__builtin_constant_p(x)) + return x ? 32 - __builtin_clz(x) : 0; + #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the @@ -372,7 +377,7 @@ static __always_inline int fls(unsigned int x) */ asm("bsrl %1,%0" : "=r" (r) - : "rm" (x), "0" (-1)); + : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" @@ -401,6 +406,9 @@ static __always_inline int fls(unsigned int x) static __always_inline int fls64(__u64 x) { int bitpos = -1; + + if (__builtin_constant_p(x)) + return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its @@ -408,7 +416,7 @@ static __always_inline int fls64(__u64 x) */ asm("bsrq %1,%q0" : "+r" (bitpos) - : "rm" (x)); + : ASM_INPUT_RM (x)); return bitpos + 1; } #else diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 9191280d9ea3..3e5b111e619d 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -6,11 +6,6 @@ #include <asm/pgtable_types.h> #include <uapi/asm/boot.h> -/* Physical address where kernel should be loaded. 
*/ -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) - /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 # define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT @@ -40,26 +35,54 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 +/* + * Used by decompressor's startup_32() to allocate page tables for identity + * mapping of the 4G of RAM in 4-level paging mode: + * - 1 level4 table; + * - 1 level3 table; + * - 4 level2 table that maps everything with 2M pages; + * + * The additional level5 table needed for 5-level paging is allocated from + * trampoline_32bit memory. + */ # define BOOT_INIT_PGT_SIZE (6*4096) -# ifdef CONFIG_RANDOMIZE_BASE + /* - * Assuming all cross the 512GB boundary: - * 1 page for level4 - * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel - * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). - * Total is 19 pages. + * Total number of page tables kernel_add_identity_map() can allocate, + * including page tables consumed by startup_32(). + * + * Worst-case scenario: + * - 5-level paging needs 1 level5 table; + * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel, + * assuming all of them cross 256T boundary: + * + 4*2 level4 table; + * + 4*2 level3 table; + * + 4*2 level2 table; + * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM): + * + 1 level4 table; + * + 1 level3 table; + * + 1 level2 table; + * Total: 28 tables + * + * Add 4 spare table in case decompressor touches anything beyond what is + * accounted above. Warn if it happens. */ -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif +# define BOOT_PGT_SIZE_WARN (28*4096) +# define BOOT_PGT_SIZE (32*4096) #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 #endif +#ifndef __ASSEMBLY__ +extern unsigned int output_len; +extern const unsigned long kernel_text_size; +extern const unsigned long kernel_total_size; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)); + +extern struct boot_params *boot_params_ptr; +#endif + #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index a3ec87d198ac..806649c7f23d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -13,6 +13,18 @@ #define INSN_UD2 0x0b0f #define LEN_UD2 2 +/* + * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. 
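/*
 * Worked check of the BOOT_PGT_SIZE accounting described in the boot.h
 * comment above (illustrative arithmetic only; the macro names below are
 * made up and are not part of the kernel):
 */
#define LVL5_TABLES		1		/* one level5 table for 5-level paging */
#define KASLR_TABLES		(4 * 2 * 3)	/* 4 regions, each possibly crossing 256T, x 3 levels */
#define EARLY_CONSOLE_TABLES	3		/* level4 + level3 + level2 for the first 2M */

_Static_assert(LVL5_TABLES + KASLR_TABLES + EARLY_CONSOLE_TABLES == 28,
	       "matches the 28-table worst case, before the 4 spare tables");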
+ */ +#define INSN_ASOP 0x67 +#define OPCODE_ESCAPE 0x0f +#define SECOND_BYTE_OPCODE_UD1 0xb9 +#define SECOND_BYTE_OPCODE_UD2 0x0b + +#define BUG_NONE 0xffff +#define BUG_UD1 0xfffe +#define BUG_UD2 0xfffd + #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index ce9685fc78d8..5aa061199866 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -7,9 +7,6 @@ extern unsigned int memory_caching_control; #define CACHE_MTRR 0x01 #define CACHE_PAT 0x02 -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); - void cache_disable(void); void cache_enable(void); void set_cache_aps_delayed_init(bool val); diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 58dacd90daef..31d19c815f99 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -7,16 +7,140 @@ * * Copyright (C) 2022 Google LLC */ +#include <linux/bug.h> +#include <asm/ibt.h> -#include <linux/cfi.h> +/* + * An overview of the various calling conventions... + * + * Traditional: + * + * foo: + * ... code here ... + * ret + * + * direct caller: + * call foo + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * IBT: + * + * foo: + * endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * kCFI: + * + * __cfi_foo: + * movl $0x12345678, %eax + * # 11 nops when CONFIG_CALL_PADDING + * foo: + * endbr64 # when IBT + * ... code here ... + * ret + * + * direct call: + * call foo # / call foo+4 when IBT + * + * indirect call: + * lea foo(%rip), %r11 + * ... + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING + * jz 1f + * ud2 + * 1:call *%r11 + * + * + * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into): + * + * __cfi_foo: + * endbr64 + * subl 0x12345678, %r10d + * jz foo + * ud2 + * nop + * foo: + * osp nop3 # was endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... 
+ *	movl $0x12345678, %r10d
+ *	subl $16, %r11
+ *	nop4
+ *	call *%r11
+ *
+ */
+enum cfi_mode {
+	CFI_AUTO,	/* FineIBT if hardware has IBT, otherwise kCFI */
+	CFI_OFF,	/* Traditional / IBT depending on .config */
+	CFI_KCFI,	/* Optionally CALL_PADDING, IBT, RETPOLINE */
+	CFI_FINEIBT,	/* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
+struct pt_regs;

 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
+extern u32 cfi_bpf_subprog_hash;
+
+static inline int cfi_get_offset(void)
+{
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		return 16;
+	case CFI_KCFI:
+		if (IS_ENABLED(CONFIG_CALL_PADDING))
+			return 16;
+		return 5;
+	default:
+		return 0;
+	}
+}
+#define cfi_get_offset cfi_get_offset
+
+extern u32 cfi_get_func_hash(void *func);
+
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_NONE;
 }
+#define cfi_bpf_hash 0U
+#define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+	return 0;
+}
 #endif /* CONFIG_CFI_CLANG */

+#if HAS_KERNEL_IBT == 1
+#define CFI_NOSEAL(x)	asm(IBT_NOSEAL(__stringify(x)))
+#endif
+
 #endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index 6faaf27e8899..6cbd9ae58b21 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,6 +2,10 @@
 #ifndef _ASM_X86_CMDLINE_H
 #define _ASM_X86_CMDLINE_H

+#include <asm/setup.h>
+
+extern char builtin_cmdline[COMMAND_LINE_SIZE];
+
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
 int cmdline_find_option(const char *cmdline_ptr, const char *option,
 			char *buffer, int bufsize);
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d53636506134..5612648b0202 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -221,12 +221,18 @@ extern void __add_wrong_size(void)
 #define __try_cmpxchg(ptr, pold, new, size)				\
 	__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)

+#define __sync_try_cmpxchg(ptr, pold, new, size)			\
+	__raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ")
+
 #define __try_cmpxchg_local(ptr, pold, new, size)			\
 	__raw_try_cmpxchg((ptr), (pold), (new), (size), "")

 #define arch_try_cmpxchg(ptr, pold, new)				\
 	__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))

+#define arch_sync_try_cmpxchg(ptr, pold, new)				\
+	__sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
 #define arch_try_cmpxchg_local(ptr, pold, new)				\
 	__try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))

diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index b5731c51f0f4..62cef2113ca7 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -3,103 +3,148 @@
 #define _ASM_X86_CMPXCHG_32_H

 /*
- * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
+ * Note: if you use __cmpxchg64() or its variants,
  * you need to test for the feature in boot_cpu_data.
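/*
 * Plain-C model of the kCFI check sketched in the asm/cfi.h calling-convention
 * comment above (hypothetical illustration; the struct layout, names and hash
 * value are made up).  Semantically, an indirect caller verifies that the type
 * hash stored just in front of the target's entry point matches the hash of
 * the expected prototype, and traps (UD2 in the real code) on a mismatch.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct cfi_func {
	uint32_t type_hash;		/* models the hash emitted before "foo:" */
	int (*entry)(int);
};

static int add_one(int x) { return x + 1; }

static struct cfi_func add_one_cfi = { .type_hash = 0x12345678u, .entry = add_one };

static int indirect_call(struct cfi_func *f, uint32_t expected_hash, int arg)
{
	if (f->type_hash != expected_hash)
		abort();		/* stands in for the ud2 trap */
	return f->entry(arg);
}

int main(void)
{
	printf("%d\n", indirect_call(&add_one_cfi, 0x12345678u, 41));	/* prints 42 */
	return 0;
}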
*/ -#ifdef CONFIG_X86_CMPXCHG64 -#define arch_cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_try_cmpxchg64(ptr, po, n) \ - __try_cmpxchg64((ptr), (unsigned long long *)(po), \ - (unsigned long long)(n)) -#endif +union __u64_halves { + u64 full; + struct { + u32 low, high; + }; +}; + +#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + + +static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX); +} -static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - u64 prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_cmpxchg64(ptr, old, new,); } -static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - u64 prev; - asm volatile("cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } -static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) { - bool success; - u64 old = *pold; - asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" - CC_SET(z) - : CC_OUT(z) (success), - [ptr] "+m" (*ptr), - "+A" (old) - : "b" ((u32)new), - "c" ((u32)(new >> 32)) - : "memory"); - - if (unlikely(!success)) - *pold = old; - return success; + return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifndef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CMPXCHG64 + +#define arch_cmpxchg64 __cmpxchg64 + +#define arch_cmpxchg64_local __cmpxchg64_local + +#define arch_try_cmpxchg64 __try_cmpxchg64 + +#define arch_try_cmpxchg64_local __try_cmpxchg64_local + +#else + /* * Building a kernel capable running on 80386 and 80486. It may be necessary * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
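/*
 * Plain-C (non-atomic) model of the __try_cmpxchg64() semantics built from
 * cmpxchg8b above: on failure the caller's 'old' value is refreshed, so the
 * usual retry loop does not need to reload it.  Hypothetical illustration,
 * not the kernel implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool try_cmpxchg64_model(uint64_t *ptr, uint64_t *oldp, uint64_t new)
{
	if (*ptr == *oldp) {
		*ptr = new;
		return true;
	}
	*oldp = *ptr;		/* mirrors the asm writing back o.full via "+a"/"+d" */
	return false;
}

int main(void)
{
	uint64_t v = 5, old = 5;

	while (!try_cmpxchg64_model(&v, &old, old + 1))
		;		/* retry with the refreshed 'old' */

	printf("%llu\n", (unsigned long long)v);	/* prints 6 */
	return 0;
}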
*/ -#define arch_cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io(LOCK_PREFIX_HERE \ - "call cmpxchg8b_emu", \ - "lock; cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) - - -#define arch_cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io("call cmpxchg8b_emu", \ - "cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); +} +#define arch_cmpxchg64 arch_cmpxchg64 + +static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, ,); +} +#define arch_cmpxchg64_local arch_cmpxchg64_local + +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); +} +#define arch_try_cmpxchg64 arch_try_cmpxchg64 + +static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,); +} +#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local #endif diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 44b08b53ab32..5e241306db26 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,12 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) +#define arch_try_cmpxchg64_local(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg_local((ptr), (po), (n)); \ +}) + union __u128_halves { u128 full; struct { @@ -62,7 +68,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, asm volatile(_lock "cmpxchg16b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ - [ptr] "+m" (*ptr), \ + [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ : "memory"); \ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 6ae2d16a7613..aa6c8f8ca958 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_COCO_H #define _ASM_X86_COCO_H +#include <asm/asm.h> #include 
<asm/types.h> enum cc_vendor { @@ -10,13 +11,22 @@ enum cc_vendor { CC_VENDOR_INTEL, }; +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM extern enum cc_vendor cc_vendor; +extern u64 cc_mask; + +static inline void cc_set_mask(u64 mask) +{ + RIP_REL_REF(cc_mask) = mask; +} -#ifdef CONFIG_ARCH_HAS_CC_PLATFORM -void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); +void cc_random_init(void); #else +#define cc_vendor (CC_VENDOR_NONE) +static const u64 cc_mask = 0; + static inline u64 cc_mkenc(u64 val) { return val; @@ -26,6 +36,7 @@ static inline u64 cc_mkdec(u64 val) { return val; } +static inline void cc_random_init(void) { } #endif #endif /* _ASM_X86_COCO_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 3a233ebff712..aa30fd8cad7f 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -9,27 +9,13 @@ #include <linux/percpu.h> #include <asm/ibt.h> -#ifdef CONFIG_SMP - -extern void prefill_possible_map(void); - -#else /* CONFIG_SMP */ - -static inline void prefill_possible_map(void) {} - +#ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 #define safe_smp_processor_id() 0 - #endif /* CONFIG_SMP */ -struct x86_cpu { - struct cpu cpu; -}; - #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); extern void soft_restart_cpu(void); #endif @@ -73,26 +59,12 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); -struct ucode_cpu_info; +struct cpu_signature; -int intel_cpu_collect_info(struct ucode_cpu_info *uci); - -static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} +void intel_collect_cpu_info(struct cpu_signature *sig); extern u64 x86_read_arch_cap_msr(void); -int intel_find_matching_signature(void *mc, unsigned int csig, int cpf); +bool intel_find_matching_signature(void *mc, struct cpu_signature *sig); int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); extern struct cpumask cpus_stop_mask; diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index eb8fcede9e3b..e4121d9aa9e1 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -3,6 +3,39 @@ #define _ASM_X86_CPU_DEVICE_ID /* + * Can't use <linux/bitfield.h> because it generates expressions that + * cannot be used in structure initializers. 
Bitfield construction + * here must match the union in struct cpuinfo_86: + * union { + * struct { + * __u8 x86_model; + * __u8 x86; + * __u8 x86_vendor; + * __u8 x86_reserved; + * }; + * __u32 x86_vfm; + * }; + */ +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) + +/* * Declare drivers belonging to specific x86 CPUs * Similar in spirit to pci_device_id and related PCI functions * @@ -20,6 +53,9 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf +/* x86_cpu_id::flags */ +#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0) + #define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching @@ -46,6 +82,18 @@ .model = _model, \ .steppings = _steppings, \ .feature = _feature, \ + .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \ + .driver_data = (unsigned long) _data \ +} + +#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ + .vendor = _vendor, \ + .family = _family, \ + .model = _model, \ + .steppings = _steppings, \ + .feature = _feature, \ + .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \ .driver_data = (unsigned long) _data \ } @@ -145,24 +193,54 @@ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) /** - * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model - * @model: The model name without the INTEL_FAM6_ prefix or ANY - * The model name is expanded to INTEL_FAM6_@model internally + * X86_MATCH_VFM - Match encoded vendor/family/model + * @vfm: Encoded 8-bits each for vendor, family, model * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. + * etc. is cast to unsigned long internally. * - * The vendor is set to INTEL, the family to 6 and all other missing - * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. + * Stepping and feature are set to wildcards + */ +#define X86_MATCH_VFM(vfm, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, data) + +/** + * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping + * @vfm: Encoded 8-bits each for vendor, family, model + * @steppings: Bitmask of steppings to match + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. * - * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. 
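/*
 * Quick standalone check of the vendor/family/model packing used by the
 * VFM_* macros above: each field occupies 8 bits of x86_vfm.  The macros are
 * re-derived here with plain shifts, and the values are made up for
 * illustration.
 */
#include <assert.h>
#include <stdio.h>

#define MY_VFM_MAKE(v, f, m)	(((m) << 0) | ((f) << 8) | ((v) << 16))
#define MY_VFM_MODEL(vfm)	(((vfm) >>  0) & 0xff)
#define MY_VFM_FAMILY(vfm)	(((vfm) >>  8) & 0xff)
#define MY_VFM_VENDOR(vfm)	(((vfm) >> 16) & 0xff)

int main(void)
{
	unsigned int vfm = MY_VFM_MAKE(0 /* vendor */, 6 /* family */, 0x8f /* model */);

	assert(MY_VFM_FAMILY(vfm) == 6 && MY_VFM_MODEL(vfm) == 0x8f);
	printf("vfm = 0x%06x\n", vfm);		/* prints 0x00068f */
	return 0;
}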
+ * feature is set to wildcard */ -#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) +#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + steppings, X86_FEATURE_ANY, data) -#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ - steppings, X86_FEATURE_ANY, data) +/** + * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature + * @vfm: Encoded 8-bits each for vendor, family, model + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + * + * Steppings is set to wildcard + */ +#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \ + X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ + VFM_VENDOR(vfm), \ + VFM_FAMILY(vfm), \ + VFM_MODEL(vfm), \ + X86_STEPPING_ANY, feature, data) /* * Match specific microcode revisions. @@ -182,9 +260,17 @@ struct x86_cpu_desc { u32 x86_microcode_rev; }; -#define INTEL_CPU_DESC(model, stepping, revision) { \ - .x86_family = 6, \ - .x86_vendor = X86_VENDOR_INTEL, \ +#define INTEL_CPU_DESC(vfm, stepping, revision) { \ + .x86_family = VFM_FAMILY(vfm), \ + .x86_vendor = VFM_VENDOR(vfm), \ + .x86_model = VFM_MODEL(vfm), \ + .x86_stepping = (stepping), \ + .x86_microcode_rev = (revision), \ +} + +#define AMD_CPU_DESC(fam, model, stepping, revision) { \ + .x86_family = (fam), \ + .x86_vendor = X86_VENDOR_AMD, \ .x86_model = (model), \ .x86_stepping = (stepping), \ .x86_microcode_rev = (revision), \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a26bebbdff87..0b9611da6c53 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -33,6 +33,8 @@ enum cpuid_leafs CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, + CPUID_LNX_5, + NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" @@ -91,8 +93,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -116,8 +119,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ @@ -125,8 +129,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 
1 : \ - x86_this_cpu_test_bit(bit, \ - (unsigned long __percpu *)&cpu_info.x86_capability)) + x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This macro is for detection of features which need kernel @@ -146,8 +149,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); -#define setup_force_cpu_cap(bit) do { \ - set_cpu_cap(&boot_cpu_data, bit); \ +#define setup_force_cpu_cap(bit) do { \ + \ + if (!boot_cpu_has(bit)) \ + WARN_ON(alternatives_patched); \ + \ + set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) @@ -168,11 +175,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( - ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" - " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n" + " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b69b0d7756aa..dd4682857c12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,177 +13,175 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature + * bit is not displayed in /proc/cpuinfo at all. * * When adding new features here that depend on other features, * please update the table in kernel/cpu/cpuid-deps.c as well. 
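/*
 * Minimal model (illustrative only) of the x86_capability bit layout that
 * cpu_has()/this_cpu_has() test above: a feature number encodes
 * (word * 32 + bit) into an array of 32-bit words, and this patch bumps the
 * array from 21 to 22 words.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_NCAPINTS	22

static unsigned int x86_capability_model[MODEL_NCAPINTS];

static void set_cap(unsigned int feature)
{
	x86_capability_model[feature / 32] |= 1u << (feature % 32);
}

static bool has_cap(unsigned int feature)
{
	return x86_capability_model[feature / 32] & (1u << (feature % 32));
}

int main(void)
{
	unsigned int fpu  = 0 * 32 + 0;		/* X86_FEATURE_FPU  */
	unsigned int zen5 = 3 * 32 + 5;		/* X86_FEATURE_ZEN5 */

	set_cap(fpu);
	set_cap(zen5);
	printf("%d %d\n", has_cap(zen5), has_cap(1 * 32 + 20));	/* prints "1 0" */
	return 0;
}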
*/ /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */ #define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */ #define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 
0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ 
-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */ 
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */ #define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* 
"tsc_deadline_timer" TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) 
/* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -191,93 +189,93 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ -#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ -#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ -#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ -#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ -#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. 
*/ -#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ -#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ -#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ -#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */ +#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */ +#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. 
*/ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ -#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ -#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ -#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ -#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ +#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID ( 9*32+10) 
/* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */ +#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* 
"clwb" CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -285,215 +283,244 @@ * * Reuse free bits when adding new feature flags! */ -#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ -#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ -#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ -#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ -#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ -#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ -#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ -#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ -#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ -#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ -#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ -#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ -#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ -#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ -#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ -#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ - -#define 
X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ -#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ -#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */ +#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */ +#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ -#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ -#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ -#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ -#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */ -#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */ -#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */ -#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */ -#define X86_FEATURE_LKGS 
(12*32+18) /* "" Load "kernel" (userspace) GS */ -#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ -#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ -#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ +#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ +#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ -#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ -#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ -#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ -#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ -#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ -#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. 
*/ -#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ -#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */ -#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ -#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ +#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. 
Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ +#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ -#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ -#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */ -#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define 
X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ -#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ -#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ -#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ -#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */ +#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ -#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* 
AVX-512 Intersect for D/Q */ -#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ -#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ -#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ -#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ -#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ -#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */ +#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */ +#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass 
Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ -#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */ +#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ -#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */ -#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */ -#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */ -#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */ -#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */ +#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ +#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ +#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ -#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */ -#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ -#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ + +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc and Linux defined features. + * + * Reuse free bits when adding new feature flags! 
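
The X86_FEATURE_* values above are plain (word * 32 + bit) indexes into the per-CPU x86_capability[] bitmap, so X86_FEATURE_SEV_SNP names bit 4 of capability word 19. Judging from this hunk, the comment convention has also flipped: only flags carrying a quoted name (e.g. "sev_snp") appear to be emitted into /proc/cpuinfo, while flags with an unquoted comment stay hidden. A minimal consumer sketch, using real helpers but not taken from this patch:

    #include <asm/cpufeature.h>

    static bool snp_supported_example(void)
    {
            /* tests bit 4 of word 19 in boot_cpu_data.x86_capability[] */
            return cpu_feature_enabled(X86_FEATURE_SEV_SNP);
    }
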
+ */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ /* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */ #define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ #define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. 
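
X86_BUG(x) continues the same bitmap past the NCAPINTS capability words, so bug bits are queried with the same machinery; a short sketch, not part of this patch:

    static void __init f00f_example(void)
    {
            /* X86_BUG_F00F is X86_BUG(0): the first bit past the capability words */
            if (boot_cpu_has_bug(X86_BUG_F00F))
                    pr_info("applying F00F workaround\n");
    }
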
*/ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ -#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ -#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ -#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ -#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ -#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ -#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ -#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ -#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ -#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ -#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ -#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ -#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ -#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ -#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT 
X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ /* BUG word 2 */ -#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ -#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 9bee3e7bf973..ca4243318aad 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -127,6 +127,42 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } +static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + static __always_inline bool cpuid_function_is_indexed(u32 function) { switch (function) { @@ -143,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function) case 0x1d: case 0x1e: case 0x1f: + case 0x24: case 0x8000001d: return true; } @@ -160,7 +197,12 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) for_each_possible_hypervisor_cpuid_base(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); - if (!memcmp(sig, signature, 12) && + /* + * This must not compile to "call memcmp" because it's called + * from PVH early boot code before instrumentation is set up + * and memcmp() itself may be instrumented. 
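
A usage sketch for the new cpuid_leaf()/cpuid_subleaf_reg() helpers added to <asm/cpuid.h> above; the 16-byte leaf struct is a local stand-in (any struct of four u32s satisfies the static_assert) and the leaf numbers are only examples:

    struct example_leaf { u32 eax, ebx, ecx, edx; };

    static void cpuid_helper_example(void)
    {
            struct example_leaf l;
            u32 eax;

            cpuid_leaf(0x1f, &l);                        /* whole leaf, subleaf 0 */
            cpuid_subleaf_reg(0x7, 1, CPUID_EAX, &eax);  /* one register of leaf 7, subleaf 1 */
    }
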
+ */ + if (!__builtin_memcmp(sig, signature, 12) && (leaves == 0 || ((eax - base) >= leaves))) return base; } diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h new file mode 100644 index 000000000000..7835b2cdff04 --- /dev/null +++ b/arch/x86/include/asm/crash_reserve.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index a1168e7b69e5..bf5953883ec3 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H +#include <linux/build_bug.h> #include <linux/compiler.h> #ifndef __ASSEMBLY__ @@ -17,7 +18,7 @@ struct pcpu_hot { struct task_struct *current_task; int preempt_count; int cpu_number; -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING u64 call_depth; #endif unsigned long top_of_stack; @@ -36,8 +37,15 @@ static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); +/* const-qualified alias to pcpu_hot, aliased by linker. 
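
Plugging in the usual 64 MB SWIOTLB default makes the DEFAULT_CRASH_KERNEL_LOW_SIZE choice above concrete (a worked instance, not new semantics):

    crash_low_size_default()                        /* x86_64 */
        = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20)
        = max(64 MB + 8 MB, 256 MB)
        = 256 MB

so the low crashkernel reservation only grows past 256 MB when a larger swiotlb= is requested on the command line.
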
*/ +DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, + const_pcpu_hot); + static __always_inline struct task_struct *get_current(void) { + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.current_task); + return this_cpu_read_stable(pcpu_hot.current_task); } diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 66eb5e1ac4fb..fdbbbfec745a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -6,6 +6,9 @@ #include <linux/percpu.h> #include <uapi/asm/debugreg.h> +#include <asm/cpufeature.h> +#include <asm/msr.h> + DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL @@ -158,4 +161,26 @@ static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) } #endif +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a..62dc9f59ea76 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -46,6 +46,7 @@ struct gdt_page { } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_INIT_PER_CPU(gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) @@ -402,8 +403,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index f7e7099af595..d440a65af8f3 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -8,6 +8,56 @@ * archs. 
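
The get_debugctlmsr()/update_debugctlmsr() pair above is normally used as a read-modify-write; a sketch mirroring the single-step code (DEBUGCTLMSR_BTF comes from <asm/msr-index.h>):

    unsigned long debugctl;

    debugctl  = get_debugctlmsr();     /* returns 0 on pre-686 parts        */
    debugctl |= DEBUGCTLMSR_BTF;       /* trap on branches, not every insn  */
    update_debugctlmsr(debugctl);
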
*/ +/* + * Low-level interface mapping flags/field names to bits + */ + +/* Flags for _DESC_S (non-system) descriptors */ +#define _DESC_ACCESSED 0x0001 +#define _DESC_DATA_WRITABLE 0x0002 +#define _DESC_CODE_READABLE 0x0002 +#define _DESC_DATA_EXPAND_DOWN 0x0004 +#define _DESC_CODE_CONFORMING 0x0004 +#define _DESC_CODE_EXECUTABLE 0x0008 + +/* Common flags */ +#define _DESC_S 0x0010 +#define _DESC_DPL(dpl) ((dpl) << 5) +#define _DESC_PRESENT 0x0080 + +#define _DESC_LONG_CODE 0x2000 +#define _DESC_DB 0x4000 +#define _DESC_GRANULARITY_4K 0x8000 + +/* System descriptors have a numeric "type" field instead of flags */ +#define _DESC_SYSTEM(code) (code) + +/* + * High-level interface mapping intended usage to low-level combinations + * of flags + */ + +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_DATA_WRITABLE) +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) + +#define DESC_DATA16 (_DESC_DATA) +#define DESC_CODE16 (_DESC_CODE) + +#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB) + +#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB) + +#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT) + +#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE) + +#define DESC_USER (_DESC_DPL(3)) + #ifndef __ASSEMBLY__ #include <linux/types.h> @@ -22,19 +72,19 @@ struct desc_struct { #define GDT_ENTRY_INIT(flags, base, limit) \ { \ - .limit0 = (u16) (limit), \ - .limit1 = ((limit) >> 16) & 0x0F, \ - .base0 = (u16) (base), \ - .base1 = ((base) >> 16) & 0xFF, \ - .base2 = ((base) >> 24) & 0xFF, \ - .type = (flags & 0x0f), \ - .s = (flags >> 4) & 0x01, \ - .dpl = (flags >> 5) & 0x03, \ - .p = (flags >> 7) & 0x01, \ - .avl = (flags >> 12) & 0x01, \ - .l = (flags >> 13) & 0x01, \ - .d = (flags >> 14) & 0x01, \ - .g = (flags >> 15) & 0x01, \ + .limit0 = ((limit) >> 0) & 0xFFFF, \ + .limit1 = ((limit) >> 16) & 0x000F, \ + .base0 = ((base) >> 0) & 0xFFFF, \ + .base1 = ((base) >> 16) & 0x00FF, \ + .base2 = ((base) >> 24) & 0x00FF, \ + .type = ((flags) >> 0) & 0x000F, \ + .s = ((flags) >> 4) & 0x0001, \ + .dpl = ((flags) >> 5) & 0x0003, \ + .p = ((flags) >> 7) & 0x0001, \ + .avl = ((flags) >> 12) & 0x0001, \ + .l = ((flags) >> 13) & 0x0001, \ + .d = ((flags) >> 14) & 0x0001, \ + .g = ((flags) >> 15) & 0x0001, \ } enum { @@ -94,6 +144,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; +#ifndef _SETUP static inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 @@ -108,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g) { return g->segment; } +#endif struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index fafe9be7a6f4..c492bdc97b05 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -44,32 +44,32 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define DISABLE_PTI 0 #else # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE # define DISABLE_RETPOLINE 0 #else # define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) 
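
A usage sketch for the DESC_* mnemonics and the reworked GDT_ENTRY_INIT() from the desc_defs.h hunk above; the two entries below expand to the familiar raw flag values 0xa09b and 0xa0fb (the indices follow the boot GDT layout, but treat the snippet as illustrative rather than a copy of cpu/common.c):

    [GDT_ENTRY_KERNEL_CS]       = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
    [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
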
#endif -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK # define DISABLE_RETHUNK 0 #else # define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY # define DISABLE_UNRET 0 #else # define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) #endif -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING # define DISABLE_CALL_DEPTH_TRACKING 0 #else # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) @@ -105,6 +105,30 @@ # define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) #endif +#ifdef CONFIG_X86_USER_SHADOW_STACK +#define DISABLE_USER_SHSTK 0 +#else +#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) +#endif + +#ifdef CONFIG_X86_KERNEL_IBT +#define DISABLE_IBT 0 +#else +#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) +#endif + +#ifdef CONFIG_X86_FRED +# define DISABLE_FRED 0 +#else +# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) +#endif + +#ifdef CONFIG_KVM_AMD_SEV +#define DISABLE_SEV_SNP 0 +#else +#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -120,17 +144,18 @@ #define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING) -#define DISABLED_MASK12 (DISABLE_LAM) + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) +#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ DISABLE_ENQCMD) #define DISABLED_MASK17 0 -#define DISABLED_MASK18 0 -#define DISABLED_MASK19 0 +#define DISABLED_MASK18 (DISABLE_IBT) +#define DISABLED_MASK19 (DISABLE_SEV_SNP) #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index b8f1dc0761e4..9931e4c7d73f 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -71,6 +71,12 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #define mul_u32_u32 mul_u32_u32 +/* + * __div64_32() is never called on x86, so prevent the + * generic definition from getting built. 
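
The DISABLE_*/DISABLED_MASK* plumbing above exists so that compile-time-disabled features vanish from the generated code; a simplified sketch (the real constant-folding test is DISABLED_MASK_BIT_SET() in <asm/cpufeature.h>, and the callee here is hypothetical):

    /*
     * With CONFIG_X86_5LEVEL=n, DISABLE_LA57 sits in DISABLED_MASK16, so
     * this branch is folded away entirely at build time.
     */
    if (cpu_feature_enabled(X86_FEATURE_LA57))
            setup_5level_paging_example();
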
+ */ +#define __div64_32 + #else # include <asm-generic/div64.h> diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..2e74a7f0e935 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8b4be7cecdb8..521aad70e41b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,21 +90,9 @@ static inline void efi_fpu_end(void) } #ifdef CONFIG_X86_32 -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - +#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) #else /* !CONFIG_X86_32 */ - -#define EFI_LOADER_SIGNATURE "EL64" +#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT extern asmlinkage u64 __efi_call(void *fp, ...); @@ -115,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime; __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ @@ -131,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime; ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present @@ -167,17 +140,8 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); - -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); extern u64 efi_setup; @@ -218,6 +182,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, #ifdef CONFIG_EFI_MIXED +#define EFI_ALLOC_LIMIT (efi_is_64bit() ? 
ULONG_MAX : U32_MAX) + #define ARCH_HAS_EFISTUB_WRAPPERS static inline bool efi_is_64bit(void) @@ -263,7 +229,8 @@ static inline bool efi_is_native(void) static inline void *efi64_zero_upper(void *p) { - ((u32 *)p)[1] = 0; + if (p) + ((u32 *)p)[1] = 0; return p; } @@ -349,6 +316,10 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags)) +/* EFI SMBIOS protocol */ +#define __efi64_argmap_get_next(protocol, smbioshandle, type, record, phandle) \ + ((protocol), (smbioshandle), (type), efi64_zero_upper(record), \ + efi64_zero_upper(phandle)) /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -418,24 +389,8 @@ static inline void efi_reserve_boot_services(void) } #endif /* CONFIG_EFI */ -#ifdef CONFIG_EFI_FAKE_MEMMAP -extern void __init efi_fake_memmap_early(void); -extern void __init efi_fake_memmap(void); -#else -static inline void efi_fake_memmap_early(void) -{ -} - -static inline void efi_fake_memmap(void) -{ -} -#endif - extern int __init efi_memmap_alloc(unsigned int num_entries, struct efi_memory_map_data *data); -extern void __efi_memmap_free(u64 phys, unsigned long size, - unsigned long flags); -#define __efi_memmap_free __efi_memmap_free extern int __init efi_memmap_install(struct efi_memory_map_data *data); extern int __init efi_memmap_split_count(efi_memory_desc_t *md, @@ -443,8 +398,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); -#define arch_ima_efi_boot_mode \ - ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); + +#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() #ifdef CONFIG_EFI_RUNTIME_MAP int efi_get_runtime_map_size(void); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 18fd06f7936a..1fb83d47711f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include <linux/thread_info.h> +#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> @@ -149,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, @@ -391,5 +392,4 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index ce8f50192ae3..77d20555e04d 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -8,6 +8,7 @@ #include <asm/nospec-branch.h> #include <asm/io_bitmap.h> #include <asm/fpu/api.h> +#include <asm/fred.h> /* Check that the stack and regs on entry from user mode are sane. 
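
The NULL check added to efi64_zero_upper() above fits the mixed-mode calling convention: 32-bit firmware writes only the low 32 bits of a pointer-sized OUT parameter, so the upper half is cleared before the thunked call, and an optional OUT argument may now simply be passed as NULL (as in the new SMBIOS get_next mapping). A rough illustration, not kernel code:

    void *record;                /* 64-bit slot the 32-bit firmware will half-fill */

    efi64_zero_upper(&record);   /* pre-clear bits 63:32 before the call           */
    efi64_zero_upper(NULL);      /* skipping the output is now tolerated           */
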
*/ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) @@ -44,8 +45,7 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) } #define arch_enter_from_user_mode arch_enter_from_user_mode -static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, - unsigned long ti_work) +static inline void arch_exit_work(unsigned long ti_work) { if (ti_work & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); @@ -56,6 +56,15 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, fpregs_assert_state_consistent(); if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); +} + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) + arch_exit_work(ti_work); + + fred_update_rsp0(); #ifdef CONFIG_COMPAT /* @@ -73,25 +82,21 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, #endif /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for x86 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 8 bits. - * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints (see cc_stack_align4/8 in + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints (see cc_stack_align4/8 in * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32) * low bits from any entropy chosen here. * - * Therefore, final stack offset entropy will be 5 (x86_64) or - * 6 (ia32) bits. + * Therefore, final stack offset entropy will be 7 (x86_64) or + * 8 (ia32) bits. */ - choose_random_kstack_offset(rdtsc() & 0xFF); + choose_random_kstack_offset(rdtsc()); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index eeed395c3177..a0e0c6b50155 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -37,7 +37,6 @@ struct pt_regs; extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); -extern int fixup_bug(struct pt_regs *regs, int trapnr); extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 991e31cfde94..906b0d5541e8 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -4,7 +4,7 @@ /* * Our IMM is signed, as such it must live at the top end of the word. Also, - * since C99 hex constants are of ambigious type, force cast the mask to 'int' + * since C99 hex constants are of ambiguous type, force cast the mask to 'int' * so that FIELD_GET() will DTRT and sign extend the value when it extracts it. 
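
The entropy bookkeeping in the rewritten comment above, spelled out (illustrative arithmetic; the actual masking lives in choose_random_kstack_offset()/add_random_kstack_offset() in <linux/randomize_kstack.h>):

    u32 off = rdtsc();      /* full TSC value is now handed in           */
    off &= 0x3ff;           /* KSTACK_OFFSET_MAX() keeps 10 bits         */
    off &= ~0x7;            /* cc_stack_align8 strips 3 bits on x86_64   */
                            /* => 2^7 = 128 possible stack offsets       */
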
*/ #define EX_DATA_TYPE_MASK ((int)0x000000FF) @@ -36,7 +36,7 @@ #define EX_TYPE_DEFAULT 1 #define EX_TYPE_FAULT 2 #define EX_TYPE_UACCESS 3 -#define EX_TYPE_COPY 4 +/* unused, was: #define EX_TYPE_COPY 4 */ #define EX_TYPE_CLEAR_FS 5 #define EX_TYPE_FPU_RESTORE 6 #define EX_TYPE_BPF 7 @@ -64,6 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) -#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 #endif diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h deleted file mode 100644 index 23873da8fb77..000000000000 --- a/arch/x86/include/asm/fb.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_FB_H -#define _ASM_X86_FB_H - -struct fb_info; -struct file; -struct vm_area_struct; - -void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off); -#define fb_pgprotect fb_pgprotect - -int fb_is_primary_device(struct fb_info *info); -#define fb_is_primary_device fb_is_primary_device - -#include <asm-generic/fb.h> - -#endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h new file mode 100644 index 000000000000..b2743fe19339 --- /dev/null +++ b/arch/x86/include/asm/fpu.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 SiFive + */ + +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +#include <asm/fpu/api.h> + +#define kernel_fpu_available() true + +#endif /* ! _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b475d9a582b8..f86ad3335529 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -82,6 +82,15 @@ static inline void fpregs_unlock(void) preempt_enable(); } +/* + * FPU state gets lazily restored before returning to userspace. So when in the + * kernel, the valid FPU state may be kept in the buffer. This function will force + * restore all the fpu state to the registers early if needed, and lock them from + * being automatically saved/restored. Then FPU state can be modified safely in the + * registers, before unlocking with fpregs_unlock(). 
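
A caller sketch for fpregs_lock_and_load() as documented above (the body between the calls is illustrative; real users include the ptrace regset and shadow-stack paths):

    fpregs_lock_and_load();     /* task's FPU state is now live in registers      */

    /* safe to modify the live state here, e.g. write a CET user MSR */

    fpregs_unlock();
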
+ */ +void fpregs_lock_and_load(void); + #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else @@ -134,6 +143,9 @@ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfe extern u64 xstate_get_guest_group_perm(void); +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + + /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); @@ -148,7 +160,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index 4f928d6a367b..697b77e96025 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -7,11 +7,12 @@ #include <linux/regset.h> -extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active, + ssp_active; extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; + xstateregs_get, ssp_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; + xstateregs_set, ssp_set; /* * xstateregs_active == regset_fpregs_active. Please refer to the comment diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 78fcde7b1f07..c485f1944c5f 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,8 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long shstk_addr); extern void fpu_flush_thread(void); /* @@ -36,10 +37,12 @@ extern void fpu_flush_thread(void); * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct task_struct *old, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct fpu *old_fpu = &old->thread.fpu; + save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -59,10 +62,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * Delay loading of the complete FPU state until the return to userland. * PKRU is handled separately. 
*/ -static inline void switch_fpu_finish(void) +static inline void switch_fpu_finish(struct task_struct *new) { if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); + set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); } #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 611fa41711af..eccc75bc9c4f 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru); extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 7f6d858ff47a..de16862bf230 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -2,8 +2,10 @@ /* * FPU data structures: */ -#ifndef _ASM_X86_FPU_H -#define _ASM_X86_FPU_H +#ifndef _ASM_X86_FPU_TYPES_H +#define _ASM_X86_FPU_TYPES_H + +#include <asm/page_types.h> /* * The legacy x87 FPU state format, as saved by FSAVE and @@ -115,8 +117,8 @@ enum xfeature { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, XFEATURE_PASID, - XFEATURE_RSRVD_COMP_11, - XFEATURE_RSRVD_COMP_12, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, @@ -138,6 +140,8 @@ enum xfeature { #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) +#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) @@ -253,6 +257,16 @@ struct pkru_state { } __packed; /* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + /* user control-flow settings */ + u64 user_cet; + /* user shadow stack pointer */ + u64 user_ssp; +}; + +/* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). */ @@ -403,7 +417,7 @@ struct fpu_state_perm { * * This master permission field is only to be used when * task.fpu.fpstate based checks fail to validate whether the task - * is allowed to expand it's xfeatures set which requires to + * is allowed to expand its xfeatures set which requires to * allocate a larger sized fpstate buffer. * * Do not access this field directly. Use the provided helper @@ -577,9 +591,16 @@ struct fpu_state_config { * even without XSAVE support, i.e. 
legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; -#endif /* _ASM_X86_FPU_H */ +#endif /* _ASM_X86_FPU_TYPES_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index cd3dd170e23a..d4427b88ee12 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -50,7 +50,8 @@ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, @@ -77,7 +78,8 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ + XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 000000000000..25ca00bd70e8 --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include <linux/const.h> + +#include <asm/asm.h> +#include <asm/trapnr.h> + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_FRED +#include <linux/kernel.h> +#include <linux/sched/task_stack.h> + +#include <asm/ptrace.h> + +struct fred_info { + /* Event data: CR2, DR6, ... 
*/ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} + +void cpu_init_fred_exceptions(void); +void cpu_init_fred_rsps(void); +void fred_complete_exception_setup(void); + +DECLARE_PER_CPU(unsigned long, fred_rsp0); + +static __always_inline void fred_sync_rsp0(unsigned long rsp0) +{ + __this_cpu_write(fred_rsp0, rsp0); +} + +static __always_inline void fred_update_rsp0(void) +{ + unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE; + + if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) { + wrmsrns(MSR_IA32_FRED_RSP0, rsp0); + __this_cpu_write(fred_rsp0, rsp0); + } +} +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void cpu_init_fred_rsps(void) { } +static inline void fred_complete_exception_setup(void) { } +static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +static inline void fred_sync_rsp0(unsigned long rsp0) { } +static inline void fred_update_rsp0(void) { } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLY__ */ + +#endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 35cff5f2becf..9e7e8ca8e299 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -6,7 +6,7 @@ #ifdef CONFIG_X86_64 -#include <asm/msr-index.h> +#include <asm/msr.h> /* * Read/write a task's FSBASE or GSBASE. 
This returns the value that diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 897cf02c20b1..0152a81d9b4a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,8 +20,6 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif -#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR - #ifndef __ASSEMBLY__ extern void __fentry__(void); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 66837b8c67f1..6ffa8b75f4cd 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -15,7 +15,7 @@ typedef struct { unsigned int irq_spurious_count; unsigned int icr_read_retry_count; #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; @@ -44,10 +44,16 @@ typedef struct { unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif +#ifdef CONFIG_X86_POSTED_MSI + unsigned int posted_msi_notification_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +#ifdef CONFIG_X86_POSTED_MSI +DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); +#endif #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) @@ -63,7 +69,11 @@ extern u64 arch_irq_stat(void); #define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) -static inline void kvm_set_cpu_l1tf_flush_l1d(void) +/* + * This function is called from noinstr interrupt contexts + * and must be inlined to not get instrumentation. + */ +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } @@ -78,7 +88,7 @@ static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ -static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d465ece58151..edebf1020e04 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,8 +16,6 @@ #include <asm/irq_vectors.h> -#define IRQ_MATRIX_BITS NR_VECTORS - #ifndef __ASSEMBLY__ #include <linux/percpu.h> @@ -28,7 +26,7 @@ #include <asm/irq.h> #include <asm/sections.h> -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data; struct pci_dev; struct msi_desc; @@ -97,18 +95,18 @@ extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); #ifdef CONFIG_SMP -extern void send_cleanup_vector(struct irq_cfg *); +extern void vector_schedule_cleanup(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); #else -static inline void send_cleanup_vector(struct irq_cfg *c) { } +static inline void vector_schedule_cleanup(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -#else /* CONFIG_X86_LOCAL_APIC */ +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} -#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* Statistics */ extern atomic_t irq_err_count; diff --git 
a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index cea95dcd27c2..3787d26810c1 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -169,7 +169,8 @@ enum hv_isolation_type { HV_ISOLATION_TYPE_NONE = 0, HV_ISOLATION_TYPE_VBS = 1, - HV_ISOLATION_TYPE_SNP = 2 + HV_ISOLATION_TYPE_SNP = 2, + HV_ISOLATION_TYPE_TDX = 3 }; /* Hyper-V specific model specific registers (MSRs) */ @@ -181,7 +182,7 @@ enum hv_isolation_type { #define HV_X64_MSR_HYPERCALL 0x40000001 /* MSR used to provide vcpu index */ -#define HV_REGISTER_VP_INDEX 0x40000002 +#define HV_X64_MSR_VP_INDEX 0x40000002 /* MSR used to reset the guest OS. */ #define HV_X64_MSR_RESET 0x40000003 @@ -190,10 +191,10 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_RUNTIME 0x40000010 /* MSR used to read the per-partition time reference counter */ -#define HV_REGISTER_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 /* A partition's reference time stamp counter (TSC) page */ -#define HV_REGISTER_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 /* MSR used to retrieve the TSC frequency */ #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 @@ -208,61 +209,61 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 /* Define synthetic interrupt controller model specific registers. */ -#define HV_REGISTER_SCONTROL 0x40000080 -#define HV_REGISTER_SVERSION 0x40000081 -#define HV_REGISTER_SIEFP 0x40000082 -#define HV_REGISTER_SIMP 0x40000083 -#define HV_REGISTER_EOM 0x40000084 -#define HV_REGISTER_SINT0 0x40000090 -#define HV_REGISTER_SINT1 0x40000091 -#define HV_REGISTER_SINT2 0x40000092 -#define HV_REGISTER_SINT3 0x40000093 -#define HV_REGISTER_SINT4 0x40000094 -#define HV_REGISTER_SINT5 0x40000095 -#define HV_REGISTER_SINT6 0x40000096 -#define HV_REGISTER_SINT7 0x40000097 -#define HV_REGISTER_SINT8 0x40000098 -#define HV_REGISTER_SINT9 0x40000099 -#define HV_REGISTER_SINT10 0x4000009A -#define HV_REGISTER_SINT11 0x4000009B -#define HV_REGISTER_SINT12 0x4000009C -#define HV_REGISTER_SINT13 0x4000009D -#define HV_REGISTER_SINT14 0x4000009E -#define HV_REGISTER_SINT15 0x4000009F +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F /* * Define synthetic interrupt controller model specific registers for * nested hypervisor. 
*/ -#define HV_REGISTER_NESTED_SCONTROL 0x40001080 -#define HV_REGISTER_NESTED_SVERSION 0x40001081 -#define HV_REGISTER_NESTED_SIEFP 0x40001082 -#define HV_REGISTER_NESTED_SIMP 0x40001083 -#define HV_REGISTER_NESTED_EOM 0x40001084 -#define HV_REGISTER_NESTED_SINT0 0x40001090 +#define HV_X64_MSR_NESTED_SCONTROL 0x40001080 +#define HV_X64_MSR_NESTED_SVERSION 0x40001081 +#define HV_X64_MSR_NESTED_SIEFP 0x40001082 +#define HV_X64_MSR_NESTED_SIMP 0x40001083 +#define HV_X64_MSR_NESTED_EOM 0x40001084 +#define HV_X64_MSR_NESTED_SINT0 0x40001090 /* * Synthetic Timer MSRs. Four timers per vcpu. */ -#define HV_REGISTER_STIMER0_CONFIG 0x400000B0 -#define HV_REGISTER_STIMER0_COUNT 0x400000B1 -#define HV_REGISTER_STIMER1_CONFIG 0x400000B2 -#define HV_REGISTER_STIMER1_COUNT 0x400000B3 -#define HV_REGISTER_STIMER2_CONFIG 0x400000B4 -#define HV_REGISTER_STIMER2_COUNT 0x400000B5 -#define HV_REGISTER_STIMER3_CONFIG 0x400000B6 -#define HV_REGISTER_STIMER3_COUNT 0x400000B7 +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 /* Hyper-V guest idle MSR */ #define HV_X64_MSR_GUEST_IDLE 0x400000F0 /* Hyper-V guest crash notification MSR's */ -#define HV_REGISTER_CRASH_P0 0x40000100 -#define HV_REGISTER_CRASH_P1 0x40000101 -#define HV_REGISTER_CRASH_P2 0x40000102 -#define HV_REGISTER_CRASH_P3 0x40000103 -#define HV_REGISTER_CRASH_P4 0x40000104 -#define HV_REGISTER_CRASH_CTL 0x40000105 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 /* TSC emulation after migration */ #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 @@ -275,31 +276,45 @@ enum hv_isolation_type { /* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ #define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) -/* Register name aliases for temporary compatibility */ -#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT -#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG -#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT -#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG -#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT -#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG -#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT -#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG -#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL -#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION -#define HV_X64_MSR_SIMP HV_REGISTER_SIMP -#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP -#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX -#define HV_X64_MSR_EOM HV_REGISTER_EOM -#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0 -#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15 -#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0 -#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1 -#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2 -#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3 -#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4 -#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL -#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT -#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC +/* + * To support arch-generic code calling 
hv_set/get_register: + * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl + * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall + */ +#define HV_MSR_CRASH_P0 (HV_X64_MSR_CRASH_P0) +#define HV_MSR_CRASH_P1 (HV_X64_MSR_CRASH_P1) +#define HV_MSR_CRASH_P2 (HV_X64_MSR_CRASH_P2) +#define HV_MSR_CRASH_P3 (HV_X64_MSR_CRASH_P3) +#define HV_MSR_CRASH_P4 (HV_X64_MSR_CRASH_P4) +#define HV_MSR_CRASH_CTL (HV_X64_MSR_CRASH_CTL) + +#define HV_MSR_VP_INDEX (HV_X64_MSR_VP_INDEX) +#define HV_MSR_TIME_REF_COUNT (HV_X64_MSR_TIME_REF_COUNT) +#define HV_MSR_REFERENCE_TSC (HV_X64_MSR_REFERENCE_TSC) + +#define HV_MSR_SINT0 (HV_X64_MSR_SINT0) +#define HV_MSR_SVERSION (HV_X64_MSR_SVERSION) +#define HV_MSR_SCONTROL (HV_X64_MSR_SCONTROL) +#define HV_MSR_SIEFP (HV_X64_MSR_SIEFP) +#define HV_MSR_SIMP (HV_X64_MSR_SIMP) +#define HV_MSR_EOM (HV_X64_MSR_EOM) + +#define HV_MSR_NESTED_SCONTROL (HV_X64_MSR_NESTED_SCONTROL) +#define HV_MSR_NESTED_SVERSION (HV_X64_MSR_NESTED_SVERSION) +#define HV_MSR_NESTED_SIEFP (HV_X64_MSR_NESTED_SIEFP) +#define HV_MSR_NESTED_SIMP (HV_X64_MSR_NESTED_SIMP) +#define HV_MSR_NESTED_EOM (HV_X64_MSR_NESTED_EOM) +#define HV_MSR_NESTED_SINT0 (HV_X64_MSR_NESTED_SINT0) + +#define HV_MSR_STIMER0_CONFIG (HV_X64_MSR_STIMER0_CONFIG) +#define HV_MSR_STIMER0_COUNT (HV_X64_MSR_STIMER0_COUNT) + +/* + * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and + * there is not associated MSR address. + */ +#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003 +#define HV_X64_VTL_MASK GENMASK(3, 0) /* Hyper-V memory host visibility */ enum hv_mem_host_visibility { diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index fada857f0a1e..9d69f3f8dbab 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> @@ -57,17 +56,37 @@ struct stat64 { unsigned long long st_ino; } __attribute__((packed)); -#define IA32_STACK_TOP IA32_PAGE_OFFSET +extern bool __ia32_enabled; + +static __always_inline bool ia32_enabled(void) +{ + return __ia32_enabled; +} + +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + +#else /* !CONFIG_IA32_EMULATION */ -#ifdef __KERNEL__ -struct linux_binprm; -extern int ia32_setup_arg_pages(struct linux_binprm *bprm, - unsigned long stack_top, int exec_stack); -struct mm_struct; -extern void ia32_pick_mmap_layout(struct mm_struct *mm); +static __always_inline bool ia32_enabled(void) +{ + return IS_ENABLED(CONFIG_X86_32); +} + +static inline void ia32_disable(void) {} #endif -#endif /* CONFIG_IA32_EMULATION */ +static inline bool ia32_enabled_verbose(void) +{ + bool enabled = ia32_enabled(); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) + pr_notice_once("32-bit emulation disabled. 
You can reenable with ia32_emulation=on\n"); + + return enabled; +} #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h deleted file mode 100644 index aa065c94ccf5..000000000000 --- a/arch/x86/include/asm/ia32_unistd.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IA32_UNISTD_H -#define _ASM_X86_IA32_UNISTD_H - -/* - * This file contains the system call numbers of the ia32 compat ABI, - * this is for the kernel only. - */ -#define __SYSCALL_ia32_NR(x) (x) -#include <asm/unistd_32_ia32.h> - -#endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b241af4ce9b4..ad5c68f0509d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include <asm/irq_stack.h> +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -138,6 +142,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs) /** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. 
+ */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + +/** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware * @vector: Vector number (ignored for C) @@ -197,8 +212,8 @@ __visible noinstr void func(struct pt_regs *regs, \ irqentry_state_t state = irqentry_enter(regs); \ u32 vector = (u32)(u8)error_code; \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_irq_on_irqstack_cond(__##func, regs, vector); \ instrumentation_end(); \ irqentry_exit(regs, state); \ @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + __irq_enter_raw(); \ + __##func (regs); \ + __irq_exit_raw(); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - __##func (regs); \ - __irq_exit_raw(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,17 +445,35 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW +#endif + +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } #endif +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + idt_install_sysvec(vector, asm_##function); \ +} + #else /* !__ASSEMBLY__ */ /* @@ -447,7 +500,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - 
idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ @@ -569,6 +622,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); @@ -614,7 +671,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); #endif /* #CP */ -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection); #endif @@ -648,40 +705,62 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); #ifdef CONFIG_SMP DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_X86_POSTED_MSI +DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); +#else +# define fred_sysvec_posted_msi_notification NULL +# endif + #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index b56c5741581a..53e4015242b4 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -35,6 +35,8 @@ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ +/* x86-64 REX2 prefix */ +#define INAT_PFX_REX2 16 /* 0xD5 */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -50,7 +52,7 @@ /* Legacy prefix */ #define 
INAT_PFX_OFFS 0 -#define INAT_PFX_BITS 4 +#define INAT_PFX_BITS 5 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ @@ -77,6 +79,9 @@ #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) +#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) +#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) +#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -128,6 +133,11 @@ static inline int inat_is_rex_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } +static inline int inat_is_rex2_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX2; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) @@ -227,4 +237,9 @@ static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } + +static inline int inat_evex_scalable(insn_attr_t attr) +{ + return attr & INAT_EVEX_SCALABLE; +} #endif diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5f1d3c421f68..14d72727d7ee 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,8 +2,11 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +#define __head __section(".head.text") + struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void (*free_pgt_page)(void *, void *); /* free buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ @@ -14,4 +17,6 @@ struct x86_mapping_info { int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend); +void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd); + #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 1b29f58f730f..7152ea809e6a 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -112,10 +112,15 @@ struct insn { #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) -#define X86_REX_W(rex) ((rex) & 8) -#define X86_REX_R(rex) ((rex) & 4) -#define X86_REX_X(rex) ((rex) & 2) -#define X86_REX_B(rex) ((rex) & 1) +#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ +#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ +#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ +#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ + +#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ +#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ +#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ +#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ @@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); +static inline int insn_is_rex2(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return insn->rex_prefix.nbytes == 2; +} + +static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) +{ + return X86_REX2_M(insn->rex_prefix.bytes[1]); +} + static inline int 
insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_vex_w_bit(struct insn *insn) +{ + if (insn->vex_prefix.nbytes < 3) + return 0; + return X86_VEX_W(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index b3af2d45bbbb..44949f972826 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -10,7 +10,7 @@ * that group keep the CPUID for the variants sorted by model number. * * The defined symbol names have the following form: - * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF} * where: * OPTFAMILY Describes the family of CPUs that this belongs to. Default * is assumed to be "_CORE" (and should be omitted). Other values @@ -27,6 +27,7 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts * _S - other client parts * * Historical OPTDIFFs: @@ -39,131 +40,142 @@ * their own names :-( */ -/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -#define INTEL_FAM6_ANY X86_MODEL_ANY +#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) -#define INTEL_FAM6_CORE_YONAH 0x0E +/* Wildcard match so X86_MATCH_VFM(ANY) works */ +#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) -#define INTEL_FAM6_CORE2_MEROM 0x0F -#define INTEL_FAM6_CORE2_MEROM_L 0x16 -#define INTEL_FAM6_CORE2_PENRYN 0x17 -#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D +#define INTEL_PENTIUM_PRO IFM(6, 0x01) -#define INTEL_FAM6_NEHALEM 0x1E -#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ -#define INTEL_FAM6_NEHALEM_EP 0x1A -#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_CORE_YONAH IFM(6, 0x0E) -#define INTEL_FAM6_WESTMERE 0x25 -#define INTEL_FAM6_WESTMERE_EP 0x2C -#define INTEL_FAM6_WESTMERE_EX 0x2F +#define INTEL_CORE2_MEROM IFM(6, 0x0F) +#define INTEL_CORE2_MEROM_L IFM(6, 0x16) +#define INTEL_CORE2_PENRYN IFM(6, 0x17) +#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) -#define INTEL_FAM6_SANDYBRIDGE 0x2A -#define INTEL_FAM6_SANDYBRIDGE_X 0x2D -#define INTEL_FAM6_IVYBRIDGE 0x3A -#define INTEL_FAM6_IVYBRIDGE_X 0x3E +#define INTEL_NEHALEM IFM(6, 0x1E) +#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ +#define INTEL_NEHALEM_EP IFM(6, 0x1A) +#define INTEL_NEHALEM_EX IFM(6, 0x2E) -#define INTEL_FAM6_HASWELL 0x3C -#define INTEL_FAM6_HASWELL_X 0x3F -#define INTEL_FAM6_HASWELL_L 0x45 -#define INTEL_FAM6_HASWELL_G 0x46 +#define INTEL_WESTMERE IFM(6, 0x25) +#define INTEL_WESTMERE_EP IFM(6, 0x2C) +#define INTEL_WESTMERE_EX IFM(6, 0x2F) -#define INTEL_FAM6_BROADWELL 0x3D -#define INTEL_FAM6_BROADWELL_G 0x47 -#define INTEL_FAM6_BROADWELL_X 0x4F -#define INTEL_FAM6_BROADWELL_D 0x56 +#define INTEL_SANDYBRIDGE IFM(6, 0x2A) +#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) +#define INTEL_IVYBRIDGE IFM(6, 0x3A) +#define INTEL_IVYBRIDGE_X IFM(6, 0x3E) -#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ +#define INTEL_HASWELL IFM(6, 0x3C) +#define INTEL_HASWELL_X IFM(6, 0x3F) +#define INTEL_HASWELL_L IFM(6, 0x45) +#define INTEL_HASWELL_G IFM(6, 0x46) + +#define INTEL_BROADWELL IFM(6, 0x3D) +#define INTEL_BROADWELL_G IFM(6, 0x47) 
+#define INTEL_BROADWELL_X IFM(6, 0x4F) +#define INTEL_BROADWELL_D IFM(6, 0x56) + +#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ +#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ +#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ -#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ +#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ -#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ +#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ -#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ -#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ +#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ +#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ -#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ +#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ -#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ +#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ +#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ +#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ +#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ +#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ -#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ +#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ +#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ -#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ -#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ +#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) -#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) +#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) -#define INTEL_FAM6_GRANITERAPIDS_X 0xAD -#define INTEL_FAM6_GRANITERAPIDS_D 0xAE +/* "Hybrid" Processors (P-Core/E-Core) */ -#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_N 0xBE +#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ -#define INTEL_FAM6_RAPTORLAKE 0xB7 -#define INTEL_FAM6_RAPTORLAKE_P 0xBA -#define INTEL_FAM6_RAPTORLAKE_S 0xBF +#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_METEORLAKE 0xAC -#define INTEL_FAM6_METEORLAKE_L 0xAA +#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ +#define INTEL_RAPTORLAKE_P IFM(6, 0xBA) +#define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_FAM6_LUNARLAKE_M 0xBD +#define INTEL_METEORLAKE IFM(6, 0xAC) +#define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) +#define INTEL_ARROWLAKE IFM(6, 0xC6) +#define INTEL_ARROWLAKE_U IFM(6, 0xB5) + +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* "Small Core" Processors (Atom/E-Core) */ -#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ -#define INTEL_FAM6_ATOM_BONNELL_MID 
0x26 /* Silverthorne, Lincroft */ +#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ +#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ -#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ -#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ -#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ +#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ +#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ +#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ -#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ -#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ +#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ +#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ +#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ -#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ -#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ +#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ +#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ +#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ +#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ +#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ -#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ +#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ + +#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ +#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ +#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ -#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ -#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ -#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ -#define INTEL_FAM6_SIERRAFOREST_X 0xAF +#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ +#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ -#define INTEL_FAM6_GRANDRIDGE 0xB6 +#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ -#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ -#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ +#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ /* Family 5 */ #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ +#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 2f9eeb5c3069..5dbeac48a5b9 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -9,6 +9,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 #define MAX_PEBS_EVENTS 32 +#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0) #define MAX_FIXED_PEBS_EVENTS 16 /* diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h deleted file mode 100644 index 994638ef171b..000000000000 --- a/arch/x86/include/asm/intel_pconfig.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _ASM_X86_INTEL_PCONFIG_H -#define 
_ASM_X86_INTEL_PCONFIG_H - -#include <asm/asm.h> -#include <asm/processor.h> - -enum pconfig_target { - INVALID_TARGET = 0, - MKTME_TARGET = 1, - PCONFIG_TARGET_NR -}; - -int pconfig_target_supported(enum pconfig_target target); - -enum pconfig_leaf { - MKTME_KEY_PROGRAM = 0, - PCONFIG_LEAF_INVALID, -}; - -#define PCONFIG ".byte 0x0f, 0x01, 0xc5" - -/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */ - -/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */ -#define MKTME_KEYID_SET_KEY_DIRECT 0 -#define MKTME_KEYID_SET_KEY_RANDOM 1 -#define MKTME_KEYID_CLEAR_KEY 2 -#define MKTME_KEYID_NO_ENCRYPT 3 - -/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */ -#define MKTME_AES_XTS_128 (1 << 8) - -/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */ -#define MKTME_PROG_SUCCESS 0 -#define MKTME_INVALID_PROG_CMD 1 -#define MKTME_ENTROPY_ERROR 2 -#define MKTME_INVALID_KEYID 3 -#define MKTME_INVALID_ENC_ALG 4 -#define MKTME_DEVICE_BUSY 5 - -/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */ -struct mktme_key_program { - u16 keyid; - u32 keyid_ctrl; - u8 __rsvd[58]; - u8 key_field_1[64]; - u8 key_field_2[64]; -} __packed __aligned(256); - -static inline int mktme_key_program(struct mktme_key_program *key_program) -{ - unsigned long rax = MKTME_KEY_PROGRAM; - - if (!pconfig_target_supported(MKTME_TARGET)) - return -ENXIO; - - asm volatile(PCONFIG - : "=a" (rax), "=b" (key_program) - : "0" (rax), "1" (key_program) - : "memory", "cc"); - - return rax; -} - -#endif /* _ASM_X86_INTEL_PCONFIG_H */ diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h deleted file mode 100644 index 8537f597d20a..000000000000 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_SCU_IPC_H_ -#define _ASM_X86_INTEL_SCU_IPC_H_ - -#include <linux/ioport.h> - -struct device; -struct intel_scu_ipc_dev; - -/** - * struct intel_scu_ipc_data - Data used to configure SCU IPC - * @mem: Base address of SCU IPC MMIO registers - * @irq: The IRQ number used for SCU (optional) - */ -struct intel_scu_ipc_data { - struct resource mem; - int irq; -}; - -struct intel_scu_ipc_dev * -__intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data, - struct module *owner); - -#define intel_scu_ipc_register(parent, scu_data) \ - __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) - -void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu); - -struct intel_scu_ipc_dev * -__devm_intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data, - struct module *owner); - -#define devm_intel_scu_ipc_register(parent, scu_data) \ - __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE) - -struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); -void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); -struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); - -int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, - u8 *data); -int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, - u8 data); -int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, - u8 *data, size_t len); -int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, - u8 *data, size_t len); - -int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr, - u8 data, u8 mask); - -int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, - int sub); 
-int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, - int sub, const void *in, size_t inlen, - size_t size, void *out, size_t outlen); - -static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd, - int sub, const void *in, size_t inlen, - void *out, size_t outlen) -{ - return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen, - inlen, out, outlen); -} - -#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 8046e70dfd7c..43b7657febca 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -10,7 +10,7 @@ #define TELEM_MAX_EVENTS_SRAM 28 #define TELEM_MAX_OS_ALLOCATED_EVENTS 20 -#include <asm/intel_scu_ipc.h> +#include <linux/platform_data/x86/intel_scu_ipc.h> enum telemetry_unit { TELEM_PSS = 0, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9025640f634..1d60427379c9 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include <linux/string.h> #include <linux/compiler.h> #include <linux/cc_platform.h> @@ -45,6 +42,7 @@ #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> #include <asm/shared/io.h> +#include <asm/special_insns.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -212,7 +210,22 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include <asm-generic/iomap.h> +#ifdef CONFIG_X86_64 +/* + * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for + * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy + * loop. + */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + asm volatile("rep ; movsl" + : "=&c"(count), "=&D"(to), "=&S"(from) + : "0"(count), "1"(to), "2"(from) + : "memory"); +} +#define __iowrite32_copy __iowrite32_copy +#endif /* * ISA space is 'always mapped' on a typical x86 system, no need to @@ -247,7 +260,7 @@ static inline void slow_down_io(void) #endif -#define BUILDIO(bwl, bw, type) \ +#define BUILDIO(bwl, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -293,9 +306,9 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, u8) -BUILDIO(w, w, u16) -BUILDIO(l, , u32) +BUILDIO(b, u8) +BUILDIO(w, u16) +BUILDIO(l, u32) #undef BUILDIO #define inb_p inb_p @@ -384,7 +397,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, const u8 *end = from + count * 64; while (from < end) { - movdir64b(dst, from); + movdir64b_io(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 437aa8d00e53..0d806513c4b3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -109,8 +109,8 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* 1 if "noapic" boot option passed */ -extern int skip_ioapic_setup; +/* True if "noapic" boot option passed */ +extern bool ioapic_is_disabled; /* 1 if "noapic" boot option passed */ extern int noioapicquirk; @@ -129,7 +129,7 @@ extern unsigned long io_apic_irqs; * assignment of PCI IRQ's. 
*/ #define io_apic_assign_pci_irqs \ - (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) + (mp_irq_entries && !ioapic_is_disabled && io_apic_irqs) struct irq_cfg; extern void ioapic_insert_resources(void); @@ -140,7 +140,6 @@ extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); -extern void setup_ioapic_ids_from_mpc_nocheck(void); extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); @@ -179,6 +178,7 @@ extern void print_IO_APICs(void); #define IO_APIC_IRQ(x) 0 #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop +#define nr_ioapics (0) static inline void ioapic_insert_resources(void) { } static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 2fd52b65deac..3be2451e7bc8 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -10,6 +10,7 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_merge; extern int panic_on_overflow; +extern bool amd_iommu_snp_en; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index a1911fea8739..af7541c11821 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); * This function will block all kernel access to the PMIC I2C bus, so that the * P-Unit can safely access the PMIC over the shared I2C bus. * - * Note on these systems the i2c-bus driver will request a sempahore from the + * Note on these systems the i2c-bus driver will request a semaphore from the * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing * it, but this does not appear to be sufficient, we still need to avoid making * certain P-Unit requests during the access window to avoid problems. 
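The iosf_mbi.h comment above (the iosf_mbi_punit_acquire() kernel-doc, where the diff only fixes the "sempahore" typo) explains why P-Unit requests must be serialized against other users of the shared PMIC I2C bus. As a rough usage sketch, not part of this diff: a driver brackets its sideband access with the acquire/release pair. iosf_mbi_punit_acquire(), iosf_mbi_punit_release() and iosf_mbi_read() are declared in <asm/iosf_mbi.h>; the port and opcode values below are illustrative placeholders, not constants taken from the header.

#include <linux/types.h>
#include <asm/iosf_mbi.h>

static int example_punit_sideband_read(u32 offset, u32 *val)
{
	int ret;

	/* Takes a mutex, so this pattern is only valid in sleepable context. */
	iosf_mbi_punit_acquire();

	/* 0x04 / 0x10 stand in for a P-Unit port and a register-read opcode. */
	ret = iosf_mbi_read(0x04, 0x10, offset, val);

	iosf_mbi_punit_release();

	return ret;
}

The acquire/release pair is what keeps the PMIC I2C bus quiescent for the duration of the P-Unit request, which is why the kernel-doc stresses it over relying on the P-Unit semaphore taken by the i2c-bus driver alone.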
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 29e083b92813..194dfff84cb1 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -29,7 +29,7 @@ struct irq_desc; extern void fixup_irqs(void); -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); #endif @@ -42,7 +42,7 @@ extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC void arch_trigger_cpumask_backtrace(const struct cpumask *mask, - bool exclude_self); + int exclude_cpu); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7a2ed154a5e1..5036f13ab69f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -50,6 +50,13 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } +extern bool enable_posted_msi; + +static inline bool posted_msi_supported(void) +{ + return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP); +} + #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 798183867d78..b71ad173f877 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -100,7 +100,7 @@ } #define ASM_CALL_ARG0 \ - "call %P[__func] \n" \ + "call %c[__func] \n" \ ASM_REACHABLE #define ASM_CALL_ARG1 \ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 43dcb9284208..47051871b436 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... LOCAL_TIMER_VECTOR-1 - * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts + * Vectors 129 ... FIRST_SYSTEM_VECTOR-1 : device interrupts + * Vectors FIRST_SYSTEM_VECTOR ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -35,13 +35,6 @@ */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for - * triggering cleanup after irq migration. 0x21-0x2f will still be used - * for device interrupts. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - #define IA32_SYSCALL_VECTOR 0x80 /* @@ -91,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef @@ -106,10 +97,16 @@ #define LOCAL_TIMER_VECTOR 0xec +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. 
+ */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + #define NR_VECTORS 256 #ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR #else #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 800ffce0db29..6b4d36c95165 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } -extern void arch_irq_work_raise(void); #else static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 8c5ae649d2df..cf7fc2b8e3ce 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -54,6 +54,26 @@ static __always_inline void native_halt(void) asm volatile("hlt": : :"memory"); } +static __always_inline int native_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & X86_EFLAGS_IF); +} + +static __always_inline unsigned long native_local_irq_save(void) +{ + unsigned long flags = native_save_fl(); + + native_irq_disable(); + + return flags; +} + +static __always_inline void native_local_irq_restore(unsigned long flags) +{ + if (!native_irqs_disabled_flags(flags)) + native_irq_enable(); +} + #endif #ifdef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..cbbef32517f0 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -24,7 +24,7 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes] # objtool NOPs this \n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (2 | branch) : : l_yes); @@ -38,7 +38,7 @@ l_yes: static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); @@ -52,7 +52,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5b77bbc28f96..ae5482a2f0ca 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -25,7 +25,6 @@ #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/bootparam.h> struct kimage; @@ -205,10 +204,19 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); +#ifdef CONFIG_CRASH_HOTPLUG +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event + +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); +#define arch_crash_hotplug_support arch_crash_hotplug_support + +unsigned int arch_crash_get_elfcorehdr_size(void); +#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index 8fa6ac0e2d76..d91b37f5b4bb 100644 --- 
a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -64,6 +64,7 @@ static inline bool kmsan_virt_addr_valid(void *addr) { unsigned long x = (unsigned long)addr; unsigned long y = x - __START_KERNEL_map; + bool ret; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { @@ -79,7 +80,21 @@ static inline bool kmsan_virt_addr_valid(void *addr) return false; } - return pfn_valid(x >> PAGE_SHIFT); + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. + */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; } #endif /* !MODULE */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index a2e9317aad49..5939694dfb28 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -113,8 +113,6 @@ struct kprobe_ctlblk { }; extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -extern int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); #else diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e3054e3e46d5..861d080ed4c6 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -9,14 +9,13 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise * to make a definition optional, but in this case the default will * be __static_call_return0. 
*/ KVM_X86_OP(check_processor_compatibility) -KVM_X86_OP(hardware_enable) -KVM_X86_OP(hardware_disable) +KVM_X86_OP(enable_virtualization_cpu) +KVM_X86_OP(disable_virtualization_cpu) KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) @@ -55,8 +54,10 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) +#if IS_ENABLED(CONFIG_HYPERV) KVM_X86_OP_OPTIONAL(flush_remote_tlbs) KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) +#endif KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -83,7 +84,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP_OPTIONAL(hwapic_irr_update) KVM_X86_OP_OPTIONAL(hwapic_isr_update) -KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP_OPTIONAL(load_eoi_exitmap) KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) @@ -101,13 +101,12 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP(request_immediate_exit) -KVM_X86_OP(sched_in) KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) @@ -119,14 +118,15 @@ KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) #endif +KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) -KVM_X86_OP(get_msr_feature) -KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(get_feature_msr) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) @@ -134,6 +134,11 @@ KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); +KVM_X86_OP_OPTIONAL(get_untagged_addr) +KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 6c98f4bb4228..9159bf1a4730 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -9,20 +9,17 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. + * a NULL definition. 
*/ -KVM_X86_PMU_OP(hw_event_available) -KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) -KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early) KVM_X86_PMU_OP(is_valid_msr) KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) KVM_X86_PMU_OP(refresh) KVM_X86_PMU_OP(init) -KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3bc146dfd38d..6d9f763a7bb9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -36,10 +36,19 @@ #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> #include <asm/hyperv-tlfs.h> +#include <asm/reboot.h> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). + */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -113,6 +122,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -125,7 +135,8 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ + | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -150,7 +161,6 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 -#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 @@ -202,6 +212,7 @@ enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, + EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; @@ -245,32 +256,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_RMP_MASK BIT_ULL(31) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_ENC_MASK BIT_ULL(34) +#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) +#define PFERR_GUEST_VMPL_MASK BIT_ULL(36) -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) -#define PFERR_USER_MASK BIT(PFERR_USER_BIT) -#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) -#define PFERR_PK_MASK BIT(PFERR_PK_BIT) -#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK 
BIT_ULL(PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) - -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) +/* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks + * when emulating instructions that triggers implicit access. + */ +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) +/* + * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred + * when the guest was accessing private memory. + */ +#define PFERR_PRIVATE_ACCESS BIT_ULL(49) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -288,13 +298,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -492,8 +502,23 @@ struct kvm_pmc { u8 idx; bool is_paused; bool intr; + /* + * Base value of the PMC counter, relative to the *consumed* count in + * the associated perf_event. This value includes counter updates from + * the perf_event and emulated_count since the last time the counter + * was reprogrammed, but it is *not* the current value as seen by the + * guest or userspace. + * + * The count is relative to the associated perf_event so that KVM + * doesn't need to reprogram the perf_event every time the guest writes + * to the counter. + */ u64 counter; - u64 prev_counter; + /* + * PMC events triggered by KVM emulation that haven't been fully + * processed, i.e. haven't undergone overflow detection. + */ + u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; @@ -506,29 +531,33 @@ struct kvm_pmc { }; /* More counters may conflict with other existing Architectural MSRs */ -#define KVM_INTEL_PMC_MAX_GENERIC 8 -#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) -#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) -#define KVM_PMC_MAX_FIXED 3 -#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) -#define KVM_AMD_PMC_MAX_GENERIC 6 +#define KVM_MAX(a, b) ((a) >= (b) ? 
(a) : (b)) +#define KVM_MAX_NR_INTEL_GP_COUNTERS 8 +#define KVM_MAX_NR_AMD_GP_COUNTERS 6 +#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ + KVM_MAX_NR_AMD_GP_COUNTERS) + +#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 +#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 +#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ + KVM_MAX_NR_AMD_FIXED_COUTNERS) + struct kvm_pmu { u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_mask; + u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_mask; - u64 global_status_mask; + u64 global_ctrl_rsvd; + u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; - struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; + struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS]; + struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS]; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be @@ -544,9 +573,9 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; - u64 pebs_enable_mask; + u64 pebs_enable_rsvd; u64 pebs_data_cfg; - u64 pebs_data_cfg_mask; + u64 pebs_data_cfg_rsvd; /* * If a guest counter is cross-mapped to host counter with different @@ -577,18 +606,12 @@ enum { KVM_DEBUGREG_WONT_EXIT = 2, }; -struct kvm_mtrr_range { - u64 base; - u64 mask; - struct list_head node; -}; - struct kvm_mtrr { - struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; - mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + u64 var[KVM_NR_VAR_MTRR * 2]; + u64 fixed_64k; + u64 fixed_16k[2]; + u64 fixed_4k[8]; u64 deftype; - - struct list_head head; }; /* Hyper-V SynIC timer */ @@ -680,6 +703,7 @@ struct kvm_hypervisor_cpuid { u32 limit; }; +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -702,6 +726,7 @@ struct kvm_vcpu_xen { struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; +#endif struct kvm_queued_exception { bool pending; @@ -746,7 +771,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -830,6 +854,26 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + bool is_amd_compatible; + + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header. The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. 
+ */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; u64 reserved_gpa_bits; int maxphyaddr; @@ -910,10 +954,13 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; +#ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#endif +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; @@ -946,9 +993,6 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; - /* set at EPT violation at this point */ - unsigned long exit_qualification; - /* pv related host specific info */ struct { bool pv_unhalted; @@ -1005,7 +1049,7 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* @@ -1067,6 +1111,7 @@ enum hv_tsc_page_status { HV_TSC_PAGE_BROKEN, }; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -1097,9 +1142,11 @@ struct kvm_hv { */ unsigned int synic_auto_eoi_used; - struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; + + bool xsaves_xsavec_checked; }; +#endif struct msr_bitmap_range { u32 flags; @@ -1108,6 +1155,7 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; @@ -1119,6 +1167,7 @@ struct kvm_xen { struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1154,7 +1203,7 @@ enum kvm_apicv_inhibit { * APIC acceleration is disabled by a module parameter * and/or not supported in hardware. */ - APICV_INHIBIT_REASON_DISABLE, + APICV_INHIBIT_REASON_DISABLED, /* * APIC acceleration is inhibited because AutoEOI feature is @@ -1224,14 +1273,37 @@ enum kvm_apicv_inhibit { * mapping between logical ID and vCPU. */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + + NR_APICV_INHIBIT_REASONS, }; +#define __APICV_INHIBIT_REASON(reason) \ + { BIT(APICV_INHIBIT_REASON_##reason), #reason } + +#define APICV_INHIBIT_REASONS \ + __APICV_INHIBIT_REASON(DISABLED), \ + __APICV_INHIBIT_REASON(HYPERV), \ + __APICV_INHIBIT_REASON(ABSENT), \ + __APICV_INHIBIT_REASON(BLOCKIRQ), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ + __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ + __APICV_INHIBIT_REASON(NESTED), \ + __APICV_INHIBIT_REASON(IRQWIN), \ + __APICV_INHIBIT_REASON(PIT_REINJ), \ + __APICV_INHIBIT_REASON(SEV), \ + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) + struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; u8 mmu_valid_gen; + u8 vm_type; + bool has_private_mem; + bool has_protected_state; + bool pre_fault_allowed; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; @@ -1247,8 +1319,9 @@ struct kvm_arch { * create an NX huge page (without hanging the guest). */ struct list_head possible_nx_huge_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. 
For simplicity, the unsync @@ -1257,7 +1330,8 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; + u64 shadow_mmio_value; + struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA @@ -1305,6 +1379,8 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; + u64 apic_bus_cycle_ns; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1318,8 +1394,13 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; +#ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; +#endif + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; @@ -1377,9 +1458,8 @@ struct kvm_arch { * the MMU lock in read mode + RCU or * the MMU lock in write mode * - * For writes, this list is protected by: - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode + * For writes, this list is protected by tdp_mmu_pages_lock; see + * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the @@ -1396,11 +1476,12 @@ struct kvm_arch { * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU - * It is acceptable, but not necessary, to acquire this lock when - * the thread holds the MMU lock in write mode. + * Because the lock is only taken within the MMU lock, strictly + * speaking it is redundant to acquire this lock when the thread + * holds the MMU lock in write mode. However it often simplifies + * the code to do so. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1410,9 +1491,19 @@ struct kvm_arch { */ bool shadow_root_allocated; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * If set, the VM has (or had) an external write tracking user, and + * thus all write tracking metadata has been allocated, even if KVM + * itself isn't using write tracking. + */ + bool external_write_tracking_enabled; +#endif + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; + struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures @@ -1536,8 +1627,10 @@ struct kvm_x86_ops { int (*check_processor_compatibility)(void); - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; + void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1585,9 +1678,11 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); +#if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); +#endif /* * Flush any TLB entries associated with the given GVA. 
@@ -1604,7 +1699,8 @@ struct kvm_x86_ops { void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, + bool force_immediate_exit); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1624,20 +1720,20 @@ struct kvm_x86_ops { /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* - * Attempt to pend a virtual NMI in harware. Returns %true on success + * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback. */ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + + const bool x2apic_icr_is_split; const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(int isr); - bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); @@ -1655,8 +1751,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. Intended to @@ -1672,10 +1768,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); - /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero * value indicates CPU dirty logging is unsupported or disabled. 
@@ -1691,6 +1783,7 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); @@ -1707,6 +1800,7 @@ struct kvm_x86_ops { void (*enable_smi_window)(struct kvm_vcpu *vcpu); #endif + int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); @@ -1714,10 +1808,10 @@ struct kvm_x86_ops { int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(struct kvm_msr_entry *entry); + int (*get_feature_msr)(u32 msr, u64 *data); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); @@ -1732,6 +1826,12 @@ struct kvm_x86_ops { * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); }; struct kvm_x86_nested_ops { @@ -1739,7 +1839,7 @@ struct kvm_x86_nested_ops { bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); - bool (*has_events)(struct kvm_vcpu *vcpu); + bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@ -1769,14 +1869,17 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; + u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; -extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; extern struct kvm_x86_ops kvm_x86_ops; +#define kvm_x86_call(func) static_call(kvm_x86_##func) +#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) + #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP @@ -1795,18 +1898,38 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +#if IS_ENABLED(CONFIG_HYPERV) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && - !static_call(kvm_x86_flush_remote_tlbs)(kvm)) + !kvm_x86_call(flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE +static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, + u64 
nr_pages) +{ + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; + + return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages); +} +#endif /* CONFIG_HYPERV */ + +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. */ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; + +/* Enable perf NMI and timer modes to work, and minimise false positives. */ #define kvm_arch_pmi_in_guest(vcpu) \ - ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ + (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); @@ -1814,9 +1937,12 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1833,9 +1959,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -1936,6 +2062,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); @@ -1969,7 +2097,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); @@ -2012,7 +2140,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); @@ -2043,10 +2179,15 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, kvm_set_or_clear_apicv_inhibit(kvm, reason, 
false); } +unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, + unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, + int op_64_bit, int cpl); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots); @@ -2056,6 +2197,15 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); + +#ifdef CONFIG_KVM_PRIVATE_MEM +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) +#else +#define kvm_arch_has_private_mem(kvm) false +#endif + +#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -2103,20 +2253,20 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -# define KVM_ADDRESS_SPACE_NUM 2 +# define KVM_MAX_NR_ADDRESS_SPACES 2 +/* SMM is currently unsupported for guests with private memory. */ +# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif -#define KVM_ARCH_WANT_MMU_NOTIFIER - int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); @@ -2157,7 +2307,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); @@ -2179,12 +2328,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_blocking)(vcpu); + kvm_x86_call(vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); + kvm_x86_call(vcpu_unblocking)(vcpu); } static inline int kvm_cpu_get_apicid(int mps_cpu) @@ -2209,7 +2358,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ - KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL) /* * KVM previously used a u32 field in kvm_run to indicate the hypercall was diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H 
#define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. * @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. * - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. 
+ */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index 511b35069187..f163176d6f7f 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -4,8 +4,6 @@ #include <linux/percpu.h> -extern struct clocksource kvm_clock; - DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 97a3de7892d3..dc31b13b87a0 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,6 +8,14 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) +#ifdef CONFIG_64BIT +/* + * The generic version tends to create spurious ENDBR instructions under + * certain conditions. + */ +#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; }) +#endif + #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ @@ -32,27 +40,27 @@ #ifdef __ASSEMBLY__ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define RET ret; int3 #else #define RET ret #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #else /* __ASSEMBLY__ */ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ #endif /* __ASSEMBLY__ */ @@ -97,6 +105,13 @@ CFI_POST_PADDING \ SYM_FUNC_END(__cfi_##name) +/* UML needs to be able to override memcpy() and friends for KASAN. 
*/ +#ifdef CONFIG_UML +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK +#else +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS +#endif + /* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ #define SYM_TYPED_FUNC_START(name) \ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 56d4ef604b91..59aa966dc212 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -127,36 +127,47 @@ static inline long local_cmpxchg(local_t *l, long old, long new) static inline bool local_try_cmpxchg(local_t *l, long *old, long new) { - typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old; - return try_cmpxchg_local(&l->a.counter, __old, new); + return try_cmpxchg_local(&l->a.counter, + (typeof(l->a.counter) *) old, new); } -/* Always has a lock prefix */ -#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) +/* + * Implement local_xchg using CMPXCHG instruction without the LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +static __always_inline long +local_xchg(local_t *l, long n) +{ + long c = local_read(l); + + do { } while (!local_try_cmpxchg(l, &c, n)); + + return c; +} /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. */ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 180b1cbfcc4e..3b9970117a0f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -13,6 +13,7 @@ #define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ #define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ #define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) @@ -25,6 +26,7 @@ #define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ #define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */ +#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */ /* MCG_EXT_CTL register defines */ #define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ @@ -219,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); +void mce_prep_record(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); @@ -245,7 +247,7 @@ static inline void cmci_recheck(void) {} int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); @@ -259,7 +261,8 @@ enum mcp_flags { MCP_DONTLOG = BIT(2), /* only clear, don't log */ MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ }; -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); + +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); @@ -311,6 +314,7 @@ enum smca_bank_types { SMCA_PIE, /* Power, Interrupts, etc. 
*/ SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_PSP_V2, @@ -326,6 +330,8 @@ enum smca_bank_types { SMCA_SHUB, /* System HUB Unit */ SMCA_SATA, /* SATA Unit */ SMCA_USB, /* USB Unit */ + SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ + SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ @@ -333,7 +339,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 7f97a8a97e24..f922b682b9b4 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,12 +15,15 @@ #include <linux/init.h> #include <linux/cc_platform.h> -#include <asm/bootparam.h> +#include <asm/asm.h> +struct boot_params; #ifdef CONFIG_X86_MEM_ENCRYPT void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); #else static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } #endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -43,20 +46,24 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); -void __init sme_encrypt_kernel(struct boot_params *bp); -void __init sme_enable(struct boot_params *bp); +void sme_encrypt_kernel(struct boot_params *bp); +void sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); -void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, - bool enc); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, + unsigned long size, bool enc); void __init mem_encrypt_free_decrypted_mem(void); void __init sev_es_init_vc_handling(void); +static inline u64 sme_get_me_mask(void) +{ + return RIP_REL_REF(sme_me_mask); +} + #define __bss_decrypted __section(".bss..decrypted") #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -73,10 +80,9 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } -static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } -static inline void __init sme_enable(struct boot_params *bp) { } +static inline void sme_encrypt_kernel(struct boot_params *bp) { } +static inline void sme_enable(struct boot_params *bp) { } static inline void sev_es_init_vc_handling(void) { } @@ -85,10 +91,12 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline void __init -early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} +early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } +static inline u64 sme_get_me_mask(void) { return 0; } + #define __bss_decrypted #endif /* 
CONFIG_AMD_MEM_ENCRYPT */ @@ -106,11 +114,6 @@ void add_encrypt_protection_map(void); extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -static inline u64 sme_get_me_mask(void) -{ - return sme_me_mask; -} - #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 66dbba181bd9..695e569159c1 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -2,138 +2,88 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H -#include <asm/cpu.h> -#include <linux/earlycpio.h> -#include <linux/initrd.h> -#include <asm/microcode_amd.h> - -struct ucode_patch { - struct list_head plist; - void *data; /* Intel uses only this one */ - unsigned int size; - u32 patch_id; - u16 equiv_cpu; -}; - -extern struct list_head microcode_cache; - struct cpu_signature { unsigned int sig; unsigned int pf; unsigned int rev; }; -struct device; - -enum ucode_state { - UCODE_OK = 0, - UCODE_NEW, - UCODE_UPDATED, - UCODE_NFOUND, - UCODE_ERROR, +struct ucode_cpu_info { + struct cpu_signature cpu_sig; + void *mc; }; -struct microcode_ops { - enum ucode_state (*request_microcode_fw) (int cpu, struct device *); - - void (*microcode_fini_cpu) (int cpu); +#ifdef CONFIG_MICROCODE +void load_ucode_bsp(void); +void load_ucode_ap(void); +void microcode_bsp_resume(void); +#else +static inline void load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline void microcode_bsp_resume(void) { } +#endif - /* - * The generic 'microcode_core' part guarantees that - * the callbacks below run on a target cpu when they - * are being called. - * See also the "Synchronization" section in microcode_core.c. - */ - enum ucode_state (*apply_microcode) (int cpu); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); +extern unsigned long initrd_start_early; + +#ifdef CONFIG_CPU_SUP_INTEL +/* Intel specific microcode defines. Public for IFS */ +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int metasize; + unsigned int min_req_ver; + unsigned int reserved; }; -struct ucode_cpu_info { - struct cpu_signature cpu_sig; - void *mc; +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[]; }; -extern struct ucode_cpu_info ucode_cpu_info[]; -struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); -#ifdef CONFIG_MICROCODE_INTEL -extern struct microcode_ops * __init init_intel_microcode(void); -#else -static inline struct microcode_ops * __init init_intel_microcode(void) -{ - return NULL; -} -#endif /* CONFIG_MICROCODE_INTEL */ +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define MC_HEADER_TYPE_MICROCODE 1 +#define MC_HEADER_TYPE_IFS 2 -#ifdef CONFIG_MICROCODE_AMD -extern struct microcode_ops * __init init_amd_microcode(void); -extern void __exit exit_amd_microcode(void); -#else -static inline struct microcode_ops * __init init_amd_microcode(void) +static inline int intel_microcode_get_datasize(struct microcode_header_intel *hdr) { - return NULL; + return hdr->datasize ? 
: DEFAULT_UCODE_DATASIZE; } -static inline void __exit exit_amd_microcode(void) {} -#endif -#define MAX_UCODE_COUNT 128 - -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') - -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) - -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_cpuid_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_cpuid_vendor() to get vendor id for AP. - * - * x86_cpuid_vendor() gets vendor information directly from CPUID. - */ -static inline int x86_cpuid_vendor(void) +static inline u32 intel_get_microcode_revision(void) { - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; + u32 rev, dummy; - native_cpuid(&eax, &ebx, &ecx, &edx); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); - return X86_VENDOR_UNKNOWN; + return rev; } +#endif /* !CONFIG_CPU_SUP_INTEL */ -static inline unsigned int x86_cpuid_family(void) -{ - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); +bool microcode_nmi_handler(void); +void microcode_offline_nmi_handler(void); - return x86_family(eax); +#ifdef CONFIG_MICROCODE_LATE_LOADING +DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable); +static __always_inline bool microcode_nmi_handler_enabled(void) +{ + return static_branch_unlikely(&microcode_nmi_handler_enable); } - -#ifdef CONFIG_MICROCODE -extern void __init load_ucode_bsp(void); -extern void load_ucode_ap(void); -void reload_early_microcode(unsigned int cpu); -extern bool initrd_gone; -void microcode_bsp_resume(void); #else -static inline void __init load_ucode_bsp(void) { } -static inline void load_ucode_ap(void) { } -static inline void reload_early_microcode(unsigned int cpu) { } -static inline void microcode_bsp_resume(void) { } +static __always_inline bool microcode_nmi_handler_enabled(void) { return false; } #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h deleted file mode 100644 index 9675c621c1ca..000000000000 --- a/arch/x86/include/asm/microcode_amd.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_AMD_H -#define _ASM_X86_MICROCODE_AMD_H - -#include <asm/microcode.h> - -#define UCODE_MAGIC 0x00414d44 -#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 -#define UCODE_UCODE_TYPE 0x00000001 - -#define SECTION_HDR_SIZE 8 -#define CONTAINER_HDR_SZ 12 - -struct equiv_cpu_entry { - u32 installed_cpu; - u32 fixed_errata_mask; - u32 fixed_errata_compare; - u16 equiv_cpu; - u16 res; -} __attribute__((packed)); - -struct microcode_header_amd { - u32 data_code; - u32 patch_id; - u16 mc_patch_data_id; - u8 mc_patch_data_len; - u8 init_flag; - u32 mc_patch_data_checksum; - u32 nb_dev_id; - u32 sb_dev_id; - u16 processor_rev_id; 
- u8 nb_rev_id; - u8 sb_rev_id; - u8 bios_api_rev; - u8 reserved1[3]; - u32 match_reg[8]; -} __attribute__((packed)); - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[]; -}; - -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - -#ifdef CONFIG_MICROCODE_AMD -extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(unsigned int family); -extern int __init save_microcode_in_initrd_amd(unsigned int family); -void reload_ucode_amd(unsigned int cpu); -extern void amd_check_microcode(void); -#else -static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(unsigned int family) {} -static inline int __init -save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } -static inline void reload_ucode_amd(unsigned int cpu) {} -static inline void amd_check_microcode(void) {} -#endif -#endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h deleted file mode 100644 index f1fa979e05bf..000000000000 --- a/arch/x86/include/asm/microcode_intel.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_INTEL_H -#define _ASM_X86_MICROCODE_INTEL_H - -#include <asm/microcode.h> - -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int metasize; - unsigned int reserved[2]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[]; -}; - -#define DEFAULT_UCODE_DATASIZE (2000) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define MC_HEADER_TYPE_MICROCODE 1 -#define MC_HEADER_TYPE_IFS 2 - -#define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.totalsize : \ - DEFAULT_UCODE_TOTALSIZE) - -#define get_datasize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? 
\ - ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -static inline u32 intel_get_microcode_revision(void) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); - - return rev; -} - -#ifdef CONFIG_MICROCODE_INTEL -extern void __init load_ucode_intel_bsp(void); -extern void load_ucode_intel_ap(void); -extern void show_ucode_info_early(void); -extern int __init save_microcode_in_initrd_intel(void); -void reload_ucode_intel(void); -#else -static inline __init void load_ucode_intel_bsp(void) {} -static inline void load_ucode_intel_ap(void) {} -static inline void show_ucode_info_early(void) {} -static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } -static inline void reload_ucode_intel(void) {} -#endif - -#endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h new file mode 100644 index 000000000000..12b820259b9f --- /dev/null +++ b/arch/x86/include/asm/mman.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#define arch_calc_vm_prot_bits(prot, key) ( \ + ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ + ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ + ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ + ((key) & 0x8 ? VM_PKEY_BIT3 : 0)) +#endif + +#include <uapi/asm/mman.h> + +#endif /* __ASM_MMAN_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 0da5c227f490..ce4677b8b735 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -75,7 +75,7 @@ typedef struct { .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } -void leave_mm(int cpu); +void leave_mm(void); #define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 1d29dc791f5a..2886cb668d7f 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { - return mm->context.lam_cr3_mask; + /* + * When switch_mm_irqs_off() is called for a kthread, it may race with + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it + * reads a single value for both. 
+ */ + return READ_ONCE(mm->context.lam_cr3_mask); } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) @@ -186,6 +192,7 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) @@ -231,11 +238,6 @@ static inline bool is_64bit_mm(struct mm_struct *mm) } #endif -static inline void arch_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h deleted file mode 100644 index c41b41edd691..000000000000 --- a/arch/x86/include/asm/mmzone.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_X86_32 -# include <asm/mmzone_32.h> -#else -# include <asm/mmzone_64.h> -#endif diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h deleted file mode 100644 index 2d4515e8b7df..000000000000 --- a/arch/x86/include/asm/mmzone_32.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 - * - */ - -#ifndef _ASM_X86_MMZONE_32_H -#define _ASM_X86_MMZONE_32_H - -#include <asm/smp.h> - -#ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) -#endif /* CONFIG_NUMA */ - -#endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h deleted file mode 100644 index 0c585046f744..000000000000 --- a/arch/x86/include/asm/mmzone_64.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* K8 NUMA support */ -/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ -/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */ -#ifndef _ASM_X86_MMZONE_64_H -#define _ASM_X86_MMZONE_64_H - -#ifdef CONFIG_NUMA - -#include <linux/mmdebug.h> -#include <asm/smp.h> - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - -#endif -#endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e90ac7e9ae2c..d593e52e6635 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H +#include <linux/types.h> #include <asm/mpspec_def.h> #include <asm/x86_init.h> @@ -15,16 +16,14 @@ extern int pic_mode; * Summit or generic (i.e. installer) kernels need lots of bus entries. * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. 
*/ -#if CONFIG_BASE_SMALL == 0 -# define MAX_MP_BUSSES 260 -#else +#ifdef CONFIG_BASE_SMALL # define MAX_MP_BUSSES 32 +#else +# define MAX_MP_BUSSES 260 #endif #define MAX_IRQ_SOURCES 256 -extern unsigned int def_to_bigsmp; - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 @@ -39,9 +38,8 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern unsigned int boot_cpu_physical_apicid; +extern u32 boot_cpu_physical_apicid; extern u8 boot_cpu_apic_version; -extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC extern int smp_found_config; @@ -49,96 +47,31 @@ extern int smp_found_config; # define smp_found_config 0 #endif -static inline void get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(0); -} - -static inline void early_get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(1); -} - -static inline void find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(); -} - #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; -extern void default_find_smp_config(void); -extern void default_get_smp_config(unsigned int early); +extern void mpparse_find_mptable(void); +extern void mpparse_parse_early_smp_config(void); +extern void mpparse_parse_smp_config(void); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } -#define enable_update_mptable 0 -#define default_find_smp_config x86_init_noop -#define default_get_smp_config x86_init_uint_noop +#define enable_update_mptable 0 +#define mpparse_find_mptable x86_init_noop +#define mpparse_parse_early_smp_config x86_init_noop +#define mpparse_parse_smp_config x86_init_noop #endif -int generic_processor_info(int apicid, int version); - -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) - -struct physid_mask { - unsigned long mask[PHYSID_ARRAY_SIZE]; -}; +extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC); -typedef struct physid_mask physid_mask_t; - -#define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_clear(physid, map) clear_bit(physid, (map).mask) -#define physid_isset(physid, map) test_bit(physid, (map).mask) -#define physid_test_and_set(physid, map) \ - test_and_set_bit(physid, (map).mask) - -#define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_clear(map) \ - bitmap_zero((map).mask, MAX_LOCAL_APIC) - -#define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) - -#define physids_empty(map) \ - bitmap_empty((map).mask, MAX_LOCAL_APIC) - -#define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) - -#define physids_weight(map) \ - bitmap_weight((map).mask, MAX_LOCAL_APIC) - -#define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -#define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -static inline unsigned long physids_coerce(physid_mask_t *map) -{ - return map->mask[0]; -} - -static inline void physids_promote(unsigned long physids, physid_mask_t *map) +static inline void reset_phys_cpu_present_map(u32 apicid) { - physids_clear(*map); - map->mask[0] = physids; + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + set_bit(apicid, phys_cpu_present_map); } -static inline void physid_set_mask_of_physid(int physid, 
physid_mask_t *map) +static inline void copy_phys_cpu_present_map(unsigned long *dst) { - physids_clear(*map); - physid_set(physid, *map); + bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC); } -#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } -#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } - -extern physid_mask_t phys_cpu_present_map; - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index fa83d88e4c99..5f0bc6a6d025 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -26,6 +26,7 @@ union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); +DECLARE_STATIC_KEY_FALSE(isolation_type_tdx); typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, @@ -39,7 +40,7 @@ static inline unsigned char hv_get_nmi_reason(void) } #if IS_ENABLED(CONFIG_HYPERV) -extern int hyperv_init_cpuhp; +extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; @@ -47,10 +48,25 @@ extern u64 hv_current_partition_id; extern union hv_ghcb * __percpu *hv_ghcb_pg; +bool hv_isolation_type_snp(void); +bool hv_isolation_type_tdx(void); +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); + +/* + * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA + * to start AP in enlightened SEV guest. + */ +#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define HV_AP_SEGMENT_LIMIT 0xffffffff + int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); +/* + * If the hypercall involves no input or output parameters, the hypervisor + * ignores the corresponding GPA pointer. + */ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? 
virt_to_phys(input) : 0; @@ -58,6 +74,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 hv_status; #ifdef CONFIG_X86_64 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input_address, output_address); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input_address) + : "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + return hv_status; + } + if (!hv_hypercall_pg) return U64_MAX; @@ -101,7 +130,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, 0); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__( + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + :: "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) @@ -146,7 +184,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, input2); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : "r" (input2) + : "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__("mov %4, %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, @@ -225,39 +273,43 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT -void hv_ghcb_msr_write(u64 msr, u64 value); -void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -void hv_vtom_init(void); +int hv_snp_boot_ap(u32 cpu, unsigned long start_ip); #else -static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} -static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline void hv_vtom_init(void) {} +static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; } #endif -extern bool hv_isolation_type_snp(void); +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_vtom_init(void); +void hv_ivm_msr_write(u64 msr, u64 value); +void hv_ivm_msr_read(u64 msr, u64 *value); +#else +static inline void hv_vtom_init(void) {} +static inline void hv_ivm_msr_write(u64 msr, u64 value) {} +static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} +#endif -static inline bool hv_is_synic_reg(unsigned int reg) +static inline bool hv_is_synic_msr(unsigned int reg) { - return (reg >= HV_REGISTER_SCONTROL) && - (reg <= HV_REGISTER_SINT15); + return (reg >= HV_X64_MSR_SCONTROL) && + (reg <= HV_X64_MSR_SINT15); } -static inline bool hv_is_sint_reg(unsigned int reg) +static inline bool hv_is_sint_msr(unsigned int reg) { - return (reg >= HV_REGISTER_SINT0) && - (reg <= HV_REGISTER_SINT15); + return (reg >= HV_X64_MSR_SINT0) && + 
(reg <= HV_X64_MSR_SINT15); } -u64 hv_get_register(unsigned int reg); -void hv_set_register(unsigned int reg, u64 value); -u64 hv_get_non_nested_register(unsigned int reg); -void hv_set_non_nested_register(unsigned int reg, u64 value); +u64 hv_get_msr(unsigned int reg); +void hv_set_msr(unsigned int reg, u64 value); +u64 hv_get_non_nested_msr(unsigned int reg); +void hv_set_non_nested_msr(unsigned int reg, u64 value); -static __always_inline u64 hv_raw_get_register(unsigned int reg) +static __always_inline u64 hv_raw_get_msr(unsigned int reg) { return __rdmsr(reg); } @@ -278,17 +330,19 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, { return -1; } -static inline void hv_set_register(unsigned int reg, u64 value) { } -static inline u64 hv_get_register(unsigned int reg) { return 0; } -static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { } -static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; } +static inline void hv_set_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_msr(unsigned int reg) { return 0; } +static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_VTL_MODE void __init hv_vtl_init_platform(void); +int __init hv_vtl_early_init(void); #else static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } #endif #include <asm-generic/mshyperv.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1d111350197f..3ae84c3b8e6d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -36,8 +36,33 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* + * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc. + * Most MSRs support/allow only a subset of memory types, but the values + * themselves are common across all relevant MSRs. + */ +#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */ +#define X86_MEMTYPE_WC 1ull /* Write Combining */ +/* RESERVED 2 */ +/* RESERVED 3 */ +#define X86_MEMTYPE_WT 4ull /* Write Through */ +#define X86_MEMTYPE_WP 5ull /* Write Protected */ +#define X86_MEMTYPE_WB 6ull /* Write Back */ +#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */ + +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ +/* Intel MSRs. 
Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) @@ -50,10 +75,13 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) /* A mask for bits which the kernel toggles when controlling mitigations */ #define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ - | SPEC_CTRL_RRSBA_DIS_S) + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ @@ -152,6 +180,14 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ #define ARCH_CAP_PBRSB_NO BIT(24) /* * Not susceptible to Post-Barrier * Return Stack Buffer Predictions. @@ -165,10 +201,13 @@ * CPU is not vulnerable to Gather * Data Sampling (GDS). */ - -#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* - * IA32_XAPIC_DISABLE_STATUS MSR - * supported +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. */ #define MSR_IA32_FLUSH_CMD 0x0000010b @@ -222,6 +261,9 @@ #define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) +#define MSR_INTEGRITY_CAPS_SBAF_BIT 8 +#define MSR_INTEGRITY_CAPS_SBAF BIT(MSR_INTEGRITY_CAPS_SBAF_BIT) +#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9) #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 @@ -236,6 +278,11 @@ #define LBR_INFO_CYCLES 0xffff #define LBR_INFO_BR_TYPE_OFFSET 56 #define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) +#define LBR_INFO_BR_CNTR_OFFSET 32 +#define LBR_INFO_BR_CNTR_NUM 4 +#define LBR_INFO_BR_CNTR_BITS 2 +#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0) +#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0) #define MSR_ARCH_LBR_CTL 0x000014ce #define ARCH_LBR_CTL_LBREN BIT(0) @@ -332,6 +379,12 @@ #define MSR_IA32_CR_PAT 0x00000277 +#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) + #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc @@ -535,6 +588,15 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + +/* KeyID partitioning between MKTME and TDX */ +#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 + /* * AMD64 MSRs. 
Not complete. See the architecture manual for a more * complete list. @@ -553,6 +615,7 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 #define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 @@ -584,34 +647,49 @@ #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -/* SNP feature bits enabled by the hypervisor */ -#define MSR_AMD64_SNP_VTOM BIT_ULL(3) -#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4) -#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5) -#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6) -#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7) -#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8) -#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9) -#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10) -#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11) -#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12) -#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14) -#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16) -#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17) - -/* SNP feature bits reserved for future use. */ -#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) -#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) -#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18) +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 + +#define MSR_SVSM_CAA 0xc001f000 + /* AMD 
Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 #define MSR_AMD_CPPC_ENABLE 0xc00102b1 @@ -637,12 +715,21 @@ /* AMD Last Branch Record MSRs */ #define MSR_AMD64_LBR_SELECT 0xc000010e +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 + +/* Fam 19h MSRs */ +#define MSR_F19H_UMC_PERF_CTL 0xc0010800 +#define MSR_F19H_UMC_PERF_CTR 0xc0010801 + +/* Zen 2 */ +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) - /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -689,8 +776,15 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 @@ -717,6 +811,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K7_HWCR_CPB_DIS_BIT 25 +#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 @@ -1083,15 +1179,6 @@ #define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 -/* VMX_BASIC bits and bitmasks */ -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 -#define VMX_BASIC_TRUE_CTLS (1ULL << 55) -#define VMX_BASIC_64 0x0001000000000000LLU -#define VMX_BASIC_MEM_TYPE_SHIFT 50 -#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_MEM_TYPE_WB 6LLU -#define VMX_BASIC_INOUT 0x0040000000000000LLU - /* Resctrl MSRs: */ /* - Intel: */ #define MSR_IA32_L3_QOS_CFG 0xc81 @@ -1100,6 +1187,7 @@ #define MSR_IA32_QM_CTR 0xc8e #define MSR_IA32_PQR_ASSOC 0xc8f #define MSR_IA32_L3_CBM_BASE 0xc90 +#define MSR_RMID_SNC_CONFIG 0xca0 #define MSR_IA32_L2_CBM_BASE 0xd10 #define MSR_IA32_MBA_THRTL_BASE 0xd50 @@ -1108,16 +1196,15 @@ #define MSR_IA32_SMBA_BW_BASE 0xc0000280 #define MSR_IA32_EVT_CFG_BASE 0xc0000400 -/* MSR_IA32_VMX_MISC bits */ -#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) -#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) -#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ - #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd28..001853541f1e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -12,11 +12,13 @@ #include <uapi/asm/msr.h> #include <asm/shared/msr.h> +#include <linux/percpu.h> + 
struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int err; + u32 msr_no; + struct msr reg; + struct msr __percpu *msrs; + int err; }; struct msr_regs_info { @@ -297,6 +299,21 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ +#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) + +/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + /* + * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant + * DS prefix to avoid a trailing NOP. + */ + asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) + "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); +} + /* * 64-bit version of wrmsr_safe(): */ @@ -305,8 +322,8 @@ static inline int wrmsrl_safe(u32 msr, u64 val) return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } -struct msr *msrs_alloc(void); -void msrs_free(struct msr *msrs); +struct msr __percpu *msrs_alloc(void); +void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); @@ -315,8 +332,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); @@ -345,14 +362,14 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 090d658a85a6..4218248083d9 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -69,7 +69,6 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment); extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); -extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); void mtrr_disable(void); @@ -117,7 +116,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } #define mtrr_bp_init() do {} while (0) -#define mtrr_bp_restore() do {} while (0) #define mtrr_disable() do {} while (0) #define mtrr_enable() do {} while (0) #define mtrr_generic_set_state() do 
{} while (0) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 778df05f8539..920426d691ce 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -87,6 +87,15 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, :: "a" (eax), "b" (ebx), "c" (ecx)); } +/* + * Re-enable interrupts right upon calling mwait in such a way that + * no interrupt can fire _before_ the execution of mwait, ie: no + * instruction must be placed between "sti" and "mwait". + * + * This is necessary because if an interrupt queues a timer before + * executing mwait, it would otherwise go unnoticed and the next tick + * would not be reprogrammed accordingly before mwait ever wakes up. + */ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) { mds_idle_clear_cpu_buffers(); @@ -115,8 +124,15 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo } __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); + + if (!need_resched()) { + if (ecx & 1) { + __mwait(eax, ecx); + } else { + __sti_mwait(eax, ecx); + raw_local_irq_disable(); + } + } } current_clr_polling(); } diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5c5f1e56c404..41a0ebb699ec 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -14,9 +14,6 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c55cc243592e..ff5f1ecc7d1e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -49,7 +49,7 @@ * but there is still a cushion vs. the RSB depth. The algorithm does not * claim to be perfect and it can be speculated around by the CPU, but it * is considered that it obfuscates the problem enough to make exploitation - * extremly difficult. 
*/ #define RET_DEPTH_SHIFT 5 #define RSB_RET_STUFF_LOOPS 16 @@ -59,13 +59,13 @@ #ifdef CONFIG_CALL_THUNKS_DEBUG # define CALL_THUNKS_DEBUG_INC_CALLS \ - incq %gs:__x86_call_count; + incq PER_CPU_VAR(__x86_call_count); # define CALL_THUNKS_DEBUG_INC_RETS \ - incq %gs:__x86_ret_count; + incq PER_CPU_VAR(__x86_ret_count); # define CALL_THUNKS_DEBUG_INC_STUFFS \ - incq %gs:__x86_stuffs_count; + incq PER_CPU_VAR(__x86_stuffs_count); # define CALL_THUNKS_DEBUG_INC_CTXSW \ - incq %gs:__x86_ctxsw_count; + incq PER_CPU_VAR(__x86_ctxsw_count); #else # define CALL_THUNKS_DEBUG_INC_CALLS # define CALL_THUNKS_DEBUG_INC_RETS @@ -73,16 +73,13 @@ # define CALL_THUNKS_DEBUG_INC_CTXSW #endif -#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) +#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include <asm/asm-offsets.h> #define CREDIT_CALL_DEPTH \ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); -#define ASM_CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); - #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ @@ -95,20 +92,14 @@ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, %gs:pcpu_hot + X86_call_depth; \ - CALL_THUNKS_DEBUG_INC_CALLS - -#define ASM_INCREMENT_CALL_DEPTH \ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH -#define ASM_CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH -#define INCREMENT_CALL_DEPTH -#define ASM_INCREMENT_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL +#define INCREMENT_CALL_DEPTH #endif /* @@ -158,7 +149,7 @@ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ - ASM_CREDIT_CALL_DEPTH \ + CREDIT_CALL_DEPTH \ CALL_THUNKS_DEBUG_INC_CTXSW #else /* @@ -196,7 +187,7 @@ .macro ANNOTATE_RETPOLINE_SAFE .Lhere_\@: .pushsection .discard.retpoline_safe - .long .Lhere_\@ - . + .long .Lhere_\@ .popsection .endm @@ -208,11 +199,11 @@ /* * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should - * eventually turn into it's own annotation. + * eventually turn into its own annotation. */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -241,7 +232,7 @@ * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else @@ -251,7 +242,7 @@ .endm .macro CALL_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else @@ -271,11 +262,20 @@ .Lskip_rsb_\@: .endm -#ifdef CONFIG_CPU_UNRET_ENTRY -#define CALL_UNTRAIN_RET "call entry_untrain_ret" -#else -#define CALL_UNTRAIN_RET "" +/* + * The CALL to srso_alias_untrain_ret() must be patched in directly at + * the spot where untraining must be done, ie., srso_alias_untrain_ret() + * must be the target of a CALL instruction instead of indirectly + * jumping to a wrapper which then calls it. Therefore, this macro is + * called outside of __UNTRAIN_RET below, for the time being, before the + * kernel can support nested alternatives with arbitrary nesting. 
+ */ +.macro CALL_UNTRAIN_RET +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) + ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \ + "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS #endif +.endm /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the @@ -288,53 +288,63 @@ * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. */ -.macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) +.macro __UNTRAIN_RET ibpb_feature, call_depth_insns +#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + CALL_UNTRAIN_RET + ALTERNATIVE_2 "", \ + "call entry_ibpb", \ibpb_feature, \ + __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm -.macro UNTRAIN_RET_VM -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) || defined(CONFIG_CPU_SRSO) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_IBPB_ON_VMEXIT, \ - __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) -.macro UNTRAIN_RET_FROM_CALL -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \ - defined(CONFIG_CALL_DEPTH_TRACKING) - VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ - __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH -#endif -.endm +#define UNTRAIN_RET_VM \ + __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_FROM_CALL \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) .macro CALL_DEPTH_ACCOUNT -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING ALTERNATIVE "", \ - __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH + __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. 
+ */ +.macro CLEAR_CPU_BUFFERS + ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF +.endm + +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT +.endm +#else +#define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT +#endif + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ - ".long 999b - .\n\t" \ + ".long 999b\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; @@ -342,32 +352,45 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); #else static inline void __x86_return_thunk(void) {} #endif +#ifdef CONFIG_MITIGATION_UNRET_ENTRY extern void retbleed_return_thunk(void); +#else +static inline void retbleed_return_thunk(void) {} +#endif + +extern void srso_alias_untrain_ret(void); + +#ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); +#else +static inline void srso_return_thunk(void) {} +static inline void srso_alias_return_thunk(void) {} +#endif -extern void retbleed_untrain_ret(void); -extern void srso_untrain_ret(void); -extern void srso_alias_untrain_ret(void); +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + extern void (*x86_return_thunk)(void); -#ifdef CONFIG_CALL_DEPTH_TRACKING -extern void __x86_return_skl(void); +extern void __warn_thunk(void); -static inline void x86_set_skl_return_thunk(void) -{ - x86_return_thunk = &__x86_return_skl; -} +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING +extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ ALTERNATIVE("", \ @@ -380,14 +403,14 @@ DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif -#else -static inline void x86_set_skl_return_thunk(void) {} +#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ +static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" -#endif +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; @@ -408,7 +431,7 @@ static inline void x86_set_skl_return_thunk(void) {} /* * Inline asm uses the %V modifier which is only in newer GCC - * which is ensured when CONFIG_RETPOLINE is defined. + * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ # define CALL_NOSPEC \ ALTERNATIVE_2( \ @@ -538,13 +561,14 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +extern u16 mds_verw_sel; + #include <asm/segment.h> /** @@ -571,17 +595,6 @@ static __always_inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - -/** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * * Clear CPU buffers if the corresponding static key is enabled diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..5469d7a7c40f 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -10,15 +10,6 @@ #ifdef CONFIG_NUMA -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* @@ -32,9 +23,6 @@ extern int numa_off; extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; @@ -61,31 +49,20 @@ static inline int numa_cpu_node(int cpu) extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void numa_add_cpu(int cpu); -extern void numa_remove_cpu(int cpu); +extern void numa_add_cpu(unsigned int cpu); +extern void numa_remove_cpu(unsigned int cpu); extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } -static inline void numa_add_cpu(int cpu) { } -static inline void numa_remove_cpu(int cpu) { } +static inline void numa_add_cpu(unsigned int cpu) { } +static inline void numa_remove_cpu(unsigned int cpu) { } static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -void debug_cpumask_set_cpu(int cpu, int node, bool enable); +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); #endif -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -int numa_emu_cmdline(char *str); -#else /* CONFIG_NUMA_EMU */ -static inline int numa_emu_cmdline(char *str) -{ - return -EINVAL; -} -#endif /* CONFIG_NUMA_EMU */ - #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index d18e5c332cb9..1b93ff80b43b 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * virt_addr_valid(kaddr) returns true. 
*/ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index cc6b8e087192..f3d257c45225 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { @@ -54,7 +55,7 @@ static inline void clear_page(void *page) clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, "=D" (page), - "0" (page) + "D" (page) : "cc", "memory", "rax", "rcx"); } diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 86bd4311daf8..52f1b4ff0cc1 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -7,7 +7,7 @@ #include <linux/mem_encrypt.h> /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 +#define PAGE_SHIFT CONFIG_PAGE_SHIFT #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) @@ -31,10 +31,12 @@ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC -#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ - CONFIG_PHYSICAL_ALIGN) +/* Physical address where kernel should be loaded. 
*/ +#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR) #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index b49778664d2b..d4eb9e1d61b8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -6,6 +6,10 @@ #include <asm/paravirt_types.h> +#ifndef __ASSEMBLY__ +struct mm_struct; +#endif + #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> @@ -142,8 +146,7 @@ static inline void write_cr0(unsigned long x) static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, - "mov %%cr2, %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) @@ -154,13 +157,12 @@ static __always_inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, - "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { - PVOP_ALT_VCALL1(mmu.write_cr3, x, - "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) @@ -182,7 +184,7 @@ extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) @@ -390,27 +392,25 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -444,14 +444,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -464,7 +463,7 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } @@ -472,7 +471,7 @@ static inline pud_t __pud(pudval_t val) static inline pudval_t 
pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) @@ -492,8 +491,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } @@ -501,7 +499,7 @@ static inline p4d_t __p4d(p4dval_t val) static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -687,17 +685,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { - PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { - PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) @@ -726,51 +724,25 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 -#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ - asm (".pushsection " #sec ", \"ax\"\n" \ - ".global " #func "\n\t" \ - ".type " #func ", @function\n\t" \ - ASM_FUNC_ALIGN "\n" \ - #func ":\n\t" \ - ASM_ENDBR \ - instr "\n\t" \ - ASM_RET \ - ".size " #func ", . 
- " #func "\n\t" \ - ".popsection") - extern void default_banner(void); +void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, ops, word, algn) \ -771:; \ - ops; \ -772:; \ - .pushsection .parainstructions,"a"; \ - .align algn; \ - word 771b; \ - .byte ptype; \ - .byte 772b-771b; \ - _ASM_ALIGN; \ - .popsection - - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY -#define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), - ANNOTATE_RETPOLINE_SAFE; - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm -#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT(X86_FEATURE_XENPV) +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ + "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ + "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ @@ -778,6 +750,12 @@ extern void default_banner(void); #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop + +#ifndef __ASSEMBLY__ +static inline void native_pv_lock_init(void) +{ +} +#endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4acbcddddc29..8d4fbe1be489 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,25 +2,10 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -#ifndef __ASSEMBLY__ -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; - -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; -#endif - #ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ +#include <linux/types.h> #include <asm/desc_defs.h> #include <asm/pgtable_types.h> @@ -257,43 +242,11 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "m" (pv_ops.op) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - _ASM_ALIGN "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]") - -/* Simple instruction patching code. */ -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" - -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); +#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) int paravirt_disable_iospace(void); -/* - * This generates an indirect call based on the operation type number. 
- * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ +/* This generates an indirect call based on the operation type number. */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" @@ -326,12 +279,6 @@ int paravirt_disable_iospace(void); * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * * Unfortunately there's no way to get gcc to generate the args setup * for the call, and then allow the call itself to be generated by an * inline asm. Because of this, we must do the complete arg setup and @@ -430,14 +377,27 @@ int paravirt_disable_iospace(void); __mask & __eax; \ }) - +/* + * Use alternative patching for paravirt calls: + * - For replacing an indirect call with a direct one, use the "normal" + * ALTERNATIVE() macro with the indirect call as the initial code sequence, + * which will be replaced with the related direct call by using the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + * - In case the replacement is either a direct call or a short code sequence + * depending on a feature bit, the ALTERNATIVE_2() macro is being used. + * The indirect call is the initial code sequence again, while the special + * code sequence is selected with the specified feature bit. In case the + * feature is not active, the direct call is used as above via the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + */ #define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) 
\ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ + asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \ + ALT_CALL_ALWAYS) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -448,10 +408,11 @@ int paravirt_disable_iospace(void); ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ - alt, cond) \ + asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \ + ALT_CALL_INSTR, ALT_CALL_ALWAYS, \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -549,16 +510,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); -void paravirt_flush_lazy_mmu(void); - -void _paravirt_nop(void); -void paravirt_BUG(void); unsigned long paravirt_ret0(void); #ifdef CONFIG_PARAVIRT_XXL u64 _paravirt_ident_64(u64); @@ -568,11 +519,11 @@ void pv_native_irq_enable(void); unsigned long pv_native_read_cr2(void); #endif -#define paravirt_nop ((void *)_paravirt_nop) - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; +#define paravirt_nop ((void *)nop_func) #endif /* __ASSEMBLY__ */ + +#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) + #endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b40c462b4af3..b3ab80a03365 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -10,7 +10,6 @@ #include <linux/numa.h> #include <asm/io.h> #include <asm/memtype.h> -#include <asm/x86_init.h> struct pci_sysdata { int domain; /* PCI domain */ @@ -124,16 +123,4 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif -struct pci_setup_rom { - struct setup_data data; - uint16_t vendor; - uint16_t devid; - uint64_t pcilen; - unsigned long segment; - unsigned long bus; - unsigned long device; - unsigned long function; - uint8_t romdata[]; -}; - #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 34734d730463..c55a79d5feae 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -3,54 +3,102 @@ #define _ASM_X86_PERCPU_H #ifdef CONFIG_X86_64 -#define __percpu_seg gs +# define __percpu_seg gs +# define __percpu_rel (%rip) #else -#define __percpu_seg fs +# define __percpu_seg fs +# define __percpu_rel #endif #ifdef __ASSEMBLY__ #ifdef CONFIG_SMP -#define PER_CPU_VAR(var) %__percpu_seg:var -#else /* ! 
SMP */ -#define PER_CPU_VAR(var) var -#endif /* SMP */ +# define __percpu %__percpu_seg: +#else +# define __percpu +#endif + +#define PER_CPU_VAR(var) __percpu(var)__percpu_rel #ifdef CONFIG_X86_64_SMP -#define INIT_PER_CPU_VAR(var) init_per_cpu__##var +# define INIT_PER_CPU_VAR(var) init_per_cpu__##var #else -#define INIT_PER_CPU_VAR(var) var +# define INIT_PER_CPU_VAR(var) var #endif -#else /* ...!ASSEMBLY */ +#else /* !__ASSEMBLY__: */ -#include <linux/kernel.h> +#include <linux/build_bug.h> #include <linux/stringify.h> +#include <asm/asm.h> #ifdef CONFIG_SMP + +#ifdef CONFIG_CC_HAS_NAMED_AS + +#ifdef __CHECKER__ +# define __seg_gs __attribute__((address_space(__seg_gs))) +# define __seg_fs __attribute__((address_space(__seg_fs))) +#endif + +#ifdef CONFIG_X86_64 +# define __percpu_seg_override __seg_gs +#else +# define __percpu_seg_override __seg_fs +#endif + +#define __percpu_prefix "" + +#else /* !CONFIG_CC_HAS_NAMED_AS: */ + +#define __percpu_seg_override #define __percpu_prefix "%%"__stringify(__percpu_seg)":" + +#endif /* CONFIG_CC_HAS_NAMED_AS */ + +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" #define __my_cpu_offset this_cpu_read(this_cpu_off) /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register. + * + * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit + * kernel, because games are played with CONFIG_X86_64 there and + * sizeof(this_cpu_off) becames 4. */ -#define arch_raw_cpu_ptr(ptr) \ -({ \ - unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ - : "=r" (tcp_ptr__) \ - : "m" (this_cpu_off), "0" (ptr)); \ - (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +#ifndef BUILD_VDSO32_64 +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \ + \ + tcp_ptr__ += (__force unsigned long)(_ptr); \ + (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \ }) #else -#define __percpu_prefix "" +#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; }) #endif +#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel + +#else /* !CONFIG_SMP: */ + +#define __percpu_seg_override +#define __percpu_prefix "" +#define __force_percpu_prefix "" + +#define PER_CPU_VAR(var) (var)__percpu_rel + +#endif /* CONFIG_SMP */ + +#define __my_cpu_type(var) typeof(var) __percpu_seg_override +#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) +#define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) #define __percpu_arg(x) __percpu_prefix "%" #x +#define __force_percpu_arg(x) __force_percpu_prefix "%" #x /* - * Initialized pointers to per-cpu variables needed for the boot + * Initialized pointers to per-CPU variables needed for the boot * processor need to use these macros to get the proper address * offset from __per_cpu_load on SMP. * @@ -60,65 +108,128 @@ extern typeof(var) init_per_cpu_var(var) #ifdef CONFIG_X86_64_SMP -#define init_per_cpu_var(var) init_per_cpu__##var +# define init_per_cpu_var(var) init_per_cpu__##var #else -#define init_per_cpu_var(var) var +# define init_per_cpu_var(var) var #endif -/* For arch-specific code, we can use direct single-insn ops (they - * don't give an lvalue though). */ +/* + * For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). 
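 * (Editor's note, not part of the patch: "direct single-insn ops" means that
 *  e.g. this_cpu_read(var) is intended to compile to a single segment-relative
 *  move -- roughly "mov %gs:var(%rip), %reg" on a 64-bit SMP build -- instead
 *  of first forming a pointer with arch_raw_cpu_ptr() and then dereferencing
 *  it.)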
+ */ -#define __pcpu_type_1 u8 -#define __pcpu_type_2 u16 -#define __pcpu_type_4 u32 -#define __pcpu_type_8 u64 +#define __pcpu_type_1 u8 +#define __pcpu_type_2 u16 +#define __pcpu_type_4 u32 +#define __pcpu_type_8 u64 -#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff)) -#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff)) -#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) -#define __pcpu_cast_8(val) ((u64)(val)) +#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff)) +#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff)) +#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) +#define __pcpu_cast_8(val) ((u64)(val)) -#define __pcpu_op1_1(op, dst) op "b " dst -#define __pcpu_op1_2(op, dst) op "w " dst -#define __pcpu_op1_4(op, dst) op "l " dst -#define __pcpu_op1_8(op, dst) op "q " dst +#define __pcpu_op1_1(op, dst) op "b " dst +#define __pcpu_op1_2(op, dst) op "w " dst +#define __pcpu_op1_4(op, dst) op "l " dst +#define __pcpu_op1_8(op, dst) op "q " dst #define __pcpu_op2_1(op, src, dst) op "b " src ", " dst #define __pcpu_op2_2(op, src, dst) op "w " src ", " dst #define __pcpu_op2_4(op, src, dst) op "l " src ", " dst #define __pcpu_op2_8(op, src, dst) op "q " src ", " dst -#define __pcpu_reg_1(mod, x) mod "q" (x) -#define __pcpu_reg_2(mod, x) mod "r" (x) -#define __pcpu_reg_4(mod, x) mod "r" (x) -#define __pcpu_reg_8(mod, x) mod "r" (x) +#define __pcpu_reg_1(mod, x) mod "q" (x) +#define __pcpu_reg_2(mod, x) mod "r" (x) +#define __pcpu_reg_4(mod, x) mod "r" (x) +#define __pcpu_reg_8(mod, x) mod "r" (x) + +#define __pcpu_reg_imm_1(x) "qi" (x) +#define __pcpu_reg_imm_2(x) "ri" (x) +#define __pcpu_reg_imm_4(x) "ri" (x) +#define __pcpu_reg_imm_8(x) "re" (x) + +#ifdef CONFIG_USE_X86_SEG_SUPPORT + +#define __raw_cpu_read(size, qual, pcp) \ +({ \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \ +}) + +#define __raw_cpu_write(size, qual, pcp, val) \ +do { \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \ +} while (0) + +#define __raw_cpu_read_const(pcp) __raw_cpu_read(, , pcp) -#define __pcpu_reg_imm_1(x) "qi" (x) -#define __pcpu_reg_imm_2(x) "ri" (x) -#define __pcpu_reg_imm_4(x) "ri" (x) -#define __pcpu_reg_imm_8(x) "re" (x) +#else /* !CONFIG_USE_X86_SEG_SUPPORT: */ + +#define __raw_cpu_read(size, qual, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + \ + asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \ + : [val] __pcpu_reg_##size("=", pfo_val__) \ + : [var] "m" (__my_cpu_var(_var))); \ + \ + (typeof(_var))(unsigned long) pfo_val__; \ +}) -#define percpu_to_op(size, qual, op, _var, _val) \ +#define __raw_cpu_write(size, qual, _var, _val) \ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ + \ if (0) { \ typeof(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ - : [var] "+m" (_var) \ + asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \ + : [var] "=m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) +/* + * The generic per-CPU infrastrucutre is not suitable for + * reading const-qualified variables. 
+ */ +#define __raw_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) + +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + +#define __raw_cpu_read_stable(size, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + \ + asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \ + : [val] __pcpu_reg_##size("=", pfo_val__) \ + : [var] "i" (&(_var))); \ + \ + (typeof(_var))(unsigned long) pfo_val__; \ +}) + #define percpu_unary_op(size, qual, op, _var) \ ({ \ asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ - : [var] "+m" (_var)); \ + : [var] "+m" (__my_cpu_var(_var))); \ }) +#define percpu_binary_op(size, qual, op, _var, _val) \ +do { \ + __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ + \ + if (0) { \ + typeof(_var) pto_tmp__; \ + pto_tmp__ = (_val); \ + (void)pto_tmp__; \ + } \ + asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ + : [var] "+m" (__my_cpu_var(_var)) \ + : [val] __pcpu_reg_imm_##size(pto_val__)); \ +} while (0) + /* - * Generate a percpu add to memory instruction and optimize code + * Generate a per-CPU add to memory instruction and optimize code * if one is added or subtracted. */ #define percpu_add_op(size, qual, var, val) \ @@ -126,6 +237,7 @@ do { \ const int pao_ID__ = (__builtin_constant_p(val) && \ ((val) == 1 || (val) == -1)) ? \ (int)(val) : 0; \ + \ if (0) { \ typeof(var) pao_tmp__; \ pao_tmp__ = (val); \ @@ -136,81 +248,93 @@ do { \ else if (pao_ID__ == -1) \ percpu_unary_op(size, qual, "dec", var); \ else \ - percpu_to_op(size, qual, "add", var, val); \ + percpu_binary_op(size, qual, "add", var, val); \ } while (0) -#define percpu_from_op(size, qual, op, _var) \ -({ \ - __pcpu_type_##size pfo_val__; \ - asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ - : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "m" (_var)); \ - (typeof(_var))(unsigned long) pfo_val__; \ -}) - -#define percpu_stable_op(size, op, _var) \ -({ \ - __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ - : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "p" (&(_var))); \ - (typeof(_var))(unsigned long) pfo_val__; \ -}) - /* * Add return operation */ #define percpu_add_return_op(size, qual, _var, _val) \ ({ \ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ + \ asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ __percpu_arg([var])) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) /* - * xchg is implemented using cmpxchg without a lock prefix. xchg is - * expensive due to the implied lock prefix. The processor cannot prefetch - * cachelines if xchg is used. + * raw_cpu_xchg() can use a load-store since + * it is not required to be IRQ-safe. 
*/ -#define percpu_xchg_op(size, qual, _var, _nval) \ +#define raw_percpu_xchg_op(_var, _nval) \ ({ \ - __pcpu_type_##size pxo_old__; \ - __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \ - asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \ - "%[oval]") \ - "\n1:\t" \ - __pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ - "\n\tjnz 1b" \ - : [oval] "=&a" (pxo_old__), \ - [var] "+m" (_var) \ - : [nval] __pcpu_reg_##size(, pxo_new__) \ - : "memory"); \ - (typeof(_var))(unsigned long) pxo_old__; \ + typeof(_var) pxo_old__ = raw_cpu_read(_var); \ + \ + raw_cpu_write(_var, _nval); \ + \ + pxo_old__; \ }) /* - * cmpxchg has no such implied lock semantics as a result it is much - * more efficient for cpu local operations. + * this_cpu_xchg() is implemented using CMPXCHG without a LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +#define this_percpu_xchg_op(_var, _nval) \ +({ \ + typeof(_var) pxo_old__ = this_cpu_read(_var); \ + \ + do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \ + \ + pxo_old__; \ +}) + +/* + * CMPXCHG has no such implied lock semantics as a result it is much + * more efficient for CPU-local operations. */ #define percpu_cmpxchg_op(size, qual, _var, _oval, _nval) \ ({ \ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + \ asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ __percpu_arg([var])) \ : [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ + \ (typeof(_var))(unsigned long) pco_old__; \ }) +#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \ + __pcpu_type_##size pco_old__ = *pco_oval__; \ + __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + \ + asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ + __percpu_arg([var])) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [oval] "+a" (pco_old__), \ + [var] "+m" (__my_cpu_var(_var)) \ + : [nval] __pcpu_reg_##size(, pco_new__) \ + : "memory"); \ + if (unlikely(!success)) \ + *pco_oval__ = pco_old__; \ + \ + likely(success); \ +}) + #if defined(CONFIG_X86_32) && !defined(CONFIG_UML) + #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \ ({ \ union { \ @@ -223,25 +347,64 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "esi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) -#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) -#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) -#endif +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) + +#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u64 *_oval = (u64 *)(_ovalp); \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ 
+ old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + \ + likely(success); \ +}) + +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval) + +#endif /* defined(CONFIG_X86_32) && !defined(CONFIG_UML) */ #ifdef CONFIG_X86_64 -#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); -#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); + +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval); +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval); #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \ ({ \ @@ -255,156 +418,199 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (_var), \ + : [var] "+m" (__my_cpu_var(_var)), \ "+a" (old__.low), \ "+d" (old__.high) \ : "b" (new__.low), \ - "c" (new__.high) \ - : "memory", "rsi"); \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) -#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) -#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) -#endif +#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) +#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) -/* - * this_cpu_read() makes gcc load the percpu variable every time it is - * accessed while this_cpu_read_stable() allows the value to be cached. - * this_cpu_read_stable() is more efficient and can be used if its value - * is guaranteed to be valid across cpus. The current users include - * get_current() and get_thread_info() both of which are actually - * per-thread variables implemented as per-cpu variables and thus - * stable for the duration of the respective task. 
- */ -#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) -#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) -#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp) -#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) -#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) - -#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op(1, , "and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op(2, , "and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op(4, , "and", (pcp), val) -#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val) - -/* - * raw_cpu_xchg() can use a load-store since it is not required to be - * IRQ-safe. - */ -#define raw_percpu_xchg_op(var, nval) \ +#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \ ({ \ - typeof(var) pxo_ret__ = raw_cpu_read(var); \ - raw_cpu_write(var, (nval)); \ - pxo_ret__; \ + bool success; \ + u128 *_oval = (u128 *)(_ovalp); \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), \ + "+d" (old__.high) \ + : "b" (new__.low), \ + "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + likely(success); \ }) -#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) - -#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) -#define this_cpu_and_1(pcp, val) percpu_to_op(1, volatile, "and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op(2, volatile, "and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op(4, volatile, "and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val) -#define this_cpu_or_4(pcp, 
val) percpu_to_op(4, volatile, "or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val) -#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) +#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval) + +#endif /* CONFIG_X86_64 */ + +#define raw_cpu_read_1(pcp) __raw_cpu_read(1, , pcp) +#define raw_cpu_read_2(pcp) __raw_cpu_read(2, , pcp) +#define raw_cpu_read_4(pcp) __raw_cpu_read(4, , pcp) +#define raw_cpu_write_1(pcp, val) __raw_cpu_write(1, , pcp, val) +#define raw_cpu_write_2(pcp, val) __raw_cpu_write(2, , pcp, val) +#define raw_cpu_write_4(pcp, val) __raw_cpu_write(4, , pcp, val) + +#define this_cpu_read_1(pcp) __raw_cpu_read(1, volatile, pcp) +#define this_cpu_read_2(pcp) __raw_cpu_read(2, volatile, pcp) +#define this_cpu_read_4(pcp) __raw_cpu_read(4, volatile, pcp) +#define this_cpu_write_1(pcp, val) __raw_cpu_write(1, volatile, pcp, val) +#define this_cpu_write_2(pcp, val) __raw_cpu_write(2, volatile, pcp, val) +#define this_cpu_write_4(pcp, val) __raw_cpu_write(4, volatile, pcp, val) + +#define this_cpu_read_stable_1(pcp) __raw_cpu_read_stable(1, pcp) +#define this_cpu_read_stable_2(pcp) __raw_cpu_read_stable(2, pcp) +#define this_cpu_read_stable_4(pcp) __raw_cpu_read_stable(4, pcp) + +#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_binary_op(1, , "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_binary_op(2, , "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_binary_op(4, , "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_binary_op(1, , "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_binary_op(2, , "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_binary_op(4, , "or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) + +#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) 
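/*
 * (Editor's aside, not part of the patch: a minimal plain-C sketch of the
 *  xchg-via-cmpxchg pattern that this_percpu_xchg_op() in this file relies
 *  on -- read the old value, then retry a compare-and-swap until it
 *  succeeds. The helper name demo_xchg and the GCC __atomic builtin are
 *  illustrative assumptions, not kernel interfaces.)
 */
static unsigned long demo_xchg(unsigned long *var, unsigned long nval)
{
	unsigned long old = *var;		/* like this_cpu_read() */

	/* Retry until no concurrent update slipped in between read and CAS. */
	while (!__atomic_compare_exchange_n(var, &old, nval, false,
					    __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		;				/* 'old' is refreshed on each failure */

	return old;				/* the previous value, as xchg returns */
}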
+#define this_cpu_and_1(pcp, val) percpu_binary_op(1, volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_binary_op(2, volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_binary_op(4, volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_binary_op(1, volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_binary_op(2, volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_binary_op(4, volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval) /* - * Per cpu atomic 64 bit operations are only available under 64 bit. - * 32 bit must fall back to generic operations. + * Per-CPU atomic 64-bit operations are only available under 64-bit kernels. + * 32-bit kernels must fall back to generic operations. 
*/ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) -#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) -#endif -static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, - const unsigned long __percpu *addr) -{ - unsigned long __percpu *a = - (unsigned long __percpu *)addr + nr / BITS_PER_LONG; +#define raw_cpu_read_8(pcp) __raw_cpu_read(8, , pcp) +#define raw_cpu_write_8(pcp, val) __raw_cpu_write(8, , pcp, val) -#ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; -#else - return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; -#endif -} +#define this_cpu_read_8(pcp) __raw_cpu_read(8, volatile, pcp) +#define this_cpu_write_8(pcp, val) __raw_cpu_write(8, volatile, pcp, val) + +#define this_cpu_read_stable_8(pcp) __raw_cpu_read_stable(8, pcp) + +#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_binary_op(8, , "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_binary_op(8, , "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) +#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) + +#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_binary_op(8, volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_binary_op(8, volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval) + +#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp) + +#else /* !CONFIG_X86_64: */ + +/* There is no generic 64-bit read stable operation for 32-bit targets. 
*/ +#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) + +#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp) + +#endif /* CONFIG_X86_64 */ + +#define this_cpu_read_const(pcp) __raw_cpu_read_const(pcp) -static inline bool x86_this_cpu_variable_test_bit(int nr, - const unsigned long __percpu *addr) -{ - bool oldbit; +/* + * this_cpu_read() makes the compiler load the per-CPU variable every time + * it is accessed while this_cpu_read_stable() allows the value to be cached. + * this_cpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across CPUs. The current users include + * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * actually per-thread variables implemented as per-CPU variables and + * thus stable for the duration of the respective task. + */ +#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) - asm volatile("btl "__percpu_arg(2)",%1" - CC_SET(c) - : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); +#define x86_this_cpu_constant_test_bit(_nr, _var) \ +({ \ + unsigned long __percpu *addr__ = \ + (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \ + \ + !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \ +}) - return oldbit; -} +#define x86_this_cpu_variable_test_bit(_nr, _var) \ +({ \ + bool oldbit; \ + \ + asm volatile("btl %[nr], " __percpu_arg([var]) \ + CC_SET(c) \ + : CC_OUT(c) (oldbit) \ + : [var] "m" (__my_cpu_var(_var)), \ + [nr] "rI" (_nr)); \ + oldbit; \ +}) -#define x86_this_cpu_test_bit(nr, addr) \ - (__builtin_constant_p((nr)) \ - ? x86_this_cpu_constant_test_bit((nr), (addr)) \ - : x86_this_cpu_variable_test_bit((nr), (addr))) +#define x86_this_cpu_test_bit(_nr, _var) \ + (__builtin_constant_p(_nr) \ + ? x86_this_cpu_constant_test_bit(_nr, _var) \ + : x86_this_cpu_variable_test_bit(_nr, _var)) #include <asm-generic/percpu.h> @@ -434,46 +640,47 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); { [0 ... NR_CPUS-1] = _initvalue }; \ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map -#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) -#define DECLARE_EARLY_PER_CPU(_type, _name) \ - DECLARE_PER_CPU(_type, _name); \ - extern __typeof__(_type) *_name##_early_ptr; \ +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ extern __typeof__(_type) _name##_early_map[] -#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ - DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \ - extern __typeof__(_type) *_name##_early_ptr; \ +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ + DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ extern __typeof__(_type) _name##_early_map[] -#define early_per_cpu_ptr(_name) (_name##_early_ptr) -#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) -#define early_per_cpu(_name, _cpu) \ - *(early_per_cpu_ptr(_name) ? \ - &early_per_cpu_ptr(_name)[_cpu] : \ +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) + +#define early_per_cpu(_name, _cpu) \ + *(early_per_cpu_ptr(_name) ? 
\ + &early_per_cpu_ptr(_name)[_cpu] : \ &per_cpu(_name, _cpu)) -#else /* !CONFIG_SMP */ -#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ +#else /* !CONFIG_SMP: */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ DEFINE_PER_CPU(_type, _name) = _initvalue #define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue -#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) -#define DECLARE_EARLY_PER_CPU(_type, _name) \ +#define DECLARE_EARLY_PER_CPU(_type, _name) \ DECLARE_PER_CPU(_type, _name) -#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ DECLARE_PER_CPU_READ_MOSTLY(_type, _name) -#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) -#define early_per_cpu_ptr(_name) NULL +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL /* no early_per_cpu_map() */ -#endif /* !CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #endif /* _ASM_X86_PERCPU_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 85a9fd5a3ec3..91b73571412f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -31,6 +31,9 @@ #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) +#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) +#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) #define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 @@ -112,6 +115,13 @@ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) +#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31) +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0) +#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8) +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \ + AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 @@ -177,6 +187,8 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 +#define ARCH_PERFMON_EXT_UMASK2 0x1 +#define ARCH_PERFMON_EXT_EQ 0x2 #define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 @@ -216,6 +228,9 @@ union cpuid28_ecx { unsigned int lbr_timed_lbr:1; /* Branch Type Field Supported */ unsigned int lbr_br_type:1; + unsigned int reserved:13; + /* Branch counters (Event Logging) Supported */ + unsigned int lbr_counters:4; } split; unsigned int full; }; @@ -232,6 +247,8 @@ union cpuid_0x80000022_ebx { unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; + /* Number of Unified Memory Controller Counters */ + unsigned int num_umc_pmc:6; } split; unsigned int full; }; @@ -294,6 +311,10 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) #define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) +/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */ +/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */ +/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */ + static inline bool use_fixed_pseudo_encoding(u64 code) { return !(code & 0xff); @@ -542,6 +563,7 @@ struct x86_pmu_lbr { unsigned int from; unsigned int to; unsigned int info; + bool 
has_callstack; }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 94de1a05aeba..d65e338b6a5f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -181,7 +181,7 @@ static inline u64 p4_clear_ht_bit(u64 config) static inline int p4_ht_active(void) { #ifdef CONFIG_SMP - return smp_num_siblings > 1; + return __max_threads_per_core > 1; #endif return 0; } @@ -189,7 +189,7 @@ static inline int p4_ht_active(void) static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (__max_threads_per_core == 2) return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334..dcd836b59beb 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 9e7c0b719c3c..dabafba957ea 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -52,7 +52,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void native_set_pud(pud_t *pudp, pud_t pud) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif pxx_xchg64(pud, pudp, native_pud_val(pud)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..4c2d080d26b4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -31,7 +31,8 @@ struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* @@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif @@ -121,13 +120,47 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following 
only work if pte_present() is true. * Undefined behaviour if not.. */ -static inline int pte_dirty(pte_t pte) +static inline bool pte_dirty(pte_t pte) { - return pte_flags(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY_BITS; +} + +static inline bool pte_shstk(pte_t pte) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) @@ -135,9 +168,22 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } -static inline int pmd_dirty(pmd_t pmd) +static inline bool pte_decrypted(pte_t pte) { - return pmd_flags(pmd) & _PAGE_DIRTY; + return cc_mkdec(pte_val(pte)) == pte_val(pte); +} + +#define pmd_dirty pmd_dirty +static inline bool pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY_BITS; +} + +static inline bool pmd_shstk(pmd_t pmd) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); } #define pmd_young pmd_young @@ -146,9 +192,9 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } -static inline int pud_dirty(pud_t pud) +static inline bool pud_dirty(pud_t pud) { - return pud_flags(pud) & _PAGE_DIRTY; + return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) @@ -156,9 +202,36 @@ static inline int pud_young(pud_t pud) return pud_flags(pud) & _PAGE_ACCESSED; } +static inline bool pud_shstk(pud_t pud) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + static inline int pte_write(pte_t pte) { - return pte_flags(pte) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); +} + +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. 
+ */ + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) @@ -185,6 +258,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -199,6 +274,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } +#define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); @@ -216,8 +292,8 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_large -static inline int p4d_large(p4d_t p4d) +#define p4d_leaf p4d_leaf +static inline bool p4d_leaf(p4d_t p4d) { /* No 512 GiB pages yet */ return 0; @@ -225,14 +301,14 @@ static inline int p4d_large(p4d_t p4d) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pte) +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; @@ -269,6 +345,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -290,31 +390,69 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } -static inline pte_t pte_wrprotect(pte_t pte) +/* + * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the + * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So + * when creating dirty, write-protected memory, a software bit is used: + * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the + * Dirty bit to SavedDirty, and vice-vesra. + * + * This shifting is only done if needed. In the case of shifting + * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of + * shifting SavedDirty->Dirty, the condition is Write=1. 
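 * (Editor's illustration, not part of the patch: for a Write=0,Dirty=1 value
 *  the condition computed in mksaveddirty_shift() below is 1, so the Dirty
 *  value is copied into SavedDirty and the hardware Dirty bit is cleared,
 *  yielding Write=0,SavedDirty=1,Dirty=0; for a Write=1 value the condition
 *  is 0 and the value is returned unchanged. clear_saveddirty_shift()
 *  performs the mirror-image transition when Write=1.)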
+ */ +static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) { - return pte_clear_flags(pte, _PAGE_RW); + pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; + v &= ~(cond << _PAGE_BIT_DIRTY); + + return v; } -#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP -static inline int pte_uffd_wp(pte_t pte) +static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) { - bool wp = pte_flags(pte) & _PAGE_UFFD_WP; + pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; + v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); + + return v; +} + +static inline pte_t pte_mksaveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = mksaveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_clear_saveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = clear_saveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); -#ifdef CONFIG_DEBUG_VM /* - * Having write bit for wr-protect-marked present ptes is fatal, - * because it means the uffd-wp bit will be ignored and write will - * just go through. - * - * Use any chance of pgtable walking to verify this (e.g., when - * page swapped out or being migrated for all purposes). It means - * something is already wrong. Tell the admin even before the - * process crashes. We also nail it with wrong pgtable setup. + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PTE (Write=0,Dirty=1). Move the hardware + * dirty value to the software bit, if present. */ - WARN_ON_ONCE(wp && pte_write(pte)); -#endif + return pte_mksaveddirty(pte); +} - return wp; +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UFFD_WP; } static inline pte_t pte_mkuffd_wp(pte_t pte) @@ -330,7 +468,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) @@ -345,7 +483,16 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pte_mksaveddirty(pte); +} + +static inline pte_t pte_mkwrite_shstk(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -353,11 +500,15 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte_set_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); @@ -388,23 +539,34 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); - return native_make_pmd(v | set); + v = mksaveddirty_shift(v); + return native_make_pmd(v); } -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +/* See 
comments above mksaveddirty_shift() */ +static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); - return native_make_pmd(v & ~clear); + v = clear_saveddirty_shift(v); + return native_make_pmd(v); } static inline pmd_t pmd_wrprotect(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_RW); + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PMD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -431,12 +593,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd) static inline pmd_t pmd_mkclean(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_DIRTY); + return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pmd_mksaveddirty(pmd); +} + +static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkdevmap(pmd_t pmd) @@ -454,23 +625,30 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_ACCESSED); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) +static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_mksaveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v | set); + v = mksaveddirty_shift(v); + return native_make_pud(v); } -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_clear_saveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v & ~clear); + v = clear_saveddirty_shift(v); + return native_make_pud(v); } static inline pud_t pud_mkold(pud_t pud) @@ -480,17 +658,26 @@ static inline pud_t pud_mkold(pud_t pud) static inline pud_t pud_mkclean(pud_t pud) { - return pud_clear_flags(pud, _PAGE_DIRTY); + return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { - return pud_clear_flags(pud, _PAGE_RW); + pud = pud_clear_flags(pud, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PUD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. 
+ */ + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { - return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdevmap(pud_t pud) @@ -510,7 +697,9 @@ static inline pud_t pud_mkyoung(pud_t pud) static inline pud_t pud_mkwrite(pud_t pud) { - return pud_set_flags(pud, _PAGE_RW); + pud = pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -622,11 +811,18 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; + pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of @@ -635,17 +831,71 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); - return __pte(val); + + pte_result = __pte(val); + + /* + * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: + * 1. Marking Write=0 PTEs Dirty=1 + * 2. Marking Dirty=1 PTEs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pte_result = pte_mksaveddirty(pte_result); + else + pte_result = pte_clear_saveddirty(pte_result); + + return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; + pmd_t pmd_result; - val &= _HPAGE_CHG_MASK; + val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); - return __pmd(val); + + pmd_result = __pmd(val); + + /* + * Avoid creating shadow stack PMD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pmd_result = pmd_mksaveddirty(pmd_result); + else + pmd_result = pmd_clear_saveddirty(pmd_result); + + return pmd_result; +} + +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + pudval_t val = pud_val(pud), oldval = val; + pud_t pud_result; + + val &= _HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); + + pud_result = __pud(val); + + /* + * Avoid creating shadow stack PUD by accident. See comment in + * pte_modify(). 
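Read against the pte_modify() comment above, the common case is mprotect() dropping write access from a page that is already dirty; the trace below is illustrative only and simply restates those rules:

	/*
	 * Illustrative trace, not kernel code:
	 *
	 *   old PTE                  : Write=1, Dirty=1
	 *   newprot                  : Write=0 (Dirty is never taken from newprot;
	 *                               it is carried over from the old PTE)
	 *   merged value             : Write=0, Dirty=1  (the shadow stack encoding)
	 *   after pte_mksaveddirty() : Write=0, Dirty=0, SavedDirty=1
	 *
	 * Had the old PTE been Write=0 already, e.g. a genuine shadow stack
	 * mapping, the pte_clear_saveddirty() branch runs instead and leaves
	 * the Write=0,Dirty=1 encoding intact.
	 */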
+ */ + if (oldval & _PAGE_RW) + pud_result = pud_mksaveddirty(pud_result); + else + pud_result = pud_clear_saveddirty(pud_result); + + return pud_result; } /* @@ -702,7 +952,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -716,12 +966,12 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ #endif /* __ASSEMBLY__ */ @@ -749,6 +999,14 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) +{ + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); +} +#define pte_advance_pfn pte_advance_pfn + static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); @@ -829,7 +1087,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * (Currently stuck as a macro because of indirect forward reference * to linux/mm.h:page_to_nid()) */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#define mk_pte(page, pgprot) \ +({ \ + pgprot_t __pgprot = pgprot; \ + \ + WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \ + _PAGE_DIRTY); \ + pfn_pte(page_to_pfn(page), __pgprot); \ +}) static inline int pmd_bad(pmd_t pmd) { @@ -864,23 +1129,16 @@ static inline pmd_t *pud_pgtable(pud_t pud) */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } -#else -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) -{ - return 0; -} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -909,7 +1167,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -955,7 +1213,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -985,7 +1243,6 @@ static inline int pgd_none(pgd_t pgd) extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); -extern void memblock_find_dma_reserve(void); void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); @@ -1020,24 +1277,17 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void set_pte_at(struct mm_struct *mm, 
unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_pte_set(mm, addr, ptep, pte); - set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } @@ -1068,7 +1318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1334,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } @@ -1095,7 +1345,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. + */ + pte_t old_pte, new_pte; + + old_pte = READ_ONCE(*ptep); + do { + new_pte = pte_wrprotect(old_pte); + } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); } #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) @@ -1121,19 +1381,13 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define pmd_write pmd_write -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } @@ -1144,7 +1398,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } @@ -1153,13 +1407,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); -} + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. 
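The reason ptep_set_wrprotect() (and pmdp_set_wrprotect() below) now loops on try_cmpxchg() rather than using clear_bit() is the window in which the CPU can set Dirty=1 concurrently; a generic sketch of the retry pattern in plain C11 atomics (bit values as above, not the kernel primitives):

	#include <stdatomic.h>
	#include <stdint.h>

	#define RW	(1ULL << 1)
	#define DIRTY	(1ULL << 6)
	#define SAVED	(1ULL << 58)

	static void wrprotect(_Atomic uint64_t *pte)
	{
		uint64_t old = atomic_load(pte);
		uint64_t new;

		do {
			new = old & ~RW;		/* drop write permission */
			if (new & DIRTY)		/* would read as shadow stack */
				new = (new & ~DIRTY) | SAVED;
			/*
			 * If the CPU set Dirty=1 after 'old' was read, the
			 * exchange fails, 'old' is refreshed, and 'new' is
			 * recomputed. A plain clear-bit of RW cannot give
			 * that guarantee.
			 */
		} while (!atomic_compare_exchange_weak(pte, &old, new));
	}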
+ */ + pmd_t old_pmd, new_pmd; -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return pud_flags(pud) & _PAGE_RW; + old_pmd = READ_ONCE(*pmdp); + do { + new_pmd = pmd_wrprotect(old_pmd); + } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish @@ -1167,7 +1425,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { @@ -1178,10 +1436,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + WRITE_ONCE(*pudp, pud); + return old; + } +} +#endif + #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. @@ -1196,12 +1472,12 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_large -static inline int pgd_large(pgd_t pgd) { return 0; } +#define pgd_leaf pgd_leaf +static inline bool pgd_leaf(pgd_t pgd) { return false; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. 
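The helpers that follow implement this with ptr_set_bit()/ptr_clear_bit(); as a standalone illustration (helper names invented, not the kernel ones), selecting a half of the 8k-aligned top-level table really is just toggling bit 12 of its address:

	#include <stdint.h>

	#define PTI_SWITCH_BIT	12	/* PAGE_SHIFT: selects the second 4k half */

	static inline void *to_kernel_half(void *pgdp)
	{
		return (void *)((uintptr_t)pgdp & ~(1UL << PTI_SWITCH_BIT));
	}

	static inline void *to_user_half(void *pgdp)
	{
		return (void *)((uintptr_t)pgdp | (1UL << PTI_SWITCH_BIT));
	}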
@@ -1246,7 +1522,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1261,7 +1537,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ @@ -1292,6 +1568,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -1412,6 +1693,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + /* + * Write=0,Dirty=1 PTEs are shadow stack, which the kernel + * shouldn't generally allow access to, but since they + * are already Write=0, the below logic covers both cases. + */ if (write) need_pte_bits |= _PAGE_RW; @@ -1447,11 +1733,14 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_has_hw_pte_young arch_has_hw_pte_young -static inline bool arch_has_hw_pte_young(void) -{ - return true; -} +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + +#define arch_check_zapped_pud arch_check_zapped_pud +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); #ifdef CONFIG_XEN_PV #define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young @@ -1478,6 +1767,14 @@ static inline bool pud_user_accessible_page(pud_t pud) } #endif +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a629b1b9f65a..d1426b64c1b9 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || + !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } @@ -203,7 +204,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * - * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. 
@@ -269,5 +270,26 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) #include <asm/pgtable-invert.h> -#endif /* !__ASSEMBLY__ */ +#else /* __ASSEMBLY__ */ + +#define l4_index(x) (((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + +L3_START_KERNEL = pud_index(__START_KERNEL_map) + +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ + .endr + +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 38b54b992f32..a98e53491a4e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -21,9 +21,9 @@ typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; typedef struct { pmdval_t pmd; } pmd_t; -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; +#ifdef CONFIG_X86_5LEVEL #ifdef USE_EARLY_PGTABLE_L5 /* * cpu_feature_enabled() is not available in early boot code. @@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_MEMORY +# define PHYSMEM_END physmem_end +#endif + /* * End of the region for which vmalloc page tables are pre-allocated. * For non-KMSAN builds, this is the same as VMALLOC_END. diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ba3e2554799a..6f82e75b6149 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -21,7 +21,8 @@ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_SOFTW4 57 /* available for programmer */ +#define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ @@ -34,6 +35,13 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */ +#else +/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */ +#endif + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -117,6 +125,18 @@ #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif +/* + * The hardware requires shadow stack to be Write=0,Dirty=1. However, + * there are valid cases where the kernel might create read-only PTEs that + * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In + * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, + * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have + * (Write=0,SavedDirty=1,Dirty=0) set. 
+ */ +#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) + +#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) /* @@ -125,11 +145,12 @@ * instance, and is *not* included in this mask since * pte_modify() does modify it. */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ - _PAGE_UFFD_WP) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | \ + _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ + _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP) +#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) +#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage @@ -152,6 +173,7 @@ enum page_cache_mode { }; #endif +#define _PAGE_CC (_AT(pteval_t, cc_mask)) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) @@ -188,14 +210,22 @@ enum page_cache_mode { #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) + +/* + * Page tables needs to have Write=1 in order for any lower PTEs to be + * writable. This includes shadow stack memory (Write=0, Dirty=1) + */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) -#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) + +#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) +#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) +#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) -#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) @@ -487,8 +517,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; -extern void set_nx(void); -extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); @@ -519,6 +547,7 @@ enum pg_level { PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, + PG_LEVEL_256T, PG_LEVEL_NUM }; @@ -537,6 +566,8 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); +pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, + unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h new file 
mode 100644 index 000000000000..de788b400fba --- /dev/null +++ b/arch/x86/include/asm/posted_intr.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_POSTED_INTR_H +#define _X86_POSTED_INTR_H +#include <asm/irq_vectors.h> + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +#define PID_TABLE_ENTRY_VALID 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + union { + u32 pir[8]; /* Posted interrupt requested */ + u64 pir64[4]; + }; + union { + struct { + u16 notifications; /* Suppress and outstanding bits */ + u8 nv; + u8 rsvd_2; + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +/* Non-atomic helpers */ +static inline void __pi_set_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications |= BIT(POSTED_INTR_SN); +} + +static inline void __pi_clear_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications &= ~BIT(POSTED_INTR_SN); +} + +#ifdef CONFIG_X86_POSTED_MSI +/* + * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's + * own interrupts. Here we do not distinguish them since those vector bits in + * PIR will always be zero. 
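A rough sketch of how the accessors above pair up between the side that posts an interrupt and the CPU that handles the notification; this is illustrative glue only (the real users live in KVM and in the posted-MSI handling code):

	/* Sender: mark the vector pending, notify only on the ON 0->1 transition. */
	static void post_interrupt(struct pi_desc *pid, int vec)
	{
		if (pi_test_and_set_pir(vec, pid))
			return;		/* already pending */
		if (!pi_test_and_set_on(pid)) {
			/* ... send notification vector pid->nv to destination pid->ndst ... */
		}
	}

	/* Receiver: clear ON before scanning so a later posting raises it again. */
	static void handle_notification(struct pi_desc *pid)
	{
		if (!pi_test_and_clear_on(pid))
			return;
		if (!pi_is_pir_empty(pid)) {
			/* ... walk the set bits of pid->pir, clear each, dispatch ... */
		}
	}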
+ */ +static inline bool pi_pending_this_cpu(unsigned int vector) +{ + struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc); + + if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) + return false; + + return test_bit(vector, (unsigned long *)pid->pir); +} + +extern void intel_posted_msi_init(void); +#else +static inline bool pi_pending_this_cpu(unsigned int vector) { return false; } + +static inline void intel_posted_msi_init(void) {}; +#endif /* X86_POSTED_MSI */ + +#endif /* _X86_POSTED_INTR_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 2d13f25b1bd8..919909d8cb77 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -6,7 +6,6 @@ #include <asm/percpu.h> #include <asm/current.h> -#include <linux/thread_info.h> #include <linux/static_call_types.h> /* We use the MSB mostly because its available */ @@ -31,11 +30,11 @@ static __always_inline void preempt_count_set(int pc) { int old, new; + old = raw_cpu_read_4(pcpu_hot.preempt_count); do { - old = raw_cpu_read_4(pcpu_hot.preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old); + } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new)); } /* @@ -92,7 +91,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, __percpu_arg([var])); } diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index d8cccadc83a6..e5f204b9b33d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -51,7 +51,7 @@ #define CR3_NOFLUSH 0 #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define X86_CR3_PTI_PCID_USER_BIT 11 #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fd750247ca89..4a686f0e5dbf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -20,7 +20,6 @@ struct vm86; #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> -#include <asm/msr.h> #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> @@ -28,6 +27,7 @@ struct vm86; #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> #include <asm/vdso/processor.h> +#include <asm/shstk.h> #include <linux/personality.h> #include <linux/cache.h> @@ -74,15 +74,57 @@ extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head_32.S, so think twice - * before touching them. [mj] + * CPU type and hardware bug flags. Kept separately for each CPU. 
*/ +struct cpuinfo_topology { + // Real APIC ID read from the local APIC + u32 apicid; + // The initial APIC ID provided by CPUID + u32 initial_apicid; + + // Physical package ID + u32 pkg_id; + + // Physical die ID on AMD, Relative on Intel + u32 die_id; + + // Compute unit ID - AMD specific + u32 cu_id; + + // Core ID relative to the package + u32 core_id; + + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; + + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; +}; + struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; + union { + /* + * The particular ordering (low-to-high) of (vendor, + * family, model) is done in case range of models, like + * it is usually done on AMD, need to be compared. + */ + struct { + __u8 x86_model; + /* CPU family */ + __u8 x86; + /* CPU vendor */ + __u8 x86_vendor; + __u8 x86_reserved; + }; + /* combined vendor, family, model */ + __u32 x86_vfm; + }; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ @@ -93,9 +135,6 @@ struct cpuinfo_x86 { #endif __u8 x86_virt_bits; __u8 x86_phys_bits; - /* CPUID returned core id bits: */ - __u8 x86_coreid_bits; - __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -111,6 +150,7 @@ struct cpuinfo_x86 { }; char x86_vendor_id[16]; char x86_model_id[64]; + struct cpuinfo_topology topo; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ @@ -122,21 +162,9 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; - /* cpuid returned max cores value: */ - u16 x86_max_cores; - u16 apicid; - u16 initial_apicid; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; - /* Physical processor id: */ - u16 phys_proc_id; - /* Logical processor id: */ - u16 logical_proc_id; - /* Core id: */ - u16 cpu_core_id; - u16 cpu_die_id; - u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; /* Is SMT active on this core? */ @@ -170,13 +198,8 @@ extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; -#ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) -#else -#define cpu_info boot_cpu_data -#define cpu_data(cpu) boot_cpu_data -#endif extern const struct seq_operations cpuinfo_op; @@ -190,7 +213,6 @@ static inline unsigned long long l1tf_pfn_limit(void) } extern void early_cpu_init(void); -extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); @@ -399,7 +421,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); @@ -464,7 +486,6 @@ struct thread_struct { unsigned long iopl_emul; unsigned int iopl_warn:1; - unsigned int sig_on_uaccess_err:1; /* * Protection Keys Register for Userspace. 
Loaded immediately on @@ -475,6 +496,13 @@ struct thread_struct { */ u32 pkru; +#ifdef CONFIG_X86_USER_SHADOW_STACK + unsigned long features; + unsigned long features_locked; + + struct thread_shstk shstk; +#endif + /* Floating point and extended processor state */ struct fpu fpu; /* @@ -511,6 +539,9 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_pcpu_hot.top_of_stack); + return this_cpu_read_stable(pcpu_hot.top_of_stack); } @@ -533,7 +564,7 @@ static inline void load_sp0(unsigned long sp0) unsigned long __get_wchan(struct task_struct *p); -extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void select_idle_routine(void); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; @@ -551,31 +582,10 @@ extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); -extern void cpu_init_exception_handling(void); +extern void cpu_init_exception_handling(bool boot_cpu); +extern void cpu_init_replace_early_idt(void); extern void cr4_init(void); -static inline unsigned long get_debugctlmsr(void) -{ - unsigned long debugctlmsr = 0; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); - - return debugctlmsr; -} - -static inline void update_debugctlmsr(unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); -} - extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ @@ -591,7 +601,7 @@ extern char ignore_fpu_irq; # define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 %P1" +# define BASE_PREFETCH "prefetcht0 %1" #endif /* @@ -602,7 +612,7 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchnta %P1", + alternative_input(BASE_PREFETCH, "prefetchnta %1", X86_FEATURE_XMM, "m" (*(const char *)x)); } @@ -614,7 +624,7 @@ static inline void prefetch(const void *x) */ static __always_inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchw %P1", + alternative_input(BASE_PREFETCH, "prefetchw %1", X86_FEATURE_3DNOWPREFETCH, "m" (*(const char *)x)); } @@ -640,10 +650,10 @@ static __always_inline void prefetchw(const void *x) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -extern unsigned long __end_init_task[]; +extern unsigned long __top_init_kernel_stack[]; -#define INIT_THREAD { \ - .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +#define INIT_THREAD { \ + .sp = (unsigned long)&__top_init_kernel_stack, \ } extern unsigned long KSTK_ESP(struct task_struct *task); @@ -671,18 +681,31 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); -extern u16 get_llc_id(unsigned int cpu); +static inline u32 per_cpu_llc_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.llc_id, cpu); +} + +static inline u32 per_cpu_l2c_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.l2c_id, cpu); +} #ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_nodes_per_socket(void); -extern u32 amd_get_highest_perf(void); -extern bool cpu_has_ibpb_brtype_microcode(void); -extern void 
amd_clear_divider(void); +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +static __always_inline void amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); +} + +extern void amd_check_microcode(void); #else -static inline u32 amd_get_nodes_per_socket(void) { return 0; } -static inline u32 amd_get_highest_perf(void) { return 0; } -static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } static inline void amd_clear_divider(void) { } +static inline void amd_check_microcode(void) { } #endif extern unsigned long arch_align_stack(unsigned long sp); @@ -717,14 +740,24 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; -#ifdef CONFIG_X86_SGX -int arch_memory_failure(unsigned long pfn, int flags); -#define arch_memory_failure arch_memory_failure - -bool arch_is_platform_page(u64 paddr); -#define arch_is_platform_page arch_is_platform_page -#endif - extern bool gds_ucode_mitigated(void); +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b716d291d0d4..365798cb4408 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -23,11 +23,11 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); void x86_of_pci_init(void); -void x86_dtb_init(void); +void x86_flattree_get_config(void); #else static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } -static inline void x86_dtb_init(void) { } +static inline void x86_flattree_get_config(void) { } #define of_ioapic 0 #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 12ef86b19910..484f4f0131a5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -32,10 +32,9 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif +#else /* !CONFIG_IA32_EMULATION */ +#define entry_SYSCALL_compat NULL +#define entry_SYSENTER_compat NULL #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 07375b476c4f..ab167c96b9ab 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -3,7 +3,7 @@ #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); extern void pti_finalize(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f..5a83fbd9bc0b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -56,18 +56,64 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + 
u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + * Nested exception during FRED delivery, not set + * for #DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +123,50 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. + */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* CS selector */ + u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* SS selector */ + u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; + }; + + /* + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. 
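Because the legacy selector and its 64-bit extension share storage, event details can be read straight out of a saved frame; a hedged sketch (the helper name is invented, the fields are the ones defined above):

	/* Illustrative only: pull FRED event details out of a saved pt_regs. */
	static void describe_event(const struct pt_regs *regs)
	{
		unsigned short ss    = regs->ss;		/* legacy 16-bit view */
		unsigned int vector  = regs->fred_ss.vector;	/* event vector */
		unsigned int type    = regs->fred_ss.type;	/* event type */
		bool nmi             = regs->fred_ss.nmi;	/* NMI-type event */
		unsigned int insnlen = regs->fred_ss.insnlen;	/* INT3/INTO/SYSCALL... */

		(void)ss; (void)vector; (void)type; (void)nmi; (void)insnlen;
	}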
+ */ }; #endif /* !__i386__ */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 0c92db84469d..6e4f8fae3ce9 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -5,6 +5,7 @@ #include <asm/clocksource.h> #include <asm/pvclock-abi.h> +struct timespec64; /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index d87451df480b..68da67df304d 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -66,15 +66,15 @@ static inline bool vcpu_is_preempted(long cpu) #ifdef CONFIG_PARAVIRT /* - * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. * - * Native (and PV wanting native due to vCPU pinning) should disable this key. - * It is done in this backwards fashion to only have a single direction change, - * which removes ordering between native_pv_spin_init() and HV setup. + * Native (and PV wanting native due to vCPU pinning) should keep this key + * disabled. Native does not touch the key. + * + * When in a guest then native_pv_lock_init() enables the key first and + * KVM/XEN might conditionally disable it later in the boot process again. */ -DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); - -void native_pv_lock_init(void) __init; +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); /* * Shortcut for the queued_spin_lock_slowpath() function that allows @@ -87,6 +87,8 @@ void native_pv_lock_init(void) __init; #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { + int val; + if (!static_branch_likely(&virt_spin_lock_key)) return false; @@ -96,17 +98,17 @@ static inline bool virt_spin_lock(struct qspinlock *lock) * horrible lock 'holder' preemption issues. */ - do { - while (atomic_read(&lock->val) != 0) - cpu_relax(); - } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } return true; } -#else -static inline void native_pv_lock_init(void) -{ -} + #endif /* CONFIG_PARAVIRT */ #include <asm-generic/qspinlock.h> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 42b17cf10b10..0a985784be9b 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -4,6 +4,8 @@ #include <asm/ibt.h> +void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked); + /* * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit * registers. 
For i386, however, only 1 32-bit register needs to be saved @@ -23,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { - * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = _Q_LOCKED_VAL; * - * if (likely(lockval == _Q_LOCKED_VAL)) + * if (try_cmpxchg(&lock->locked, &lockval, 0)) * return; * pv_queued_spin_unlock_slowpath(lock, lockval); * } @@ -38,10 +40,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define PV_UNLOCK_ASM \ FRAME_BEGIN \ "push %rdx\n\t" \ - "mov $0x1,%eax\n\t" \ + "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ - "cmp $0x1,%al\n\t" \ "jne .slowpath\n\t" \ "pop %rdx\n\t" \ FRAME_END \ @@ -54,8 +55,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "pop %rdx\n\t" \ FRAME_END -DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, - PV_UNLOCK_ASM, .spinlock.text); +DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..d0ef2a678d66 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +typedef void (cpu_emergency_virt_cb)(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 255a78d9d906..8b1b6ce1e51b 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -7,6 +7,13 @@ #include <linux/sched.h> #include <linux/jump_label.h> +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. 
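The encode/decode helpers added a few lines below make this mapping an identity on the RMID; an illustrative round-trip (values picked arbitrarily):

	static void rmid_idx_roundtrip_example(void)
	{
		u32 closid, rmid;
		u32 idx = resctrl_arch_rmid_idx_encode(3 /* closid, ignored */, 42 /* rmid */);

		resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
		/*
		 * rmid == 42; closid == X86_RESCTRL_EMPTY_CLOSID, because the
		 * CLOSID cannot be recovered from the index and is never
		 * needed on x86.
		 */
	}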
+ */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) + /** * struct resctrl_pqr_state - State cache for the PQR MSR * @cur_rmid: The cached Resource Monitoring ID @@ -31,10 +38,47 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + +static inline void resctrl_arch_enable_alloc(void) +{ + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -88,12 +132,52 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + static inline void resctrl_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); } +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; +} + +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +{ + might_sleep(); + return NULL; +}; + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, + void *ctx) { }; + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 7fa611216417..363266cbcada 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,12 +2,7 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -/* This counts to 12. Any more, it will return 13th argument. */ -#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n -#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) - -#define __RMWcc_CONCAT(a, b) a ## b -#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b) +#include <linux/args.h> #define __CLOBBERS_MEM(clb...) "memory", ## clb @@ -18,7 +13,7 @@ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) 
\ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ @@ -48,7 +43,7 @@ cc_label: c = true; \ #define GEN_UNARY_RMWcc_3(op, var, cc) \ GEN_UNARY_RMWcc_4(op, var, cc, "%[var]") -#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_UNARY_RMWcc(X...) CONCATENATE(GEN_UNARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \ __GEN_RMWcc(op " %[val], " arg0, var, cc, \ @@ -57,7 +52,7 @@ cc_label: c = true; \ #define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \ GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]") -#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_BINARY_RMWcc(X...) CONCATENATE(GEN_BINARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \ __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \ diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h new file mode 100644 index 000000000000..24e3a53ca255 --- /dev/null +++ b/arch/x86/include/asm/runtime-const.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RUNTIME_CONST_H +#define _ASM_RUNTIME_CONST_H + +#define runtime_const_ptr(sym) ({ \ + typeof(sym) __ret; \ + asm_inline("mov %1,%0\n1:\n" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \ + ".long 1b - %c2 - .\n\t" \ + ".popsection" \ + :"=r" (__ret) \ + :"i" ((unsigned long)0x0123456789abcdefull), \ + "i" (sizeof(long))); \ + __ret; }) + +// The 'typeof' will create at _least_ a 32-bit type, but +// will happily also take a bigger type and the 'shrl' will +// clear the upper bits +#define runtime_const_shift_right_32(val, sym) ({ \ + typeof(0u+(val)) __ret = (val); \ + asm_inline("shrl $12,%k0\n1:\n" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \ + ".long 1b - 1 - .\n\t" \ + ".popsection" \ + :"+r" (__ret)); \ + __ret; }) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* + * The text patching is trivial - you can only do this at init time, + * when the text section hasn't been marked RO, and before the text + * has ever been executed. 
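A hedged usage sketch follows: the symbol and function names are invented, and a real user additionally needs the generated runtime_ptr_* sections collected by the linker script; it only shows the intended calling pattern:

	/* Illustrative only: patch a pointer-sized constant into its use sites. */
	extern unsigned long my_area_end;		/* invented example symbol */

	static inline bool in_my_area(unsigned long addr)
	{
		/* Emits a mov with a placeholder immediate plus a fixup record. */
		return addr < runtime_const_ptr(my_area_end);
	}

	static void __init my_area_init(unsigned long end)
	{
		my_area_end = end;
		/* Rewrites every recorded use site with the final value. */
		runtime_const_init(ptr, my_area_end);
	}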
+ */ +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + *(unsigned long *)where = val; +} + +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + *(unsigned char *)where = val; +} + +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index fef16e398161..42bcd42d70d1 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -9,7 +9,7 @@ #endif #ifdef CONFIG_COMPAT -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> #define __NR_seccomp_read_32 __NR_ia32_read #define __NR_seccomp_write_32 __NR_ia32_write #define __NR_seccomp_exit_32 __NR_ia32_exit diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index a6e8373a5170..3fa87e5e11ab 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_SECTIONS_H #define _ASM_X86_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include <asm-generic/sections.h> #include <asm/extable.h> @@ -18,20 +16,4 @@ extern char __end_of_kernel_reserve[]; extern unsigned long _brk_start, _brk_end; -static inline bool arch_is_kernel_initmem_freed(unsigned long addr) -{ - /* - * If _brk_start has not been cleared, brk allocation is incomplete, - * and we can not make assumptions about its use. - */ - if (_brk_start) - return 0; - - /* - * After brk allocation is complete, space between _brk_end and _end - * is available for allocation. - */ - return addr >= _brk_end && addr < (unsigned long)&_end; -} - #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index a5e89641bd2d..4b2abce2e3e7 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -47,9 +47,13 @@ int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_p(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); + +bool set_memory_enc_stop_conversion(void); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); + int set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_nonglobal(unsigned long addr, int numpages); int set_memory_global(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f3495623ac99..0667b2a88614 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -28,11 +28,11 @@ #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ #ifndef __ASSEMBLY__ +#include <linux/cache.h> + #include <asm/bootparam.h> #include <asm/x86_init.h> -extern u64 relocated_ramdisk; - /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); @@ -50,7 +50,7 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern void startup_64_setup_env(unsigned long physbase); +extern void startup_64_setup_gdt_idt(void); extern void 
early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
@@ -126,6 +126,7 @@ void clear_bss(void);
#ifdef __i386__
asmlinkage void __init __noreturn i386_start_kernel(void);
+void __init mk_early_pgtbl_32(void);
#else
asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode);
@@ -134,6 +135,12 @@ asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data
#endif /* __i386__ */
#endif /* _SETUP */
+#ifdef CONFIG_CMDLINE_BOOL
+extern bool builtin_cmdline_added __ro_after_init;
+#else
+#define builtin_cmdline_added 0
+#endif
+
#else /* __ASSEMBLY */
.macro __RESERVE_BRK name, size
diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h
new file mode 100644
index 000000000000..77c51111a893
--- /dev/null
+++ b/arch/x86/include/asm/setup_data.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SETUP_DATA_H
+#define _ASM_X86_SETUP_DATA_H
+
+#include <uapi/asm/setup_data.h>
+
+#ifndef __ASSEMBLY__
+
+struct pci_setup_rom {
+	struct setup_data data;
+	uint16_t vendor;
+	uint16_t devid;
+	uint64_t pcilen;
+	unsigned long segment;
+	unsigned long bus;
+	unsigned long device;
+	unsigned long function;
+	uint8_t romdata[];
+};
+
+/* kexec external ABI */
+struct efi_setup_data {
+	u64 fw_vendor;
+	u64 __unused;
+	u64 tables;
+	u64 smbios;
+	u64 reserved[8];
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b463fcbd4b90..98726c2b04f8 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -54,8 +54,18 @@
(((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
+
+/* Preferred GHCB GPA Request */
+#define GHCB_MSR_PREF_GPA_REQ 0x010
+#define GHCB_MSR_GPA_VALUE_POS 12
+#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0)
+
+#define GHCB_MSR_PREF_GPA_RESP 0x011
+#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
@@ -91,28 +101,61 @@ enum psc_op {
/* GHCBData[11:0] */ \
GHCB_MSR_PSC_REQ)
+#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12)
+#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52)
+
#define GHCB_MSR_PSC_RESP 0x015
#define GHCB_MSR_PSC_RESP_VAL(val) \
/* GHCBData[63:32] */ \
(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+/* Set highest bit as a generic error response */
+#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP)
+
+/* GHCB Run at VMPL Request/Response */
+#define GHCB_MSR_VMPL_REQ 0x016
+#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
+ /* GHCBData[39:32] */ \
+ (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_VMPL_REQ)
+
+#define GHCB_MSR_VMPL_RESP 0x017
+#define GHCB_MSR_VMPL_RESP_VAL(v) \
+ /* GHCBData[63:32] */ \
+ (((u64)(v) & GENMASK_ULL(63, 32)) >> 32)
+
/* GHCB Hypervisor Feature Request/Response */
#define GHCB_MSR_HV_FT_REQ 0x080
#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_POS 12
+#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0)
#define GHCB_MSR_HV_FT_RESP_VAL(v) \
/* GHCBData[63:12] */ \
(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
#define GHCB_HV_FT_SNP BIT_ULL(0)
#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
+#define GHCB_HV_FT_SNP_MULTI_VMPL BIT_ULL(5)
/*
* SNP Page State Change NAE event
* The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
* is a local stack variable in set_pages_state(). Do not increase this value
* without evaluating the impact to stack usage.
+ *
+ * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value
+ * is needed, such as when processing GHCB requests on the hypervisor side.
*/
#define VMGEXIT_PSC_MAX_ENTRY 64
+#define VMGEXIT_PSC_MAX_COUNT 253
+
+#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32)
+#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1)
+#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2)
+
+#define VMGEXIT_PSC_OP_PRIVATE 1
+#define VMGEXIT_PSC_OP_SHARED 2
struct psc_hdr {
u16 cur_entry;
@@ -159,6 +202,10 @@ struct snp_psc_desc {
#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */
#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */
#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */
+#define GHCB_TERM_SECRETS_PAGE 6 /* Secrets page failure */
+#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */
+#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */
+#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 66c806784c52..ee34ab00a8d6 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -13,7 +13,6 @@
#include <asm/insn.h>
#include <asm/sev-common.h>
-#include <asm/bootparam.h>
#include <asm/coco.h>
#define GHCB_PROTOCOL_MIN 1ULL
@@ -22,6 +21,8 @@
#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
+struct boot_params;
+
enum es_result {
ES_OK, /* All good */
ES_UNSUPPORTED, /* Requested operation not supported */
@@ -87,9 +88,26 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* Software defined (when rFlags.CF = 1) */
#define PVALIDATE_FAIL_NOUPDATE 255
+/* RMPUPDATE detected 4K page and 2MB page overlap. */
+#define RMPUPDATE_FAIL_OVERLAP 4
+
+/* PSMASH failed due to concurrent access by another CPU */
+#define PSMASH_FAIL_INUSE 3
+
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
+#define RMP_TO_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M)
+#define PG_LEVEL_TO_RMP(level) (((level) == PG_LEVEL_4K) ?
RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) + +struct rmp_state { + u64 gpa; + u8 assigned; + u8 pagesize; + u8 immutable; + u8 rsvd; + u32 asid; +} __packed; #define RMPADJUST_VMSA_PAGE_BIT BIT(16) @@ -101,6 +119,54 @@ struct snp_req_data { unsigned int data_npages; }; +#define MAX_AUTHTAG_LEN 32 + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; +} __packed; + struct sev_guest_platform_data { u64 secrets_gpa; }; @@ -125,7 +191,7 @@ struct secrets_os_area { #define VMPCK_KEY_LEN 32 /* See the SNP spec version 0.9 for secrets page format */ -struct snp_secrets_page_layout { +struct snp_secrets_page { u32 version; u32 imien : 1, rsvd1 : 31; @@ -137,10 +203,119 @@ struct snp_secrets_page_layout { u8 vmpck2[VMPCK_KEY_LEN]; u8 vmpck3[VMPCK_KEY_LEN]; struct secrets_os_area os_area; - u8 rsvd3[3840]; + + u8 vmsa_tweak_bitmap[64]; + + /* SVSM fields */ + u64 svsm_base; + u64 svsm_size; + u64 svsm_caa; + u32 svsm_max_version; + u8 svsm_guest_vmpl; + u8 rsvd3[3]; + + /* Remainder of page */ + u8 rsvd4[3744]; } __packed; +/* + * The SVSM Calling Area (CA) related structures. 
+ */ +struct svsm_ca { + u8 call_pending; + u8 mem_available; + u8 rsvd1[6]; + + u8 svsm_buffer[PAGE_SIZE - 8]; +}; + +#define SVSM_SUCCESS 0 +#define SVSM_ERR_INCOMPLETE 0x80000000 +#define SVSM_ERR_UNSUPPORTED_PROTOCOL 0x80000001 +#define SVSM_ERR_UNSUPPORTED_CALL 0x80000002 +#define SVSM_ERR_INVALID_ADDRESS 0x80000003 +#define SVSM_ERR_INVALID_FORMAT 0x80000004 +#define SVSM_ERR_INVALID_PARAMETER 0x80000005 +#define SVSM_ERR_INVALID_REQUEST 0x80000006 +#define SVSM_ERR_BUSY 0x80000007 +#define SVSM_PVALIDATE_FAIL_SIZEMISMATCH 0x80001006 + +/* + * The SVSM PVALIDATE related structures + */ +struct svsm_pvalidate_entry { + u64 page_size : 2, + action : 1, + ignore_cf : 1, + rsvd : 8, + pfn : 52; +}; + +struct svsm_pvalidate_call { + u16 num_entries; + u16 cur_index; + + u8 rsvd1[4]; + + struct svsm_pvalidate_entry entry[]; +}; + +#define SVSM_PVALIDATE_MAX_COUNT ((sizeof_field(struct svsm_ca, svsm_buffer) - \ + offsetof(struct svsm_pvalidate_call, entry)) / \ + sizeof(struct svsm_pvalidate_entry)) + +/* + * The SVSM Attestation related structures + */ +struct svsm_loc_entry { + u64 pa; + u32 len; + u8 rsvd[4]; +}; + +struct svsm_attest_call { + struct svsm_loc_entry report_buf; + struct svsm_loc_entry nonce; + struct svsm_loc_entry manifest_buf; + struct svsm_loc_entry certificates_buf; + + /* For attesting a single service */ + u8 service_guid[16]; + u32 service_manifest_ver; + u8 rsvd[4]; +}; + +/* + * SVSM protocol structure + */ +struct svsm_call { + struct svsm_ca *caa; + u64 rax; + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 rax_out; + u64 rcx_out; + u64 rdx_out; + u64 r8_out; + u64 r9_out; +}; + +#define SVSM_CORE_CALL(x) ((0ULL << 32) | (x)) +#define SVSM_CORE_REMAP_CA 0 +#define SVSM_CORE_PVALIDATE 1 +#define SVSM_CORE_CREATE_VCPU 2 +#define SVSM_CORE_DELETE_VCPU 3 + +#define SVSM_ATTEST_CALL(x) ((1ULL << 32) | (x)) +#define SVSM_ATTEST_SERVICES 0 +#define SVSM_ATTEST_SINGLE_SERVICE 1 + #ifdef CONFIG_AMD_MEM_ENCRYPT + +extern u8 snp_vmpl; + extern void __sev_es_ist_enter(struct pt_regs *regs); extern void __sev_es_ist_exit(void); static __always_inline void sev_es_ist_enter(struct pt_regs *regs) @@ -164,7 +339,16 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern void sev_enable(struct boot_params *bp); +/* + * RMPADJUST modifies the RMP permissions of a page of a lesser- + * privileged (numerically higher) VMPL. + * + * If the guest is running at a higher-privilege than the privilege + * level the instruction is targeting, the instruction will succeed, + * otherwise, it will fail. 
+ */ static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { int rc; @@ -198,24 +382,33 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) struct snp_guest_request_ioctl; void setup_ghcb(void); -void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned long npages); -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned long npages); -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); +void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages); +void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages); void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void __init __noreturn snp_abort(void); +void __noreturn snp_abort(void); +void snp_dmi_setup(void); int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); +int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); void snp_accept_memory(phys_addr_t start, phys_addr_t end); -#else +u64 snp_get_unsupported_features(u64 status); +u64 sev_get_status(void); +void sev_show_status(void); +void snp_update_svsm_ca(void); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define snp_vmpl 0 static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline void sev_enable(struct boot_params *bp) { } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } @@ -223,18 +416,52 @@ static inline void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } static inline void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } -static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } +static inline void snp_dmi_setup(void) { } static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) { return -ENOTTY; } - +static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) +{ + return -ENOTTY; +} static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 snp_get_unsupported_features(u64 status) { return 0; } +static inline u64 sev_get_status(void) { return 0; } +static inline void sev_show_status(void) { } +static inline void snp_update_svsm_ca(void) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef 
CONFIG_KVM_AMD_SEV +bool snp_probe_rmptable_info(void); +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); +void snp_dump_hva_rmpentry(unsigned long address); +int psmash(u64 pfn); +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); +int rmp_make_shared(u64 pfn, enum pg_level level); +void snp_leak_pages(u64 pfn, unsigned int npages); +void kdump_sev_callback(void); +void snp_fixup_e820_tables(void); +#else +static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } +static inline void snp_dump_hva_rmpentry(unsigned long address) {} +static inline int psmash(u64 pfn) { return -ENODEV; } +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, + bool immutable) +{ + return -ENODEV; +} +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline void kdump_sev_callback(void) { } +static inline void snp_fixup_e820_tables(void) {} #endif #endif diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 7513b3bb69b7..fdfd41511b02 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -11,45 +11,91 @@ #define TDX_IDENT "IntelTDX " /* TDX module Call Leaf IDs */ -#define TDX_GET_INFO 1 -#define TDX_GET_VEINFO 3 -#define TDX_GET_REPORT 4 -#define TDX_ACCEPT_PAGE 6 -#define TDX_WR 8 +#define TDG_VP_VMCALL 0 +#define TDG_VP_INFO 1 +#define TDG_VP_VEINFO_GET 3 +#define TDG_MR_REPORT 4 +#define TDG_MEM_PAGE_ACCEPT 6 +#define TDG_VM_WR 8 /* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ #define TDCS_NOTIFY_ENABLES 0x9100000000000010 /* TDX hypercall Leaf IDs */ #define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_GET_QUOTE 0x10002 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_STATUS_RETRY 1 + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_RDX BIT(2) +#define TDX_RBX BIT(3) +#define TDX_RSI BIT(6) +#define TDX_RDI BIT(7) +#define TDX_R8 BIT(8) +#define TDX_R9 BIT(9) +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK \ + (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ + TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15) + +/* TDX supported page sizes from the TDX module ABI. */ +#define TDX_PS_4K 0 +#define TDX_PS_2M 1 +#define TDX_PS_1G 2 +#define TDX_PS_NR (TDX_PS_1G + 1) + #ifndef __ASSEMBLY__ +#include <linux/compiler_attributes.h> + /* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. + * Used in __tdcall*() to gather the input/output registers' values of the + * TDCALL instruction when requesting services from the TDX module. 
This is a + * software only structure and not part of the TDX module/VMM ABI */ -struct tdx_hypercall_args { +struct tdx_module_args { + /* callee-clobbered */ + u64 rcx; + u64 rdx; u64 r8; u64 r9; + /* extra callee-clobbered */ u64 r10; u64 r11; + /* callee-saved + rdi/rsi */ u64 r12; u64 r13; u64 r14; u64 r15; + u64 rbx; u64 rdi; u64 rsi; - u64 rbx; - u64 rdx; }; +/* Used to communicate with the TDX module */ +u64 __tdcall(u64 fn, struct tdx_module_args *args); +u64 __tdcall_ret(u64 fn, struct tdx_module_args *args); +u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args); + /* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args); -u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +u64 __tdx_hypercall(struct tdx_module_args *args); /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -57,7 +103,7 @@ u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); */ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, .r11 = fn, .r12 = r12, @@ -71,25 +117,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) /* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - -/* - * Used in __tdx_module_call() to gather the output registers' values of the - * TDCALL instruction when requesting services from the TDX module. This is a - * software only structure and not part of the TDX module/VMM ABI - */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; - -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); +void __noreturn __tdx_hypercall_failed(void); bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000000..4cb77e004615 --- /dev/null +++ b/arch/x86/include/asm/shstk.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHSTK_H +#define _ASM_X86_SHSTK_H + +#ifndef __ASSEMBLY__ +#include <linux/types.h> + +struct task_struct; +struct ksignal; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +struct thread_shstk { + u64 base; + u64 size; +}; + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); +void reset_thread_features(void); +unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, + unsigned long stack_size); +void shstk_free(struct task_struct *p); +int setup_signal_shadow_stack(struct ksignal *ksig); +int restore_signal_shadow_stack(void); +int shstk_update_last_frame(unsigned long val); +bool shstk_is_enabled(void); +#else +static inline long shstk_prctl(struct task_struct *task, int option, + unsigned long arg2) { return -EINVAL; } +static inline void reset_thread_features(void) {} +static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, + unsigned long clone_flags, + unsigned long stack_size) { return 0; } +static inline void shstk_free(struct task_struct *p) {} +static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } +static inline int restore_signal_shadow_stack(void) { return 0; } +static inline int shstk_update_last_frame(unsigned long val) { return 0; } +static inline bool shstk_is_enabled(void) { return false; } +#endif /* CONFIG_X86_USER_SHADOW_STACK */ 
+ +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SHSTK_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 600cf25dbfc6..ca073f40698f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -8,24 +8,15 @@ #include <asm/current.h> #include <asm/thread_info.h> -extern int smp_num_siblings; -extern unsigned int num_processors; - DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); -#endif struct task_struct; @@ -44,6 +35,7 @@ struct smp_ops { int (*cpu_disable)(void); void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); + void (*stop_this_cpu)(void); void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); @@ -65,11 +57,6 @@ static inline void stop_other_cpus(void) smp_ops.stop_other_cpus(1); } -static inline void smp_prepare_boot_cpu(void) -{ - smp_ops.smp_prepare_boot_cpu(); -} - static inline void smp_prepare_cpus(unsigned int max_cpus) { smp_ops.smp_prepare_cpus(max_cpus); @@ -116,7 +103,6 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); -void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); @@ -132,11 +118,7 @@ void smp_kick_mwait_play_dead(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); -bool smp_park_other_cpus_in_init(void); - -void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); @@ -184,15 +166,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) } #endif /* CONFIG_SMP */ -extern unsigned disabled_cpus; - -#ifdef CONFIG_X86_LOCAL_APIC -extern int hard_smp_processor_id(void); - -#else /* CONFIG_X86_LOCAL_APIC */ -#define hard_smp_processor_id() 0 -#endif /* CONFIG_X86_LOCAL_APIC */ - #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); #else diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 64df897c0ee3..3918c7a434f5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -31,13 +31,4 @@ #endif /* CONFIG_SPARSEMEM */ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -extern int phys_to_target_node(phys_addr_t start); -#define phys_to_target_node phys_to_target_node -extern int memory_add_physaddr_to_nid(u64 start); -#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -#endif -#endif /* __ASSEMBLY__ */ - #endif /* 
_ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index cb0386fc4dc3..658b690b2ccb 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include <linux/thread_info.h> #include <asm/nospec-branch.h> +#include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. + */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else @@ -85,4 +96,6 @@ static inline void speculative_store_bypass_ht_init(void) { } extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); +extern bool itlb_multihit_kvm_mitigation; + #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index de48d1389936..aec6e2d3aa1d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -2,11 +2,11 @@ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H - #ifdef __KERNEL__ - #include <asm/nops.h> #include <asm/processor-flags.h> + +#include <linux/errno.h> #include <linux/irqflags.h> #include <linux/jump_label.h> @@ -182,8 +182,8 @@ static __always_inline void clflush(volatile void *__p) static inline void clflushopt(volatile void *__p) { - alternative_io(".byte 0x3e; clflush %P0", - ".byte 0x66; clflush %P0", + alternative_io(".byte 0x3e; clflush %0", + ".byte 0x66; clflush %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } @@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +#ifdef CONFIG_X86_USER_SHADOW_STACK +static inline int write_user_shstk_64(u64 __user *addr, u64 val) +{ + asm goto("1: wrussq %[val], %[addr]\n" + _ASM_EXTABLE(1b, %l[fail]) + :: [addr] "m" (*addr), [val] "r" (val) + :: fail); + return 0; +fail: + return -EFAULT; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #define nop() asm volatile ("nop") static inline void serialize(void) @@ -211,10 +224,10 @@ static inline void serialize(void) } /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void __iomem *dst, const void *src) +static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } __iomem *__dst = dst; + struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. @@ -232,6 +245,11 @@ static inline void movdir64b(void __iomem *dst, const void *src) : "m" (*__src), "a" (__dst), "d" (__src)); } +static inline void movdir64b_io(void __iomem *dst, const void *src) +{ + movdir64b((void __force *)dst, src); +} + /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 343b722ccaf2..125c407e2abe 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. 
+ 4)") -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 857d364b9888..9d0b324eab21 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -30,37 +30,40 @@ void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosw" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosw" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET32 static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosl" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosl" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET64 static inline void *memset64(uint64_t *s, uint64_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosq" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosq" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #endif diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index a800abb1a992..d8416b3bf832 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,11 +12,6 @@ /* image of the saved processor state */ struct saved_context { - /* - * On x86_32, all segment registers except gs are saved at kernel - * entry in pt_regs. - */ - u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; struct saved_msrs saved_msrs; @@ -27,6 +22,11 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + /* + * On x86_32, all segment registers except gs are saved at kernel + * entry in pt_regs. 
+ */ + u16 gs; bool misc_enable_saved; } __attribute__((packed)); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e7c7379d6ac7..2b59b9951c90 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -229,10 +229,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) @@ -268,6 +264,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) @@ -288,6 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) + +#define SVM_SEV_FEAT_INT_INJ_MODES \ + (SVM_SEV_FEAT_RESTRICTED_INJECTION | \ + SVM_SEV_FEAT_ALTERNATE_INJECTION) struct vmcb_seg { u16 selector; @@ -345,7 +350,7 @@ struct vmcb_save_area { u64 last_excp_from; u64 last_excp_to; u8 reserved_0x298[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -360,10 +365,10 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u64 vmpl0_ssp; - u64 vmpl1_ssp; - u64 vmpl2_ssp; - u64 vmpl3_ssp; + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; + u64 pl3_ssp; u64 u_cet; u8 reserved_0xc8[2]; u8 vmpl; @@ -511,8 +516,22 @@ struct ghcb { u32 ghcb_usage; } __packed; +struct vmcb { + struct vmcb_control_area control; + union { + struct vmcb_save_area save; + + /* + * For SEV-ES VMs, the save area in the VMCB is used only to + * save/load host state. Guest state resides in a separate + * page, the aptly named VM Save Area (VMSA), that is encrypted + * with the guest's private key. 
+ */ + struct sev_es_save_area host_sev_es_save; + }; +} __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 @@ -527,6 +546,7 @@ static inline void __unused_size_checks(void) BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); /* Check offsets of reserved fields */ @@ -563,15 +583,8 @@ static inline void __unused_size_checks(void) BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); } -struct vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -} __packed; - #define SVM_CPUID_FUNC 0x8000000a -#define SVM_VM_CR_SVM_DISABLE 4 - #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index f42dbf17f52b..75248546403d 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,8 +70,8 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV)) + /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4fb36fba4b5a..7c488ff0c764 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). */ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. @@ -84,7 +82,12 @@ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { - memcpy(args, ®s->bx, 6 * sizeof(args[0])); + args[0] = regs->bx; + args[1] = regs->cx; + args[2] = regs->dx; + args[3] = regs->si; + args[4] = regs->di; + args[5] = regs->bp; } static inline int syscall_get_arch(struct task_struct *task) @@ -126,12 +129,13 @@ static inline int syscall_get_arch(struct task_struct *task) ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); +bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); -long do_SYSENTER_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index fd2669b1cb2d..7e88705e907f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); ,,regs->di,,regs->si,,regs->dx \ ,,regs->r10,,regs->r8,,regs->r9) \ + +/* SYSCALL_PT_ARGS is Adapted from s390x */ +#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \ + SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp)) +#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di)) +#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si)) +#define SYSCALL_PT_ARG3(m, t1, t2, t3) \ + SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx)) +#define SYSCALL_PT_ARG2(m, t1, t2) \ + SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx)) +#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx)) +#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) + +#define __SC_COMPAT_CAST(t, a) \ + (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \ + (unsigned int)a + /* Mapping of registers to parameters for syscalls on i386 */ #define SC_IA32_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ - ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ - ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \ + __MAP(x, __SC_TYPE, __VA_ARGS__)) \ #define __SYS_STUB0(abi, name) \ long __##abi##_##name(const struct pt_regs *regs); \ @@ -86,9 +103,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) #define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +131,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) #define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +143,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. 
As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +155,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +178,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +212,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. */ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +238,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +252,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 603e6d1e9d4a..eba178996d84 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -8,6 +8,7 @@ #include <asm/errno.h> #include <asm/ptrace.h> +#include <asm/trapnr.h> #include <asm/shared/tdx.h> /* @@ -20,8 +21,19 @@ #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) + +/* + * TDX module SEAMCALL leaf function error codes + */ +#define TDX_SUCCESS 0ULL +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL + #ifndef __ASSEMBLY__ +#include <uapi/asm/mce.h> + /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. 
This is a software only structure @@ -52,6 +64,8 @@ bool tdx_early_handle_ve(struct pt_regs *regs); int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); +u64 tdx_hcall_get_quote(u8 *buf, size_t size); + #else static inline void tdx_early_init(void) { }; @@ -72,5 +86,42 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, return -ENODEV; } #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ + +#ifdef CONFIG_INTEL_TDX_HOST +u64 __seamcall(u64 fn, struct tdx_module_args *args); +u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); +u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); +void tdx_init(void); + +#include <asm/archrandom.h> + +typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); + +static inline u64 sc_retry(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + int retry = RDRAND_RETRY_LOOPS; + u64 ret; + + do { + ret = func(fn, args); + } while (ret == TDX_RND_NO_ENTROPY && --retry); + + return ret; +} + +#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) +#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) +#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) +int tdx_cpu_enable(void); +int tdx_enable(void); +const char *tdx_dump_mce_info(struct mce *m); +#else +static inline void tdx_init(void) { } +static inline int tdx_cpu_enable(void) { return -ENODEV; } +static inline int tdx_enable(void) { return -ENODEV; } +static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +#endif /* CONFIG_INTEL_TDX_HOST */ + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 29832c338cdc..6259f1937fe7 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -6,18 +6,6 @@ #include <linux/stddef.h> #include <asm/ptrace.h> -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. @@ -27,6 +15,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, extern void text_poke_early(void *addr, const void *opcode, size_t len); +extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d63b02940747..12da7dfd5ef1 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. 
*/ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..69e79fff41b8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include <linux/mm_types.h> +#include <linux/mmu_notifier.h> #include <linux/sched.h> #include <asm/processor.h> @@ -253,6 +254,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* @@ -264,11 +277,18 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); } extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); @@ -286,7 +306,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags, const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | - _PAGE_SOFTW3 | _PAGE_SOFTW4; + _PAGE_SOFTW3 | _PAGE_SOFTW4 | + _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | @@ -378,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void) return lam << X86_CR3_LAM_U57_BIT; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { - this_cpu_write(cpu_tlbstate.lam, - mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); - this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); + this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, untag_mask); } #else @@ -392,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void) return 0; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index caf41c4869a0..aef70336d624 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -102,31 +102,94 @@ static inline void setup_node_to_cpumask_map(void) { } #include <asm-generic/topology.h> +/* Topology information */ +enum x86_topology_domains { + TOPO_SMT_DOMAIN, + TOPO_CORE_DOMAIN, + 
TOPO_MODULE_DOMAIN, + TOPO_TILE_DOMAIN, + TOPO_DIE_DOMAIN, + TOPO_DIEGRP_DOMAIN, + TOPO_PKG_DOMAIN, + TOPO_MAX_DOMAIN, +}; + +struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +}; + +extern struct x86_topology_system x86_topo_system; + +static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) +{ + return x86_topo_system.dom_size[dom]; +} + +static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) +{ + return dom == TOPO_SMT_DOMAIN ? 0 : x86_topo_system.dom_shifts[dom - 1]; +} + extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); -#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) -#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) -#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) -#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) -#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) +#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) +#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) -extern unsigned int __max_die_per_package; +#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) + +extern unsigned int __max_dies_per_package; +extern unsigned int __max_logical_packages; +extern unsigned int __max_threads_per_core; +extern unsigned int __num_threads_per_package; +extern unsigned int __num_cores_per_package; + +static inline unsigned int topology_max_packages(void) +{ + return __max_logical_packages; +} + +static inline unsigned int topology_max_dies_per_package(void) +{ + return __max_dies_per_package; +} + +static inline unsigned int topology_num_cores_per_package(void) +{ + return __num_cores_per_package; +} + +static inline unsigned int topology_num_threads_per_package(void) +{ + return __num_threads_per_package; +} + +#ifdef CONFIG_X86_LOCAL_APIC +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); +#else +static inline int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + return 0; +} +#endif #ifdef CONFIG_SMP -#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) +#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -extern unsigned int __max_logical_packages; -#define topology_max_packages() (__max_logical_packages) -static inline int topology_max_die_per_package(void) +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { - return __max_die_per_package; + return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], + TOPO_PKG_DOMAIN); } extern int __max_smt_threads; @@ -136,10 +199,14 @@ static inline int topology_max_smt_threads(void) return __max_smt_threads; } -int topology_update_package_map(unsigned int apicid, unsigned int cpu); -int topology_update_die_map(unsigned int dieid, unsigned int cpu); -int topology_phys_to_logical_pkg(unsigned int pkg); -bool 
topology_smt_supported(void); +#include <linux/cpu_smt.h> + +extern unsigned int __amd_nodes_per_pkg; + +static inline unsigned int topology_amd_nodes_per_pkg(void) +{ + return __amd_nodes_per_pkg; +} extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -152,17 +219,12 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } + #else /* CONFIG_SMP */ -#define topology_max_packages() (1) -static inline int -topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } -static inline int -topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } -static inline bool topology_smt_supported(void) { return false; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) @@ -220,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity +bool arch_enable_hybrid_capacity_scale(void); +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq); + +unsigned long arch_scale_cpu_capacity(int cpu); +#define arch_scale_cpu_capacity arch_scale_cpu_capacity + extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else +static inline bool arch_enable_hybrid_capacity_scale(void) { return false; } +static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, + unsigned long max_cap, + unsigned long cap_freq, + unsigned long base_freq) { } + static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 10b1de500ab1..a23a7b707b64 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_TRAP_PF_H #define _ASM_X86_TRAP_PF_H +#include <linux/bits.h> + /* * Page fault error code bits: * @@ -11,16 +13,20 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 6 == 1: shadow stack access fault * bit 15 == 1: SGX MMU page-fault + * bit 31 == 1: fault was due to RMP violation */ enum x86_pf_error_code { - X86_PF_PROT = 1 << 0, - X86_PF_WRITE = 1 << 1, - X86_PF_USER = 1 << 2, - X86_PF_RSVD = 1 << 3, - X86_PF_INSTR = 1 << 4, - X86_PF_PK = 1 << 5, - X86_PF_SGX = 1 << 15, + X86_PF_PROT = BIT(0), + X86_PF_WRITE = BIT(1), + X86_PF_USER = BIT(2), + X86_PF_RSVD = BIT(3), + X86_PF_INSTR = BIT(4), + X86_PF_PK = BIT(5), + X86_PF_SHSTK = BIT(6), + X86_PF_SGX = BIT(15), + X86_PF_RMP = BIT(31), }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b7..8d1154cdf787 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by 
FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 47ecfff2c83d..1f1deaecd364 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,11 +14,11 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); -void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif -extern bool ibt_selftest(void); +extern int ibt_selftest(void); +extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ @@ -47,4 +47,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs, struct stack_info *info); #endif +static inline void cond_local_irq_enable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void cond_local_irq_disable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 594fce0ca744..94408a784c8e 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -5,8 +5,9 @@ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H -#include <asm/processor.h> #include <asm/cpufeature.h> +#include <asm/processor.h> +#include <asm/msr.h> /* * Standard way to access the cycle counter. 
@@ -27,9 +28,6 @@ static inline cycles_t get_cycles(void) } #define get_cycles get_cycles -extern struct system_counterval_t convert_art_to_tsc(u64 art); -extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); - extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bae40a66282..3a7755c1a441 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -78,10 +78,10 @@ extern int __get_user_bad(void); int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - asm volatile("call __" #fn "_%P4" \ + asm volatile("call __" #fn "_%c[size]" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ - : "0" (ptr), "i" (sizeof(*(ptr)))); \ + : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \ instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ @@ -133,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -177,7 +177,7 @@ extern void __put_user_nocheck_8(void); __chk_user_ptr(__ptr); \ __ptr_pu = __ptr; \ __val_pu = __x; \ - asm volatile("call __" #fn "_%P[size]" \ + asm volatile("call __" #fn "_%c[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ @@ -295,7 +295,7 @@ do { \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -375,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -394,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -477,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ @@ -496,7 +496,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check -copy_mc_to_user(void *to, const void *from, unsigned len); +copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 81b826d3b753..afce8ee5d7b7 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,7 @@ #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> +#include <asm/percpu.h> #ifdef CONFIG_ADDRESS_MASKING /* @@ -18,14 +19,10 @@ */ static inline unsigned long __untagged_addr(unsigned long addr) { - /* - * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation - * in alternative instructions. 
The relocation gets wrong when gets - * copied to the target place. - */ asm (ALTERNATIVE("", - "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM) - : [addr] "+r" (addr) : "m" (tlbstate_untag_mask)); + "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) + : [addr] "+r" (addr) + : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); return addr; } @@ -54,7 +51,18 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, * half and a user half. When cast to a signed type, user pointers * are positive and kernel pointers are negative. */ -#define valid_user_address(x) ((long)(x) >= 0) +#define valid_user_address(x) ((__force long)(x) >= 0) + +/* + * Masking the user address is an alternative to a conditional + * user_access_begin that can avoid the fencing. This only works + * for dense accesses starting at the address. + */ +#define mask_user_address(x) ((typeof(x))((long)(x)|((long)(x)>>63))) +#define masked_user_access_begin(x) ({ \ + __auto_type __masked_ptr = (x); \ + __masked_ptr = mask_user_address(__masked_ptr); \ + __uaccess_begin(); __masked_ptr; }) /* * User pointers can have tag bits on x86-64. This scheme tolerates @@ -87,8 +95,9 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size) if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { return valid_user_address(ptr); } else { - unsigned long sum = size + (unsigned long)ptr; - return valid_user_address(sum) && sum >= (unsigned long)ptr; + unsigned long sum = size + (__force unsigned long)ptr; + + return valid_user_address(sum) && sum >= (__force unsigned long)ptr; } } #define __access_ok __access_ok @@ -116,7 +125,7 @@ copy_user_generic(void *to, const void *from, unsigned long len) "2:\n" _ASM_EXTABLE_UA(1b, 2b) :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT - : : "memory", "rax", "r8", "r9", "r10", "r11"); + : : "memory", "rax"); clac(); return len; } diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 761173ccc33c..6c9e5bdd3916 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -56,6 +56,5 @@ # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE -# define __ARCH_WANT_SYS_CLONE3 #endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 1b6455f881f9..6989b824fd32 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -10,6 +10,7 @@ * Copyright (c) Russ Anderson <rja@sgi.com> */ +#include <linux/efi.h> #include <linux/rtc.h> /* @@ -115,7 +116,8 @@ struct uv_arch_type_entry { struct uv_systab { char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ - u64 function; /* BIOS runtime callback function ptr */ + u64 (__efiapi *function)(enum uv_bios_cmd, ...); + /* BIOS runtime callback function ptr */ u32 size; /* systab size (starting with _VERSION_UV4) */ struct { u32 type:8; /* type of entry */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5fa76c2ced51..ea877fd83114 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -653,7 +653,7 @@ static inline int uv_blade_to_node(int blade) return uv_socket_to_node(blade); } -/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ +/* Blade number of current cpu. Numbered 0 .. 
<#blades -1> */ static inline int uv_numa_blade_id(void) { return uv_hub_info->numa_blade_id; diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index d6b17c760622..1876b5edd142 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -31,7 +31,6 @@ enum { UV_AFFINITY_CPU }; -extern int uv_irq_2_mmr_info(int, unsigned long *, int *); extern int uv_setup_irq(char *, int, int, unsigned long, int); extern void uv_teardown_irq(unsigned int); diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..ff5334ad32a0 --- /dev/null +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> +#include <asm/vvar.h> + +/** + * getrandom_syscall - Invoke the getrandom() syscall. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) +{ + long ret; + + asm ("syscall" : "=a" (ret) : + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : + "rcx", "r11", "memory"); + + return ret; +} + +#define __vdso_rng_data (VVAR(_vdso_rng_data)) + +static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) +{ + if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) + return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data); + return &__vdso_rng_data; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index c81858d903dc..b2d2df026f6e 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) #define vdso_cycles_ok arch_vdso_cycles_ok /* - * x86 specific delta calculation. + * x86 specific calculation of nanoseconds for the current cycle count * * The regular implementation assumes that clocksource reads are globally * monotonic. The TSC can be slightly off across sockets which can cause @@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * jump. * * Therefore it needs to be verified that @cycles are greater than - * @last. If not then use @last, which is the base time of the current - * conversion period. + * @vd->cycles_last. If not then use @vd->cycles_last, which is the base + * time of the current conversion period. * * This variant also uses a custom mask because while the clocksource mask of * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses @@ -317,27 +317,36 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * declares everything with the MSB/Sign-bit set as invalid. Therefore the * effective mask is S64_MAX. 
*/ -static __always_inline -u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) { - /* - * Due to the MSB/Sign-bit being used as invald marker (see - * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. - */ - u64 delta = (cycles - last) & S64_MAX; + u64 delta = cycles - vd->cycle_last; /* - * Due to the above mentioned TSC wobbles, filter out negative motion. - * Per the above masking, the effective sign bit is now bit 62. + * Negative motion and deltas which can cause multiplication + * overflow require special treatment. This check covers both as + * negative motion is guaranteed to be greater than @vd::max_cycles + * due to unsigned comparison. + * + * Due to the MSB/Sign-bit being used as invalid marker (see + * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that + * case is also unlikely and will also take the unlikely path here. */ - if (unlikely(delta & (1ULL << 62))) - return 0; + if (unlikely(delta > vd->max_cycles)) { + /* + * Due to the above mentioned TSC wobbles, filter out + * negative motion. Per the above masking, the effective + * sign bit is now bit 62. + */ + if (delta & (1ULL << 62)) + return base >> vd->shift; + + /* Handle multiplication overflow gracefully */ + return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift); + } - return delta * mult; + return ((delta * vd->mult) + base) >> vd->shift; } -#define vdso_calc_delta vdso_calc_delta - -int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts); +#define vdso_calc_ns vdso_calc_ns #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index be199a9b2676..67fedf1698b5 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -4,13 +4,11 @@ #ifndef __ASSEMBLY__ -#include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <asm/vgtod.h> #include <asm/vvar.h> -DEFINE_VVAR(struct vdso_data, _vdso_data); /* * Update the vDSO data page to keep in sync with kernel timekeeping. 
*/ @@ -21,6 +19,13 @@ struct vdso_data *__x86_get_k_vdso_data(void) } #define __arch_get_k_vdso_data __x86_get_k_vdso_data +static __always_inline +struct vdso_rng_data *__x86_get_k_vdso_rng_data(void) +{ + return &_vdso_rng_data; +} +#define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data + /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 7aa38b2ad8a9..a0ce291abcae 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -14,11 +14,6 @@ #include <uapi/linux/time.h> -#ifdef BUILD_VDSO32_64 -typedef u64 gtod_long_t; -#else -typedef unsigned long gtod_long_t; -#endif #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h new file mode 100644 index 000000000000..0950c9535fae --- /dev/null +++ b/arch/x86/include/asm/video.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_VIDEO_H +#define _ASM_X86_VIDEO_H + +#include <linux/types.h> + +#include <asm/page.h> + +struct device; + +pgprot_t pgprot_framebuffer(pgprot_t prot, + unsigned long vm_start, unsigned long vm_end, + unsigned long offset); +#define pgprot_framebuffer pgprot_framebuffer + +bool video_is_primary_device(struct device *dev); +#define video_is_primary_device video_is_primary_device + +#include <asm-generic/video.h> + +#endif /* _ASM_X86_VIDEO_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 3b12e6b99412..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. - */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. 
- */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. - */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 9e8ac5073ecb..62ee19909903 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -84,7 +84,7 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } -#define free_vm86(t) do { } while(0) +#define free_vm86(task) do { (void)(task); } while(0) #endif /* CONFIG_VM86 */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index ac9fc51e2b18..c9cf43d5ef23 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -7,51 +7,321 @@ #include <linux/stringify.h> /* - * The hypercall definitions differ in the low word of the %edx argument - * in the following way: the old port base interface uses the port - * number to distinguish between high- and low bandwidth versions. + * VMware hypercall ABI. + * + * - Low bandwidth (LB) hypercalls (I/O port based, vmcall and vmmcall) + * have up to 6 input and 6 output arguments passed and returned using + * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3), + * %esi (arg4), %edi (arg5). + * The following input arguments must be initialized by the caller: + * arg0 - VMWARE_HYPERVISOR_MAGIC + * arg2 - Hypercall command + * arg3 bits [15:0] - Port number, LB and direction flags + * + * - Low bandwidth TDX hypercalls (x86_64 only) are similar to LB + * hypercalls. 
They also have up to 6 input and 6 output on registers + * arguments, with different argument to register mapping: + * %r12 (arg0), %rbx (arg1), %r13 (arg2), %rdx (arg3), + * %rsi (arg4), %rdi (arg5). + * + * - High bandwidth (HB) hypercalls are I/O port based only. They have + * up to 7 input and 7 output arguments passed and returned using + * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3), + * %esi (arg4), %edi (arg5), %ebp (arg6). + * The following input arguments must be initialized by the caller: + * arg0 - VMWARE_HYPERVISOR_MAGIC + * arg1 - Hypercall command + * arg3 bits [15:0] - Port number, HB and direction flags + * + * For compatibility purposes, x86_64 systems use only lower 32 bits + * for input and output arguments. + * + * The hypercall definitions differ in the low word of the %edx (arg3) + * in the following way: the old I/O port based interface uses the port + * number to distinguish between high- and low bandwidth versions, and + * uses IN/OUT instructions to define transfer direction. * * The new vmcall interface instead uses a set of flags to select * bandwidth mode and transfer direction. The flags should be loaded - * into %dx by any user and are automatically replaced by the port - * number if the VMWARE_HYPERVISOR_PORT method is used. - * - * In short, new driver code should strictly use the new definition of - * %dx content. + * into arg3 by any user and are automatically replaced by the port + * number if the I/O port method is used. */ -/* Old port-based version */ -#define VMWARE_HYPERVISOR_PORT 0x5658 -#define VMWARE_HYPERVISOR_PORT_HB 0x5659 +#define VMWARE_HYPERVISOR_HB BIT(0) +#define VMWARE_HYPERVISOR_OUT BIT(1) -/* Current vmcall / vmmcall version */ -#define VMWARE_HYPERVISOR_HB BIT(0) -#define VMWARE_HYPERVISOR_OUT BIT(1) +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_PORT_HB (VMWARE_HYPERVISOR_PORT | \ + VMWARE_HYPERVISOR_HB) -/* The low bandwidth call. The low word of edx is presumed clear. */ -#define VMWARE_HYPERCALL \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \ - "inl (%%dx), %%eax", \ - "vmcall", X86_FEATURE_VMCALL, \ - "vmmcall", X86_FEATURE_VMW_VMMCALL) +#define VMWARE_HYPERVISOR_MAGIC 0x564d5868U +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_STEALCLOCK 91 /* - * The high bandwidth out call. The low word of edx is presumed to have the - * HB and OUT bits set. + * Hypercall command mask: + * bits [6:0] command, range [0, 127] + * bits [19:16] sub-command, range [0, 15] */ -#define VMWARE_HYPERCALL_HB_OUT \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ - "rep outsb", \ - "vmcall", X86_FEATURE_VMCALL, \ - "vmmcall", X86_FEATURE_VMW_VMMCALL) +#define VMWARE_CMD_MASK 0xf007fU + +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + +extern unsigned long vmware_hypercall_slow(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5); + +#define VMWARE_TDX_VENDOR_LEAF 0x1af7e4909ULL +#define VMWARE_TDX_HCALL_FUNC 1 + +extern unsigned long vmware_tdx_hypercall(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5); /* - * The high bandwidth in call. The low word of edx is presumed to have the - * HB bit set. + * The low bandwidth call. 
The low word of %edx is presumed to have OUT bit + * set. The high word of %edx may contain input data from the caller. */ -#define VMWARE_HYPERCALL_HB_IN \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ - "rep insb", \ - "vmcall", X86_FEATURE_VMCALL, \ +#define VMWARE_HYPERCALL \ + ALTERNATIVE_2("movw %[port], %%dx\n\t" \ + "inl (%%dx), %%eax", \ + "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) + +static inline +unsigned long vmware_hypercall1(unsigned long cmd, unsigned long in1) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + NULL, NULL, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + NULL, NULL, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall3(unsigned long cmd, unsigned long in1, + u32 *out1, u32 *out2) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + out1, out2, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + out1, out2, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall4(unsigned long cmd, unsigned long in1, + u32 *out1, u32 *out2, u32 *out3) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + out1, out2, out3, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + out1, out2, out3, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall5(unsigned long cmd, unsigned long in1, + unsigned long in3, unsigned long in4, + unsigned long in5, u32 *out2) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, in3, in4, in5, + NULL, out2, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, in4, in5, + NULL, out2, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=c" (*out2) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall6(unsigned long cmd, unsigned long in1, + unsigned long in3, u32 *out2, + u32 *out3, u32 *out4, u32 *out5) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, in3, 0, 0, + NULL, out2, out3, out4, out5); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, 0, 0, + 
NULL, out2, out3, out4, out5); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=c" (*out2), "=d" (*out3), "=S" (*out4), + "=D" (*out5) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall7(unsigned long cmd, unsigned long in1, + unsigned long in3, unsigned long in4, + unsigned long in5, u32 *out1, + u32 *out2, u32 *out3) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, in3, in4, in5, + out1, out2, out3, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, in4, in5, + out1, out2, out3, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + return out0; +} + +#ifdef CONFIG_X86_64 +#define VMW_BP_CONSTRAINT "r" +#else +#define VMW_BP_CONSTRAINT "m" +#endif + +/* + * High bandwidth calls are not supported on encrypted memory guests. + * The caller should check cc_platform_has(CC_ATTR_MEM_ENCRYPT) and use + * low bandwidth hypercall if memory encryption is set. + * This assumption simplifies HB hypercall implementation to just I/O port + * based approach without alternative patching. + */ +static inline +unsigned long vmware_hypercall_hb_out(unsigned long cmd, unsigned long in2, + unsigned long in3, unsigned long in4, + unsigned long in5, unsigned long in6, + u32 *out1) +{ + unsigned long out0; + + asm_inline volatile ( + UNWIND_HINT_SAVE + "push %%" _ASM_BP "\n\t" + UNWIND_HINT_UNDEFINED + "mov %[in6], %%" _ASM_BP "\n\t" + "rep outsb\n\t" + "pop %%" _ASM_BP "\n\t" + UNWIND_HINT_RESTORE + : "=a" (out0), "=b" (*out1) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (cmd), + "c" (in2), + "d" (in3 | VMWARE_HYPERVISOR_PORT_HB), + "S" (in4), + "D" (in5), + [in6] VMW_BP_CONSTRAINT (in6) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall_hb_in(unsigned long cmd, unsigned long in2, + unsigned long in3, unsigned long in4, + unsigned long in5, unsigned long in6, + u32 *out1) +{ + unsigned long out0; + + asm_inline volatile ( + UNWIND_HINT_SAVE + "push %%" _ASM_BP "\n\t" + UNWIND_HINT_UNDEFINED + "mov %[in6], %%" _ASM_BP "\n\t" + "rep insb\n\t" + "pop %%" _ASM_BP "\n\t" + UNWIND_HINT_RESTORE + : "=a" (out0), "=b" (*out1) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (cmd), + "c" (in2), + "d" (in3 | VMWARE_HYPERVISOR_PORT_HB), + "S" (in4), + "D" (in5), + [in6] VMW_BP_CONSTRAINT (in6) + : "cc", "memory"); + return out0; +} +#undef VMW_BP_CONSTRAINT +#undef VMWARE_HYPERCALL + #endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..f7fd4369b821 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -17,6 +17,7 @@ #include <linux/types.h> #include <uapi/asm/vmx.h> +#include <asm/trapnr.h> #include <asm/vmxfeatures.h> #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -70,8 +71,9 @@ #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) +#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) #define SECONDARY_EXEC_PT_CONCEAL_VMX 
VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) @@ -120,19 +122,17 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f -#define VMX_MISC_SAVE_EFER_LMA 0x00000020 -#define VMX_MISC_ACTIVITY_HLT 0x00000040 -#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 -#define VMX_MISC_ZERO_LEN_INS 0x40000000 -#define VMX_MISC_MSR_LIST_MULTIPLIER 512 - /* VMFUNC functions */ #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 +#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) +#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) +#define VMX_BASIC_INOUT BIT_ULL(54) +#define VMX_BASIC_TRUE_CTLS BIT_ULL(55) + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); @@ -143,9 +143,30 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; } +static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; +} + +static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) +{ + return revision | ((u64)size << 32) | ((u64)memtype << 50); +} + +#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) +#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) +#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) +#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8) +#define VMX_MISC_INTEL_PT BIT_ULL(14) +#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15) +#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28) +#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29) +#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30) +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { - return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + return vmx_misc & GENMASK_ULL(4, 0); } static inline int vmx_misc_cr3_count(u64 vmx_misc) @@ -225,6 +246,8 @@ enum vmcs_field { VMREAD_BITMAP_HIGH = 0x00002027, VMWRITE_BITMAP = 0x00002028, VMWRITE_BITMAP_HIGH = 0x00002029, + VE_INFORMATION_ADDRESS = 0x0000202A, + VE_INFORMATION_ADDRESS_HIGH = 0x0000202B, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, ENCLS_EXITING_BITMAP = 0x0000202E, @@ -374,14 +397,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR 
(EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 @@ -504,15 +527,17 @@ enum vmcs_field { #define VMX_EPTP_PWL_4 0x18ull #define VMX_EPTP_PWL_5 0x20ull #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) +/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */ #define VMX_EPTP_MT_MASK 0x7ull -#define VMX_EPTP_MT_WB 0x6ull -#define VMX_EPTP_MT_UC 0x0ull +#define VMX_EPTP_MT_WB X86_MEMTYPE_WB +#define VMX_EPTP_MT_UC X86_MEMTYPE_UC #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63) #define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) @@ -629,4 +654,13 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +struct vmx_ve_information { + u32 exit_reason; + u32 delivery; + u64 exit_qualification; + u64 guest_linear_address; + u64 guest_physical_address; + u16 eptp_index; +}; + #endif diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index c6a7eed03914..09b1d7e607c1 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -9,84 +9,85 @@ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature bit + * is not displayed in /proc/cpuinfo at all. 
*/ /* Pin-Based VM-Execution Controls, EPT/VPID, APIC and VM-Functions, word 0 */ -#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* "" VM-Exit on vectored interrupts */ -#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* "" VM-Exit on NMIs */ +#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* VM-Exit on vectored interrupts */ +#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* VM-Exit on NMIs */ #define VMX_FEATURE_VIRTUAL_NMIS ( 0*32+ 5) /* "vnmi" NMI virtualization */ -#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* VMX Preemption Timer */ -#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* Posted Interrupts */ +#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* "preemption_timer" VMX Preemption Timer */ +#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* "posted_intr" Posted Interrupts */ /* EPT/VPID features, scattered to bits 16-23 */ -#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* INVVPID is supported */ +#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* "invvpid" INVVPID is supported */ #define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */ -#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */ -#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */ +#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* "ept_ad" EPT Accessed/Dirty bits */ +#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* "ept_1gb" 1GB EPT pages */ +#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* "ept_5level" 5-level EPT paging */ /* Aggregated APIC features 24-27 */ -#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */ -#define VMX_FEATURE_APICV ( 0*32+ 25) /* TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */ +#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* "flexpriority" TPR shadow + virt APIC */ +#define VMX_FEATURE_APICV ( 0*32+ 25) /* "apicv" TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */ /* VM-Functions, shifted to bits 28-31 */ -#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* EPTP switching (in guest) */ +#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* "eptp_switching" EPTP switching (in guest) */ /* Primary Processor-Based VM-Execution Controls, word 1 */ -#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */ +#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* VM-Exit if INTRs are unblocked in guest */ #define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */ -#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* "" VM-Exit on HLT */ -#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* "" VM-Exit on INVLPG */ -#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* "" VM-Exit on MWAIT */ -#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* "" VM-Exit on RDPMC */ -#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ -#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ -#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ -#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ -#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ -#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ +#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* VM-Exit on HLT */ +#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* VM-Exit on INVLPG */ +#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* VM-Exit on MWAIT */ +#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* VM-Exit on RDPMC */ +#define 
VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* VM-Exit on RDTSC */ +#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* VM-Exit on writes to CR3 */ +#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* Enable Tertiary VM-Execution Controls */ +#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* VM-Exit on writes to CR8 */ +#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ -#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */ -#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */ -#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/ -#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* "" VM-Exit based on I/O port */ +#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* VM-Exit if NMIs are unblocked in guest */ +#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* VM-Exit on accesses to debug registers */ +#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* VM-Exit on *all* IN{S} and OUT{S}*/ +#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* VM-Exit based on I/O port */ #define VMX_FEATURE_MONITOR_TRAP_FLAG ( 1*32+ 27) /* "mtf" VMX single-step VM-Exits */ -#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* "" VM-Exit based on MSR index */ -#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* "" VM-Exit on MONITOR (MWAIT's accomplice) */ -#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* "" VM-Exit on PAUSE (unconditionally) */ -#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* "" Enable Secondary VM-Execution Controls */ +#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* VM-Exit based on MSR index */ +#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* VM-Exit on MONITOR (MWAIT's accomplice) */ +#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* VM-Exit on PAUSE (unconditionally) */ +#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* Enable Secondary VM-Execution Controls */ /* Secondary Processor-Based VM-Execution Controls, word 2 */ #define VMX_FEATURE_VIRT_APIC_ACCESSES ( 2*32+ 0) /* "vapic" Virtualize memory mapped APIC accesses */ -#define VMX_FEATURE_EPT ( 2*32+ 1) /* Extended Page Tables, a.k.a. Two-Dimensional Paging */ -#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* "" VM-Exit on {S,L}*DT instructions */ -#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* "" Enable RDTSCP in guest */ -#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* "" Virtualize X2APIC for the guest */ -#define VMX_FEATURE_VPID ( 2*32+ 5) /* Virtual Processor ID (TLB ASID modifier) */ -#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* "" VM-Exit on WBINVD */ -#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* Allow Big Real Mode and other "invalid" states */ +#define VMX_FEATURE_EPT ( 2*32+ 1) /* "ept" Extended Page Tables, a.k.a. 
Two-Dimensional Paging */ +#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* VM-Exit on {S,L}*DT instructions */ +#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* Enable RDTSCP in guest */ +#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* Virtualize X2APIC for the guest */ +#define VMX_FEATURE_VPID ( 2*32+ 5) /* "vpid" Virtual Processor ID (TLB ASID modifier) */ +#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* VM-Exit on WBINVD */ +#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* "unrestricted_guest" Allow Big Real Mode and other "invalid" states */ #define VMX_FEATURE_APIC_REGISTER_VIRT ( 2*32+ 8) /* "vapic_reg" Hardware emulation of reads to the virtual-APIC */ #define VMX_FEATURE_VIRT_INTR_DELIVERY ( 2*32+ 9) /* "vid" Evaluation and delivery of pending virtual interrupts */ #define VMX_FEATURE_PAUSE_LOOP_EXITING ( 2*32+ 10) /* "ple" Conditionally VM-Exit on PAUSE at CPL0 */ -#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* "" VM-Exit on RDRAND*/ -#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* "" Enable INVPCID in guest */ -#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* "" Enable VM-Functions (leaf dependent) */ -#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* VMREAD/VMWRITE in guest can access shadow VMCS */ -#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */ -#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */ +#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* VM-Exit on RDRAND*/ +#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* Enable INVPCID in guest */ +#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* Enable VM-Functions (leaf dependent) */ +#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* "shadow_vmcs" VMREAD/VMWRITE in guest can access shadow VMCS */ +#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* VM-Exit on ENCLS (leaf dependent) */ +#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* VM-Exit on RDSEED */ #define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */ -#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */ -#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */ -#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */ +#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "ept_violation_ve" Conditionally reflect EPT violations as #VE exceptions */ +#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* Suppress VMX indicators in Processor Trace */ +#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* Enable XSAVES and XRSTORS in guest */ #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. 
user */ -#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* "" Processor Trace logs GPAs */ -#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ -#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ -#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ -#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ -#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ +#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* Processor Trace logs GPAs */ +#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* "tsc_scaling" Scale hardware TSC when read in guest */ +#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* "usr_wait_pause" Enable TPAUSE, UMONITOR, UMWAIT in guest */ +#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* VM-Exit on ENCLV (leaf dependent) */ +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* "notify_vm_exiting" VM-Exit when no event windows after notify window */ /* Tertiary Processor-Based VM-Execution Controls, word 3 */ -#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* "ipi_virt" Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index ab60a71a8dcb..472f0263dbc6 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include <linux/seqlock.h> #include <uapi/asm/vsyscall.h> +#include <asm/page_types.h> #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); @@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code, } #endif +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. 
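With is_vsyscall_vaddr() available from this header, fault-handling code can test for the vsyscall page without open-coding the address check. A sketch of a typical caller, roughly following the long-standing arch/x86/mm/fault.c logic (reproduced from memory, not part of this patch; details may differ):

static bool fault_in_kernel_space(unsigned long address)
{
	/*
	 * On 64-bit, the vsyscall page sits above TASK_SIZE_MAX but is
	 * treated as a user-mode access rather than a kernel fault.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}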
+ */ +static inline bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 183e98e49ab9..9d9af37f7cab 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -26,6 +26,8 @@ */ #define DECLARE_VVAR(offset, type, name) \ EMIT_VVAR(name, offset) +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + EMIT_VVAR(name, offset) #else @@ -37,6 +39,10 @@ extern char __vvar_page; extern type timens_ ## name[CS_BASES] \ __attribute__((visibility("hidden"))); \ +#define DECLARE_VVAR_SINGLE(offset, type, name) \ + extern type vvar_ ## name \ + __attribute__((visibility("hidden"))); \ + #define VVAR(name) (vvar_ ## name) #define TIMENS(name) (timens_ ## name) @@ -44,12 +50,22 @@ extern char __vvar_page; type name[CS_BASES] \ __attribute__((section(".vvar_" #name), aligned(16))) __visible +#define DEFINE_VVAR_SINGLE(type, name) \ + type name \ + __attribute__((section(".vvar_" #name), aligned(16))) __visible + #endif /* DECLARE_VVAR(offset, type, name) */ DECLARE_VVAR(128, struct vdso_data, _vdso_data) +#if !defined(_SINGLE_DATA) +#define _SINGLE_DATA +DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data) +#endif + #undef DECLARE_VVAR +#undef DECLARE_VVAR_SINGLE #endif diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 46b4f1f7f354..422a47746657 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -2,47 +2,15 @@ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H -#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/wordpart.h> -/* - * This is largely generic for little-endian machines, but the - * optimal byte mask counting is probably going to be something - * that is architecture-specific. If you have a reliably fast - * bit count instruction, that might be better than the multiply - * and shift, for example. - */ struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } -#ifdef CONFIG_64BIT - -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. 
- */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} - -#else /* 32-bit case */ - -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - -#endif - /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { @@ -56,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, return bits; } +#ifdef CONFIG_64BIT + +/* Keep the initial has_zero() value for both bitmask and size calc */ +#define create_zero_mask(bits) (bits) + +static inline unsigned long zero_bytemask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +#define find_zero(bits) (__ffs(bits) >> 3) + +#else + +/* Create the final mask for both bytemask and size */ static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; @@ -65,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits) /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline unsigned long find_zero(unsigned long mask) { - return count_masked_bytes(mask); + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; } +#endif + /* * Load an unaligned word from kernel space. * diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5240d88db52a..213cf5379a5a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include <asm/bootparam.h> - struct ghcb; struct mpc_bus; struct mpc_cpu; @@ -15,13 +13,15 @@ struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops * @setup_ioapic_ids: platform specific ioapic id override - * @find_smp_config: find the smp configuration - * @get_smp_config: get the smp configuration + * @find_mptable: Find MPTABLE early to reserve the memory region + * @early_parse_smp_cfg: Parse the SMP configuration data early before initmem_init() + * @parse_smp_cfg: Parse the SMP configuration data */ struct x86_init_mpparse { void (*setup_ioapic_ids)(void); - void (*find_smp_config)(void); - void (*get_smp_config)(unsigned int early); + void (*find_mptable)(void); + void (*early_parse_smp_cfg)(void); + void (*parse_smp_cfg)(void); }; /** @@ -30,12 +30,13 @@ struct x86_init_mpparse { * @reserve_resources: reserve the standard resources for the * platform * @memory_setup: platform specific memory setup - * + * @dmi_setup: platform specific DMI setup */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); char *(*memory_setup)(void); + void (*dmi_setup)(void); }; /** @@ -148,12 +149,22 @@ struct x86_init_acpi { * @enc_status_change_finish Notify HV after the encryption status of a range is changed * @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status + * @enc_kexec_begin Begin the two-step process of converting shared memory back + * to private. 
It stops the new conversions from being started + * and waits in-flight conversions to finish, if possible. + * @enc_kexec_finish Finish the two-step process of converting shared memory to + * private. All memory is private after the call when + * the function returns. + * It is called on only one CPU while the others are shut down + * and with interrupts disabled. */ struct x86_guest { - bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); - bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); + int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); + int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); bool (*enc_tlb_flush_required)(bool enc); bool (*enc_cache_flush_required)(void); + void (*enc_kexec_begin)(void); + void (*enc_kexec_finish)(void); }; /** @@ -177,7 +188,7 @@ struct x86_init_ops { * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device * @early_percpu_clock_init: early init of the per cpu clock event device - * @fixup_cpu_id: fixup function for cpuinfo_x86::phys_proc_id + * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id * @parallel_bringup: Parallel bringup control */ struct x86_cpuinit_ops { diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 5fc35f889cd1..a9088250770f 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -36,6 +36,7 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; +#include <asm/bug.h> #include <asm/processor.h> #define XEN_SIGNATURE "XenVMMXenVMM" @@ -63,4 +64,49 @@ void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); #endif +/* Lazy mode for batching updates / context switch */ +enum xen_lazy_mode { + XEN_LAZY_NONE, + XEN_LAZY_MMU, + XEN_LAZY_CPU, +}; + +DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode); +DECLARE_PER_CPU(unsigned int, xen_lazy_nesting); + +static inline void enter_lazy(enum xen_lazy_mode mode) +{ + enum xen_lazy_mode old_mode = this_cpu_read(xen_lazy_mode); + + if (mode == old_mode) { + this_cpu_inc(xen_lazy_nesting); + return; + } + + BUG_ON(old_mode != XEN_LAZY_NONE); + + this_cpu_write(xen_lazy_mode, mode); +} + +static inline void leave_lazy(enum xen_lazy_mode mode) +{ + BUG_ON(this_cpu_read(xen_lazy_mode) != mode); + + if (this_cpu_read(xen_lazy_nesting) == 0) + this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE); + else + this_cpu_dec(xen_lazy_nesting); +} + +enum xen_lazy_mode xen_get_lazy_mode(void); + +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index c599ec269a25..c10f279aae93 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h @@ -61,7 +61,7 @@ * RING1 -> RING3 kernel mode. * RING2 -> RING3 kernel mode. * RING3 -> RING3 user mode. 
- * However RING0 indicates that the guest kernel should return to iteself + * However RING0 indicates that the guest kernel should return to itself * directly with * orb $3,1*8(%rsp) * iretq diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fa9ec20783fa..85e63d58c074 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +static inline unsigned long virt_to_pfn(const void *v) +{ + return PFN_DOWN(__pa(v)); +} #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 77a2d19cc990..abde0f44df57 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_SWIOTLB_XEN_H #define _ASM_X86_SWIOTLB_XEN_H -#ifdef CONFIG_SWIOTLB_XEN -extern int pci_xen_swiotlb_init_late(void); -#else -static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } -#endif - int xen_swiotlb_fixup(void *buf, unsigned long nslabs); int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 769b939444ae..e5d182c7373c 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -47,6 +47,9 @@ enum hsmp_message_ids { HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ + HSMP_GET_METRIC_TABLE_VER, /* 23h Get metrics table version */ + HSMP_GET_METRIC_TABLE, /* 24h Get metrics table */ + HSMP_GET_METRIC_TABLE_DRAM_ADDR,/* 25h Get metrics table dram address */ HSMP_MSG_ID_MAX, }; @@ -64,6 +67,14 @@ enum hsmp_msg_type { HSMP_GET = 1, }; +enum hsmp_proto_versions { + HSMP_PROTO_VER2 = 2, + HSMP_PROTO_VER3, + HSMP_PROTO_VER4, + HSMP_PROTO_VER5, + HSMP_PROTO_VER6 +}; + struct hsmp_msg_desc { int num_args; int response_sz; @@ -227,7 +238,7 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { /* * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 * input: args[0] = DIMM address[7:0] - * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] + * DIMM address[7:0] */ {1, 1, HSMP_GET}, @@ -295,6 +306,104 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * input: args[0] = min df pstate[15:8] + max df pstate[7:0] */ {1, 0, HSMP_SET}, + + /* + * HSMP_GET_METRIC_TABLE_VER, num_args = 0, response_sz = 1 + * output: args[0] = metrics table version + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE, num_args = 0, response_sz = 0 + */ + {0, 0, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE_DRAM_ADDR, num_args = 0, response_sz = 2 + * output: args[0] = lower 32 bits of the address + * output: args[1] = upper 32 bits of the address + */ + {0, 2, HSMP_GET}, +}; + +/* Metrics table (supported only with proto version 6) */ +struct hsmp_metric_table { + __u32 accumulation_counter; + + /* TEMPERATURE */ + __u32 max_socket_temperature; + __u32 max_vr_temperature; + __u32 
max_hbm_temperature; + __u64 max_socket_temperature_acc; + __u64 max_vr_temperature_acc; + __u64 max_hbm_temperature_acc; + + /* POWER */ + __u32 socket_power_limit; + __u32 max_socket_power_limit; + __u32 socket_power; + + /* ENERGY */ + __u64 timestamp; + __u64 socket_energy_acc; + __u64 ccd_energy_acc; + __u64 xcd_energy_acc; + __u64 aid_energy_acc; + __u64 hbm_energy_acc; + + /* FREQUENCY */ + __u32 cclk_frequency_limit; + __u32 gfxclk_frequency_limit; + __u32 fclk_frequency; + __u32 uclk_frequency; + __u32 socclk_frequency[4]; + __u32 vclk_frequency[4]; + __u32 dclk_frequency[4]; + __u32 lclk_frequency[4]; + __u64 gfxclk_frequency_acc[8]; + __u64 cclk_frequency_acc[96]; + + /* FREQUENCY RANGE */ + __u32 max_cclk_frequency; + __u32 min_cclk_frequency; + __u32 max_gfxclk_frequency; + __u32 min_gfxclk_frequency; + __u32 fclk_frequency_table[4]; + __u32 uclk_frequency_table[4]; + __u32 socclk_frequency_table[4]; + __u32 vclk_frequency_table[4]; + __u32 dclk_frequency_table[4]; + __u32 lclk_frequency_table[4]; + __u32 max_lclk_dpm_range; + __u32 min_lclk_dpm_range; + + /* XGMI */ + __u32 xgmi_width; + __u32 xgmi_bitrate; + __u64 xgmi_read_bandwidth_acc[8]; + __u64 xgmi_write_bandwidth_acc[8]; + + /* ACTIVITY */ + __u32 socket_c0_residency; + __u32 socket_gfx_busy; + __u32 dram_bandwidth_utilization; + __u64 socket_c0_residency_acc; + __u64 socket_gfx_busy_acc; + __u64 dram_bandwidth_acc; + __u32 max_dram_bandwidth; + __u64 dram_bandwidth_utilization_acc; + __u64 pcie_bandwidth_acc[4]; + + /* THROTTLERS */ + __u32 prochot_residency_acc; + __u32 ppt_residency_acc; + __u32 socket_thm_residency_acc; + __u32 vr_thm_residency_acc; + __u32 hbm_thm_residency_acc; + __u32 spare; + + /* New items at the end to maintain driver compatibility */ + __u32 gfxclk_frequency[8]; }; /* Reset to default packing */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 01d19fc22346..9b82eebd7add 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,21 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data/setup_indirect types */ -#define SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 -#define SETUP_PCI 3 -#define SETUP_EFI 4 -#define SETUP_APPLE_PROPERTIES 5 -#define SETUP_JAILHOUSE 6 -#define SETUP_CC_BLOB 7 -#define SETUP_IMA 8 -#define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED - -#define SETUP_INDIRECT (1<<31) -#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) +#include <asm/setup_data.h> /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -38,6 +24,7 @@ #define XLF_EFI_KEXEC (1<<4) #define XLF_5LEVEL (1<<5) #define XLF_5LEVEL_ENABLED (1<<6) +#define XLF_MEM_ENCRYPTION (1<<7) #ifndef __ASSEMBLY__ @@ -48,22 +35,6 @@ #include <asm/ist.h> #include <video/edid.h> -/* extensible setup data list node */ -struct setup_data { - __u64 next; - __u32 type; - __u32 len; - __u8 data[]; -}; - -/* extensible setup indirect data node */ -struct setup_indirect { - __u32 type; - __u32 reserved; /* Reserved, must be set to zero. */ - __u64 len; - __u64 addr; -}; - struct setup_header { __u8 setup_sects; __u16 root_flags; @@ -137,50 +108,10 @@ struct efi_info { #define E820_MAX_ENTRIES_ZEROPAGE 128 /* - * The E820 memory region entry of the boot protocol ABI: - */ -struct boot_e820_entry { - __u64 addr; - __u64 size; - __u32 type; -} __attribute__((packed)); - -/* * Smallest compatible version of jailhouse_setup_data required by this kernel. 
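HSMP_GET_METRIC_TABLE_DRAM_ADDR returns the table location split across two response words, so a consumer has to stitch args[1]:args[0] back into a single 64-bit DRAM address before mapping the metrics table. A small helper, purely illustrative (the helper name is not part of this patch):

static inline __u64 hsmp_metric_table_dram_addr(__u32 args_lo, __u32 args_hi)
{
	/* args[0] carries bits 31:0, args[1] carries bits 63:32 */
	return ((__u64)args_hi << 32) | args_lo;
}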
*/ #define JAILHOUSE_SETUP_REQUIRED_VERSION 1 -/* - * The boot loader is passing platform information via this Jailhouse-specific - * setup data structure. - */ -struct jailhouse_setup_data { - struct { - __u16 version; - __u16 compatible_version; - } __attribute__((packed)) hdr; - struct { - __u16 pm_timer_address; - __u16 num_cpus; - __u64 pci_mmconfig_base; - __u32 tsc_khz; - __u32 apic_khz; - __u8 standard_ioapic; - __u8 cpu_ids[255]; - } __attribute__((packed)) v1; - struct { - __u32 flags; - } __attribute__((packed)) v2; -} __attribute__((packed)); - -/* - * IMA buffer setup data information from the previous kernel during kexec - */ -struct ima_setup_data { - __u64 addr; - __u64 size; -} __attribute__((packed)); - /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ diff --git a/arch/x86/include/uapi/asm/elf.h b/arch/x86/include/uapi/asm/elf.h new file mode 100644 index 000000000000..468e135fa285 --- /dev/null +++ b/arch/x86/include/uapi/asm/elf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_X86_ELF_H +#define _UAPI_ASM_X86_ELF_H + +#include <linux/types.h> + +struct x86_xfeat_component { + __u32 type; + __u32 size; + __u32 offset; + __u32 flags; +} __packed; + +_Static_assert(sizeof(struct x86_xfeat_component) % 4 == 0, "x86_xfeat_component is not aligned"); + +#endif /* _UAPI_ASM_X86_ELF_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 1a6a1f987949..a8debbf2f702 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include <linux/const.h> +#include <linux/bits.h> #include <linux/types.h> #include <linux/ioctl.h> #include <linux/stddef.h> @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 +50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. 
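The new uapi x86_xfeat_component record is a flat, 4-byte-aligned descriptor, so a consumer can treat a dump of them as a plain array. A hedged sketch of walking such a buffer; where the buffer comes from (for example an ELF note emitted by the kernel) is an assumption and not shown by this diff:

#include <stdio.h>
#include <stddef.h>

static void dump_xfeat_components(const struct x86_xfeat_component *c, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++)
		printf("xfeature type %u: %u bytes at offset %u, flags 0x%x\n",
		       c[i].type, c[i].size, c[i].offset, c[i].flags);
}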
*/ #define KVM_NR_INTERRUPTS 256 @@ -106,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -438,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) +#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -457,8 +459,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -526,9 +533,359 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. 
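With the evtchn attribute above, an event sent by the guest can be turned into an eventfd notification by leaving deliver.eventfd.port at zero. A sketch of setting that up from a VMM; the KVM_XEN_HVM_SET_ATTR vm ioctl is assumed rather than shown here, and evtchn_type takes an EVTCHNSTAT_* value from Xen's public headers:

static int route_evtchn_to_eventfd(int vm_fd, __u32 guest_port, __u32 evtchn_type, int efd)
{
	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
		.u.evtchn = {
			.send_port = guest_port,
			.type = evtchn_type,
			.flags = 0,		/* or KVM_XEN_EVTCHN_UPDATE to modify an existing route */
			.deliver.eventfd = {
				.port = 0,	/* zero selects eventfd delivery */
				.fd = efd,
			},
		},
	};

	return ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr);
}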
+ */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + /* Second time is the charm; improved versions of the above ioctls. 
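The vendor-specific system-fd group makes the supported VMSA feature mask discoverable before a VM is created. A sketch of querying it through the generic device-attribute interface; KVM_GET_DEVICE_ATTR on the /dev/kvm fd and struct kvm_device_attr are assumptions from the existing KVM uapi, not part of this diff:

static int get_sev_vmsa_features(int kvm_fd, __u64 *features)
{
	struct kvm_device_attr attr = {
		.group = KVM_X86_GRP_SEV,
		.attr  = KVM_X86_SEV_VMSA_FEATURES,
		.addr  = (__u64)(unsigned long)features,
	};

	return ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
}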
*/ + KVM_SEV_INIT2, + + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. 
* Bits Description @@ -549,10 +906,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,6 +917,12 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. */ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) + +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6e64b27b2c1e..a1efa7907a0b 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -92,7 +92,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) /* MSR_KVM_ASYNC_PF_INT */ -#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0) /* MSR_KVM_MIGRATION_CONTROL */ #define KVM_MIGRATION_READY (1 << 0) @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data { __u32 token; __u8 pad[56]; - __u32 enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 775dbd3aff73..46cdc941f958 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,14 +3,10 @@ #define _ASM_X86_MMAN_H #define MAP_32BIT 0x40 /* only give out 32bit addresses */ +#define MAP_ABOVE4G 0x80 /* only map above 4GB */ -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -#define arch_calc_vm_prot_bits(prot, key) ( \ - ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ - ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ - ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ - ((key) & 0x8 ? 
VM_PKEY_BIT3 : 0)) -#endif +/* Flags for map_shadow_stack(2) */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ #include <asm-generic/mman.h> diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index e8d7ebbca1a4..384e2cc6ac19 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -23,9 +23,21 @@ #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +/* Don't use 0x3001-0x3004 because of old glibcs */ + #define ARCH_GET_UNTAG_MASK 0x4001 #define ARCH_ENABLE_TAGGED_ADDR 0x4002 #define ARCH_GET_MAX_TAG_BITS 0x4003 #define ARCH_FORCE_TAGGED_SVA 0x4004 +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index d898432947ff..f1a4adc78272 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -139,6 +139,13 @@ #define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */ #define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) +#ifdef __x86_64__ +#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */ +#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT) +#else +#define X86_CR4_FRED (0) +#endif + /* * x86-64 Task Priority Register, CR8 */ diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h new file mode 100644 index 000000000000..b111b0c18544 --- /dev/null +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_X86_SETUP_DATA_H +#define _UAPI_ASM_X86_SETUP_DATA_H + +/* setup_data/setup_indirect types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 +#define SETUP_DTB 2 +#define SETUP_PCI 3 +#define SETUP_EFI 4 +#define SETUP_APPLE_PROPERTIES 5 +#define SETUP_JAILHOUSE 6 +#define SETUP_CC_BLOB 7 +#define SETUP_IMA 8 +#define SETUP_RNG_SEED 9 +#define SETUP_ENUM_MAX SETUP_RNG_SEED + +#define SETUP_INDIRECT (1<<31) +#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> + +/* extensible setup data list node */ +struct setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[]; +}; + +/* extensible setup indirect data node */ +struct setup_indirect { + __u32 type; + __u32 reserved; /* Reserved, must be set to zero. */ + __u64 len; + __u64 addr; +}; + +/* + * The E820 memory region entry of the boot protocol ABI: + */ +struct boot_e820_entry { + __u64 addr; + __u64 size; + __u32 type; +} __attribute__((packed)); + +/* + * The boot loader is passing platform information via this Jailhouse-specific + * setup data structure. 
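A setup_data node whose type has the SETUP_INDIRECT bit set carries a struct setup_indirect as its payload, with the real type and the real data's address and length inside it; SETUP_TYPE_MAX above implies the bit is OR'd into the enum value, which is the assumption this sketch leans on. A consumer might classify a node like this (illustrative only, and it ignores the physical-to-virtual mapping of ind->addr):

static __u32 setup_data_effective_type(const struct setup_data *sd)
{
	if (sd->type & SETUP_INDIRECT) {
		const struct setup_indirect *ind = (const void *)sd->data;

		return ind->type;	/* payload is ind->len bytes at ind->addr */
	}

	return sd->type;
}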
+ */ +struct jailhouse_setup_data { + struct { + __u16 version; + __u16 compatible_version; + } __attribute__((packed)) hdr; + struct { + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; + } __attribute__((packed)) v1; + struct { + __u32 flags; + } __attribute__((packed)) v2; +} __attribute__((packed)); + +/* + * IMA buffer setup data information from the previous kernel during kexec + */ +struct ima_setup_data { + __u64 addr; + __u64 size; +} __attribute__((packed)); + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index 777c3a0f4e23..f777346450ec 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -4,7 +4,6 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> -#include <linux/time.h> #include <linux/compiler.h> /* Avoid too many header ordering problems. */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 80e1df482337..1814b413fd57 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -115,6 +115,7 @@ #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ |
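The new ARCH_SHSTK_* codes in the prctl.h hunk above are driven through arch_prctl(). A userspace sketch of enabling the shadow stack and reading back what was actually turned on; going through syscall(2) is an assumption, since libc may not wrap arch_prctl for these codes:

#include <sys/syscall.h>
#include <unistd.h>

static int shstk_enable(void)
{
	unsigned long features = 0;

	if (syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK))
		return -1;

	/* ARCH_SHSTK_STATUS writes the currently enabled feature bits to *addr */
	if (syscall(SYS_arch_prctl, ARCH_SHSTK_STATUS, &features))
		return -1;

	return (features & ARCH_SHSTK_SHSTK) ? 0 : -1;
}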