diff options
Diffstat (limited to 'arch/x86/kernel')
40 files changed, 948 insertions, 688 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4efecac49863..3a44a9dc3fb7 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1171,7 +1171,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, - acpi_parse_int_src_ovr, nr_irqs); + acpi_parse_int_src_ovr, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1191,7 +1192,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) mp_config_acpi_legacy_irqs(); count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, - acpi_parse_nmi_src, nr_irqs); + acpi_parse_nmi_src, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 956984054bf3..d745dd586303 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -110,7 +110,7 @@ static void amd_set_max_freq_ratio(void) static DEFINE_MUTEX(freq_invariance_lock); -void init_freq_invariance_cppc(void) +static inline void init_freq_invariance_cppc(void) { static bool init_done; @@ -127,6 +127,11 @@ void init_freq_invariance_cppc(void) mutex_unlock(&freq_invariance_lock); } +void acpi_processor_init_invariance_cppc(void) +{ + init_freq_invariance_cppc(); +} + /* * Get the highest performance register value. * @cpu: CPU from which to get highest performance. @@ -234,8 +239,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore); */ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) { + enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); bool prefcore; int ret; + u32 tmp; ret = amd_detect_prefcore(&prefcore); if (ret) @@ -261,6 +268,27 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) break; } } + + /* detect if running on heterogeneous design */ + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { + switch (core_type) { + case TOPO_CPU_TYPE_UNKNOWN: + pr_warn("Undefined core type found for cpu %d\n", cpu); + break; + case TOPO_CPU_TYPE_PERFORMANCE: + /* use the max scale for performance cores */ + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + case TOPO_CPU_TYPE_EFFICIENCY: + /* use the highest perf value for efficiency cores */ + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + *numerator = tmp; + return 0; + } + } + *numerator = CPPC_HIGHEST_PERF_PREFCORE; return 0; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 94ff83f3d3fe..b200a193beeb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -87,6 +87,7 @@ SYM_FUNC_START(do_suspend_lowlevel) .align 4 .Lresume_point: + ANNOTATE_NOENDBR /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index dc5d3216af24..9fe9972d2071 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -44,6 +44,7 @@ #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 +#define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4 0x16fc #define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 @@ -127,6 +128,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6513c53c9459..c5fb28e6451a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -440,7 +440,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); + + /* + * Setting APIC_LVT_MASKED (above) should be enough to tell + * the hardware that this timer will never fire. But AMD + * erratum 411 and some Intel CPU behavior circa 2024 say + * otherwise. Time for belt and suspenders programming: mask + * the timer _and_ zero the counter registers: + */ + if (v & APIC_LVT_TIMER_TSCDEADLINE) + wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + else + apic_write(APIC_TMICT, 0); + return 0; } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 557318145038..736f62812f5c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -712,8 +712,8 @@ int __init arch_probe_nr_irqs(void) { int nr; - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; + if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids) + irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids); nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) @@ -725,8 +725,8 @@ int __init arch_probe_nr_irqs(void) else nr += gsi_top * 16; #endif - if (nr < nr_irqs) - nr_irqs = nr; + if (nr < irq_get_nr_irqs()) + irq_set_nr_irqs(nr); /* * We don't know if PIC is present at this point so we need to do diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5857a0f5d514..4efdf5c2efc8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -59,6 +59,8 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o +obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 015971adadfc..823f44f7bc94 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -924,6 +924,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) { if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* + * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE + * in some BIOS versions but they can lead to random host reboots. + */ + switch (c->x86_model) { + case 0x18 ... 0x1f: + case 0x60 ... 0x7f: + clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD); + break; + } } static void init_amd_zen5(struct cpuinfo_x86 *c) @@ -1202,5 +1213,6 @@ void amd_check_microcode(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return; - on_each_cpu(zenbleed_check_cpu, NULL, 1); + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) + on_each_cpu(zenbleed_check_cpu, NULL, 1); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d1915427b4ff..47a01d4028f6 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,8 +1115,25 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like SRSO has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; case RETBLEED_MITIGATION_STUFF: @@ -2627,6 +2644,14 @@ static void __init srso_select_mitigation(void) if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); @@ -2638,6 +2663,13 @@ static void __init srso_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c new file mode 100644 index 000000000000..704e9241b964 --- /dev/null +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +#include <linux/semaphore.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/cpuhotplug.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> +#include <asm/traps.h> +#include <asm/cpu.h> + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + + cpu = get_cpu(); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07a34d723505..02637365d1a9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -69,6 +69,7 @@ #include <asm/sev.h> #include <asm/tdx.h> #include <asm/posted_intr.h> +#include <asm/runtime-const.h> #include "cpu.h" @@ -1443,6 +1444,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); + if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) + setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1837,6 +1841,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_init) this_cpu->c_init(c); + bus_lock_init(); + /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1902,9 +1908,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); -#endif } /* @@ -2085,8 +2089,10 @@ void syscall_init(void) #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +#ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif +#endif #endif /* CONFIG_X86_64 */ @@ -2386,6 +2392,15 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { + unsigned long USER_PTR_MAX = TASK_SIZE_MAX-1; + + /* + * Enable this when LAM is gated on LASS support + if (cpu_feature_enabled(X86_FEATURE_LAM)) + USER_PTR_MAX = (1ul << 63) - PAGE_SIZE - 1; + */ + runtime_const_init(ptr, USER_PTR_MAX); + /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 3baf3e435834..10719aba6276 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -22,6 +22,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index e7656cbef68d..d1de300af173 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,13 +7,9 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> -#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -24,8 +20,6 @@ #include <asm/hwcap2.h> #include <asm/elf.h> #include <asm/cpu_device_id.h> -#include <asm/cmdline.h> -#include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> #include <asm/thermal.h> @@ -41,28 +35,6 @@ #include <asm/apic.h> #endif -enum split_lock_detect_state { - sld_off = 0, - sld_warn, - sld_fatal, - sld_ratelimit, -}; - -/* - * Default to sld_off because most systems do not support split lock detection. - * sld_state_setup() will switch this to sld_warn on systems that support - * split lock/bus lock detect, unless there is a command line override. - */ -static enum split_lock_detect_state sld_state __ro_after_init = sld_off; -static u64 msr_test_ctrl_cache __ro_after_init; - -/* - * With a name like MSR_TEST_CTL it should go without saying, but don't touch - * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it - * on CPUs that do not support SLD can cause fireworks, even when writing '0'. - */ -static bool cpu_model_supports_sld __ro_after_init; - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -549,9 +521,6 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } -static void split_lock_init(void); -static void bus_lock_init(void); - static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -643,7 +612,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); split_lock_init(); - bus_lock_init(); intel_init_thermal(c); } @@ -909,381 +877,6 @@ static const struct cpu_dev intel_cpu_dev = { cpu_dev_register(intel_cpu_dev); -#undef pr_fmt -#define pr_fmt(fmt) "x86/split lock detection: " fmt - -static const struct { - const char *option; - enum split_lock_detect_state state; -} sld_options[] __initconst = { - { "off", sld_off }, - { "warn", sld_warn }, - { "fatal", sld_fatal }, - { "ratelimit:", sld_ratelimit }, -}; - -static struct ratelimit_state bld_ratelimit; - -static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem, 1); - -#ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { - { - .procname = "split_lock_mitigate", - .data = &sysctl_sld_mitigate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init sld_mitigate_sysctl_init(void) -{ - register_sysctl_init("kernel", sld_sysctls); - return 0; -} - -late_initcall(sld_mitigate_sysctl_init); -#endif - -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt), ratelimit; - - if (strncmp(arg, opt, len)) - return false; - - /* - * Min ratelimit is 1 bus lock/sec. - * Max ratelimit is 1000 bus locks/sec. - */ - if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && - ratelimit > 0 && ratelimit <= 1000) { - ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); - ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); - return true; - } - - return len == arglen; -} - -static bool split_lock_verify_msr(bool on) -{ - u64 ctrl, tmp; - - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) - return false; - if (on) - ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) - return false; - rdmsrl(MSR_TEST_CTRL, tmp); - return ctrl == tmp; -} - -static void __init sld_state_setup(void) -{ - enum split_lock_detect_state state = sld_warn; - char arg[20]; - int i, ret; - - if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - ret = cmdline_find_option(boot_command_line, "split_lock_detect", - arg, sizeof(arg)); - if (ret >= 0) { - for (i = 0; i < ARRAY_SIZE(sld_options); i++) { - if (match_option(arg, ret, sld_options[i].option)) { - state = sld_options[i].state; - break; - } - } - } - sld_state = state; -} - -static void __init __split_lock_setup(void) -{ - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - if (!split_lock_verify_msr(true)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); -} - -/* - * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking - * is not implemented as one thread could undo the setting of the other - * thread immediately after dropping the lock anyway. - */ -static void sld_update_msr(bool on) -{ - u64 test_ctrl_val = msr_test_ctrl_cache; - - if (on) - test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); -} - -static void split_lock_init(void) -{ - /* - * #DB for bus lock handles ratelimit and #AC for split lock is - * disabled. - */ - if (sld_state == sld_ratelimit) { - split_lock_verify_msr(false); - return; - } - - if (cpu_model_supports_sld) - split_lock_verify_msr(sld_state != sld_off); -} - -static void __split_lock_reenable_unlock(struct work_struct *work) -{ - sld_update_msr(true); - up(&buslock_sem); -} - -static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); - -static void __split_lock_reenable(struct work_struct *work) -{ - sld_update_msr(true); -} -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); - -/* - * If a CPU goes offline with pending delayed work to re-enable split lock - * detection then the delayed work will be executed on some other CPU. That - * handles releasing the buslock_sem, but because it executes on a - * different CPU probably won't re-enable split lock detection. This is a - * problem on HT systems since the sibling CPU on the same core may then be - * left running with split lock detection disabled. - * - * Unconditionally re-enable detection here. - */ -static int splitlock_cpu_offline(unsigned int cpu) -{ - sld_update_msr(true); - - return 0; -} - -static void split_lock_warn(unsigned long ip) -{ - struct delayed_work *work; - int cpu; - - if (!current->reported_split_lock) - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); - current->reported_split_lock = 1; - - if (sysctl_sld_mitigate) { - /* - * misery factor #1: - * sleep 10ms before trying to execute split lock. - */ - if (msleep_interruptible(10) > 0) - return; - /* - * Misery factor #2: - * only allow one buslocked disabled core at a time. - */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; - } - - cpu = get_cpu(); - schedule_delayed_work_on(cpu, work, 2); - - /* Disable split lock detection on this CPU to make progress */ - sld_update_msr(false); - put_cpu(); -} - -bool handle_guest_split_lock(unsigned long ip) -{ - if (sld_state == sld_warn) { - split_lock_warn(ip); - return true; - } - - pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", - current->comm, current->pid, - sld_state == sld_fatal ? "fatal" : "bogus", ip); - - current->thread.error_code = 0; - current->thread.trap_nr = X86_TRAP_AC; - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - return false; -} -EXPORT_SYMBOL_GPL(handle_guest_split_lock); - -static void bus_lock_init(void) -{ - u64 val; - - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - - if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) { - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; - } else { - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; - } - - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); -} - -bool handle_user_split_lock(struct pt_regs *regs, long error_code) -{ - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - split_lock_warn(regs->ip); - return true; -} - -void handle_bus_lock(struct pt_regs *regs) -{ - switch (sld_state) { - case sld_off: - break; - case sld_ratelimit: - /* Enforce no more than bld_ratelimit bus locks/sec. */ - while (!__ratelimit(&bld_ratelimit)) - msleep(20); - /* Warn on the bus lock. */ - fallthrough; - case sld_warn: - pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); - break; - case sld_fatal: - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - break; - } -} - -/* - * CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - */ -static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_ICELAKE_L, 0), - X86_MATCH_VFM(INTEL_ICELAKE_D, 0), - {} -}; - -static void __init split_lock_setup(struct cpuinfo_x86 *c) -{ - const struct x86_cpu_id *m; - u64 ia32_core_caps; - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; - - /* Check for CPUs that have support but do not enumerate it: */ - m = x86_match_cpu(split_lock_cpu_ids); - if (m) - goto supported; - - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - - /* - * Not all bits in MSR_IA32_CORE_CAPS are architectural, but - * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set - * it have split lock detection. - */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - goto supported; - - /* CPU is not in the model list and does not have the MSR bit: */ - return; - -supported: - cpu_model_supports_sld = true; - __split_lock_setup(); -} - -static void sld_state_show(void) -{ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) - return; - - switch (sld_state) { - case sld_off: - pr_info("disabled\n"); - break; - case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "x86/splitlock", NULL, splitlock_cpu_offline) < 0) - pr_warn("No splitlock CPU offline handler\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: warning on user-space bus_locks\n"); - } - break; - case sld_fatal: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", - boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? - " from non-WB" : ""); - } - break; - case sld_ratelimit: - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); - break; - } -} - -void __init sld_setup(struct cpuinfo_x86 *c) -{ - split_lock_setup(c); - sld_state_setup(); - sld_state_show(); -} - #define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 /** @@ -1299,3 +892,18 @@ u8 get_this_hybrid_cpu_type(void) return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; } + +/** + * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU + * + * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. + * If the processor is not hybrid, returns 0. + */ +u32 get_this_hybrid_cpu_native_id(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) & + (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); +} diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 14bf8c232e45..6ca80fff1fea 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -778,29 +778,33 @@ bool amd_mce_usable_address(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_prep_record(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) { + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 3885fe05f01e..0a89947e47bc 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,31 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_prep_record(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu, num_regs; bool apicid_found = false; - unsigned int cpu; - struct mce m; + struct mce_hw_err err; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -86,16 +89,12 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) return -EINVAL; /* - * The register array size must be large enough to include all the - * SMCA registers which need to be extracted. - * * The number of registers in the register array is determined by * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. - * The register layout is fixed and currently the raw data in the - * register array includes 6 SMCA registers which the kernel can - * extract. + * Sanity-check registers array size. */ - if (ctx_info->reg_arr_size < 48) + num_regs = ctx_info->reg_arr_size >> 3; + if (!num_regs) return -EINVAL; for_each_possible_cpu(cpu) { @@ -108,18 +107,68 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) if (!apicid_found) return -EINVAL; - mce_prep_record_common(&m); - mce_prep_record_per_cpu(cpu, &m); + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); + + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); - /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + /* + * The SMCA register layout is fixed and includes 16 registers. + * The end of the array may be variable, but the beginning is known. + * Cap the number of registers to expected max (15). + */ + if (num_regs > 15) + num_regs = 15; + + switch (num_regs) { + /* MCA_SYND2 */ + case 15: + err.vendor.amd.synd2 = *(i_mce + 14); + fallthrough; + /* MCA_SYND1 */ + case 14: + err.vendor.amd.synd1 = *(i_mce + 13); + fallthrough; + /* MCA_MISC4 */ + case 13: + /* MCA_MISC3 */ + case 12: + /* MCA_MISC2 */ + case 11: + /* MCA_MISC1 */ + case 10: + /* MCA_DEADDR */ + case 9: + /* MCA_DESTAT */ + case 8: + /* reserved */ + case 7: + /* MCA_SYND */ + case 6: + m->synd = *(i_mce + 5); + fallthrough; + /* MCA_IPID */ + case 5: + m->ipid = *(i_mce + 4); + fallthrough; + /* MCA_CONFIG */ + case 4: + /* MCA_MISC0 */ + case 3: + m->misc = *(i_mce + 2); + fallthrough; + /* MCA_ADDR */ + case 2: + m->addr = *(i_mce + 1); + fallthrough; + /* MCA_STATUS */ + case 1: + m->status = *i_mce; + } - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2a938f429c4d..7fb5556a0b53 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -119,8 +119,6 @@ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); @@ -138,9 +136,12 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) m->socketid = topology_physical_package_id(cpu); } -/* Do initial initialization of a struct mce */ -void mce_prep_record(struct mce *m) +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) { + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); mce_prep_record_common(m); mce_prep_record_per_cpu(smp_processor_id(), m); } @@ -148,9 +149,9 @@ void mce_prep_record(struct mce *m) DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (!mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -171,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -199,6 +202,10 @@ static void __print_mce(struct mce *m) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -214,9 +221,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -251,7 +260,7 @@ static const char *mce_dump_aux_info(struct mce *m) return NULL; } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -282,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -303,12 +314,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); - memmsg = mce_dump_aux_info(final); + memmsg = mce_dump_aux_info(&final->m); if (memmsg) pr_emerg(HW_ERR "Machine check: %s\n", memmsg); @@ -323,9 +334,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { struct page *p; - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -445,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_prep_record(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -574,13 +587,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -624,13 +637,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -644,8 +657,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -667,8 +682,11 @@ static noinstr void mce_read_aux(struct mce *m, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } @@ -692,26 +710,28 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -721,17 +741,17 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * storm status. */ if (!mca_cfg.cmci_disabled) - mce_track_storm(&m); + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -741,20 +761,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. */ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -768,20 +788,20 @@ log_it: if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* @@ -905,9 +925,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -925,7 +946,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1016,10 +1037,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1027,11 +1049,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1043,7 +1067,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1060,11 +1084,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. */ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1268,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1319,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1329,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1399,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1414,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1467,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1506,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1521,15 +1551,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1540,12 +1570,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1560,7 +1590,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1572,8 +1602,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1591,16 +1621,16 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); - } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) { + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { /* * Saved RIP on stack makes it look like the machine check * was taken in the kernel on the instruction following @@ -1612,8 +1642,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * not occur there. Mark the page as poisoned so it won't * be added to free list when the guest is terminated. */ - if (mce_usable_address(&m)) { - struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT); + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); @@ -1628,13 +1658,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index af44fd5dbd7c..8d023239ce18 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: return put_user(mcelog->len, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog->flags; - } while (cmpxchg(&mcelog->flags, flags, 0) != flags); - - return put_user(flags, p); - } + case MCE_GETCLEAR_FLAGS: + return put_user(xchg(&mcelog->flags, 0), p); default: return -ENOTTY; } diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 4284749ec803..d0be6dda0c14 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -31,15 +31,15 @@ static LLIST_HEAD(mce_event_llist); */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,11 +94,11 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +int mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) + if (filter_mce(&err->m)) return -EINVAL; if (!mce_evt_pool) @@ -110,7 +110,7 @@ int mce_gen_pool_add(struct mce *mce) return -ENOMEM; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); return 0; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 49ed3428785d..313fe682db33 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -502,8 +502,9 @@ static void prepare_msrs(void *info) static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -517,7 +518,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f6103e6bf69a..b3cd2c61b11d 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -94,7 +94,7 @@ static int cmci_supported(int *banks) if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43c7f3b71df5..84f810598231 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,12 +26,12 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_add(struct mce_hw_err *err); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f63b051f25a0..31a73715d755 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -584,7 +584,7 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_ native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size); static int __init save_microcode_in_initrd(void) { @@ -605,7 +605,7 @@ static int __init save_microcode_in_initrd(void) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; @@ -613,16 +613,19 @@ static int __init save_microcode_in_initrd(void) } early_initcall(save_microcode_in_initrd); -static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n) +static inline bool patch_cpus_equivalent(struct ucode_patch *p, + struct ucode_patch *n, + bool ignore_stepping) { /* Zen and newer hardcode the f/m/s in the patch ID */ if (x86_family(bsp_cpuid_1_eax) >= 0x17) { union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); - /* Zap stepping */ - p_cid.stepping = 0; - n_cid.stepping = 0; + if (ignore_stepping) { + p_cid.stepping = 0; + n_cid.stepping = 0; + } return p_cid.full == n_cid.full; } else { @@ -644,13 +647,13 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (patch_cpus_equivalent(p, &n)) + if (patch_cpus_equivalent(p, &n, false)) return p; return NULL; } -static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) +static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n) { /* Zen and newer hardcode the f/m/s in the patch ID */ if (x86_family(bsp_cpuid_1_eax) >= 0x17) { @@ -659,6 +662,9 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) zp.ucode_rev = p->patch_id; zn.ucode_rev = n->patch_id; + if (zn.stepping != zp.stepping) + return -1; + return zn.rev > zp.rev; } else { return n->patch_id > p->patch_id; @@ -668,10 +674,14 @@ static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n) static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; + int ret; list_for_each_entry(p, µcode_cache, plist) { - if (patch_cpus_equivalent(p, new_patch)) { - if (!patch_newer(p, new_patch)) { + if (patch_cpus_equivalent(p, new_patch, true)) { + ret = patch_newer(p, new_patch); + if (ret < 0) + continue; + else if (!ret) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -944,21 +954,30 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size) { - struct cpuinfo_x86 *c; - unsigned int nid, cpu; - struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - if (ret != UCODE_OK) { + if (ret != UCODE_OK) cleanup(); + + return ret; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + ret = _load_microcode_amd(family, data, size); + if (ret != UCODE_OK) return ret; - } for_each_node(nid) { cpu = cpumask_first(cpumask_of_node(nid)); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 815fa67356a2..f3d534807d91 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -319,12 +319,6 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, return UCODE_OK; } - /* - * Writeback and invalidate caches before updating microcode to avoid - * internal issues depending on what the microcode is updating. - */ - native_wbinvd(); - /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -574,14 +568,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 * and LLC size per core bigger than 2.5MB may result in a system hang. - * This behavior is documented in item BDF90, #334165 (Intel Xeon + * This behavior is documented in item BDX90, #334165 (Intel Xeon * Processor E7-8800/4800 v4 Product Family). */ if (c->x86_vfm == INTEL_BROADWELL_X && c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { - pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e65fae63660e..41ed01f46bd9 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), + str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), + str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 8591d53c144b..b681c2e07dbf 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -207,7 +207,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) return false; } -static bool __get_mem_config_intel(struct rdt_resource *r) +static __init bool __get_mem_config_intel(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; @@ -241,7 +241,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) return true; } -static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); u32 eax, ebx, ecx, edx, subleaf; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 50fa1fe9a073..200d89a64027 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -29,10 +29,10 @@ * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) { - unsigned long bw; int ret; + u32 bw; /* * Only linear delay values is supported for current Intel SKUs. @@ -42,16 +42,21 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return false; } - ret = kstrtoul(buf, 10, &bw); + ret = kstrtou32(buf, 10, &bw); if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); + rdt_last_cmd_printf("Invalid MB value %s\n", buf); return false; } - if ((bw < r->membw.min_bw || bw > r->default_ctrl) && - !is_mba_sc(r)) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->default_ctrl); return false; } @@ -65,7 +70,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct resctrl_staged_config *cfg; u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; - unsigned long bw_val; + u32 bw_val; cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 851b561850e0..5fcb3d635d91 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -1158,11 +1158,12 @@ static __init int snc_get_config(void) ret = cpus_per_l3 / cpus_per_node; - /* sanity check: Only valid results are 1, 2, 3, 4 */ + /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ switch (ret) { case 1: break; case 2 ... 4: + case 6: pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; break; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d7163b764c62..d906a1cd8491 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1596,7 +1596,7 @@ static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { - struct mon_config_info mon_info = {0}; + struct mon_config_info mon_info; struct rdt_mon_domain *dom; bool sep = false; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c84c30188fdf..16f3ca30626a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,34 +24,36 @@ struct cpuid_bit { * levels are different and there is a separate entry for each. */ static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, - { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, - { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, - { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, - { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, - { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, - { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, - { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, - { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, - { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, - { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, - { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, - { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, - { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, - { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, - { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, - { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, - { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, - { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, + { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 9ace84486499..eb5848d1851a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -901,19 +901,15 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct fd f = fdget(attribute_fd); + CLASS(fd, f)(attribute_fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; - if (fd_file(f)->f_op != &sgx_provision_fops) { - fdput(f); + if (fd_file(f)->f_op != &sgx_provision_fops) return -EINVAL; - } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - - fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7d476fa697ca..03b3c9c3a45e 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -182,6 +182,9 @@ static void parse_topology_amd(struct topo_scan *tscan) if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) has_topoext = cpu_parse_topology_ext(tscan); + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + if (!has_topoext && !parse_8000_0008(tscan)) return; diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 9a6069e7133c..8277c64f88db 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -3,6 +3,7 @@ #include <xen/xen.h> +#include <asm/intel-family.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/smp.h> @@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, } } +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_INTEL) { + switch (c->topo.intel_type) { + case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; + case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; + } + } + if (c->x86_vendor == X86_VENDOR_AMD) { + switch (c->topo.amd_type) { + case 0: return TOPO_CPU_TYPE_PERFORMANCE; + case 1: return TOPO_CPU_TYPE_EFFICIENCY; + } + } + + return TOPO_CPU_TYPE_UNKNOWN; +} + +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) +{ + switch (get_topology_cpu_type(c)) { + case TOPO_CPU_TYPE_PERFORMANCE: + return "performance"; + case TOPO_CPU_TYPE_EFFICIENCY: + return "efficiency"; + default: + return "unknown"; + } +} + static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) { struct { @@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) .cu_id = 0xff, .llc_id = BAD_APICID, .l2c_id = BAD_APICID, + .cpu_type = TOPO_CPU_TYPE_UNKNOWN, }; struct cpuinfo_x86 *c = tscan->c; struct { @@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early) case X86_VENDOR_INTEL: if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) parse_legacy(tscan); + if (c->cpuid_level >= 0x1a) + c->topo.cpu_type = cpuid_eax(0x1a); break; case X86_VENDOR_HYGON: if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 16752b8dfa89..56163e2124cf 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -77,6 +77,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: + ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 263f8aed4e2c..21e9e4845354 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -37,6 +37,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> #include <asm/ptrace.h> @@ -980,6 +981,9 @@ static void __init kvm_init_platform(void) } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; + + /* Set WB as the default cache mode for SEV-SNP and TDX */ + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0e0a4cf6b5eb..615922838c51 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -530,7 +530,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -600,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void) } #else static void emergency_reboot_disable_virtualization(void) { } -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ void __attribute__((weak)) mach_reboot_fixups(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 766f092dab80..b5a8f0891135 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -497,8 +497,9 @@ static int x86_cluster_flags(void) static int x86_die_flags(void) { - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return x86_sched_itmt_flags(); + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU) || + cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + return x86_sched_itmt_flags(); return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d05392db5d0f..2dbadf347b5f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs) int ud_type; u32 imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). - */ - kmsan_unpoison_entry_regs(regs); ud_type = decode_bug(regs->ip, &imm); if (ud_type == BUG_NONE) return handled; @@ -276,6 +270,12 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index dfe6847fd99e..67aeaba4ba9c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -174,10 +174,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n = per_cpu_ptr(&cyc2ns, cpu); - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch(&c2n->seq); c2n->data[1] = data; + write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index d00c28aaa5be..d4705a348a80 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -723,7 +723,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); - state->signal = (void *)state->ip == ret_from_fork; + state->signal = (void *)state->ip == ret_from_fork_asm; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 6726be89b7a6..68efd8cd8bf1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -193,29 +193,6 @@ SECTIONS ORC_UNWIND_TABLE - . = ALIGN(PAGE_SIZE); - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - /* work around gold bug 13023 */ - __vvar_beginning_hack = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = __vvar_beginning_hack + offset; \ - *(.vvar_ ## name) -#include <asm/vvar.h> -#undef EMIT_VVAR - - /* - * Pad the rest of the page with zeros. Otherwise the loader - * can leave garbage here. - */ - . = __vvar_beginning_hack + PAGE_SIZE; - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { @@ -358,6 +335,7 @@ SECTIONS #endif RUNTIME_CONST_VARIABLES + RUNTIME_CONST(ptr, USER_PTR_MAX) . = ALIGN(PAGE_SIZE); @@ -490,6 +468,9 @@ SECTIONS . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +/* needed for Clang - see arch/x86/entry/entry.S */ +PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); + #ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -527,3 +508,22 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ + +/* + * The symbols below are referenced using relative relocations in the + * respective ELF notes. This produces build time constants that the + * linker will never mark as relocatable. (Using just ABSOLUTE() is not + * sufficient for that). + */ +#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV +xen_elfnote_entry_value = + ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); +#endif +xen_elfnote_hypercall_page_value = + ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page); +#endif +#ifdef CONFIG_PVH +xen_elfnote_phys32_entry_value = + ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); +#endif |