summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c119
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c115
-rw-r--r--arch/x86/kernel/cpu/bugs.c363
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c431
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c72
-rw-r--r--arch/x86/kernel/cpu/common.c540
-rw-r--r--arch/x86/kernel/cpu/cpu.h10
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c40
-rw-r--r--arch/x86/kernel/cpu/cyrix.c4
-rw-r--r--arch/x86/kernel/cpu/debugfs.c6
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c2
-rw-r--r--arch/x86/kernel/cpu/hygon.c16
-rw-r--r--arch/x86/kernel/cpu/intel.c978
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c12
-rw-r--r--arch/x86/kernel/cpu/intel_pconfig.c84
-rw-r--r--arch/x86/kernel/cpu/match.c69
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c167
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c119
-rw-r--r--arch/x86/kernel/cpu/mce/core.c518
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c14
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c75
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c16
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c32
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h8
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c38
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c523
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c444
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c17
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h2
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c132
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c23
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c10
-rw-r--r--arch/x86/kernel/cpu/proc.c17
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile5
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c487
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c295
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h306
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c462
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c152
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c718
-rw-r--r--arch/x86/kernel/cpu/resctrl/trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h)24
-rw-r--r--arch/x86/kernel/cpu/scattered.c55
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c12
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c7
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c49
-rw-r--r--arch/x86/kernel/cpu/topology.c59
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c40
-rw-r--r--arch/x86/kernel/cpu/topology_common.c35
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c15
-rw-r--r--arch/x86/kernel/cpu/vmware.c229
55 files changed, 4884 insertions, 3104 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index eb4dbcdf41f1..4efdf5c2efc8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o tsx.o
+obj-y += intel.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
@@ -59,8 +59,10 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
obj-$(CONFIG_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
vmxfeature = $(src)/../../include/asm/vmxfeatures.h
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index cb9eece55904..2b36379ff675 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -13,6 +13,7 @@
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -28,6 +29,8 @@
#include "cpu.h"
+u16 invlpgb_count_max __ro_after_init;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -354,10 +357,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
/*
* RMP table entry format is not architectural and is defined by the
* per-processor PPR. Restrict SNP support on the known CPU models
- * for which the RMP table entry format is currently defined for.
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
*/
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
- c->x86 >= 0x19 && snp_probe_rmptable_info()) {
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
cc_platform_set(CC_ATTR_HOST_SEV_SNP);
} else {
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
@@ -459,10 +467,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
case 0x1a:
switch (c->x86_model) {
- case 0x00 ... 0x0f:
- case 0x20 ... 0x2f:
+ case 0x00 ... 0x2f:
case 0x40 ... 0x4f:
- case 0x70 ... 0x7f:
+ case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
default:
@@ -627,7 +634,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
if (!rdmsrl_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
@@ -795,6 +802,12 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
+};
+
static void fix_erratum_1386(struct cpuinfo_x86 *c)
{
/*
@@ -804,7 +817,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
*
* Affected parts all have no supervisor XSAVE states, meaning that
* the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
*/
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
+
clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
@@ -850,6 +869,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c)
pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
setup_force_cpu_bug(X86_BUG_DIV0);
+
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
+ }
}
static bool cpu_has_zenbleed_microcode(void)
@@ -913,6 +942,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
}
static void init_amd_zen5(struct cpuinfo_x86 *c)
@@ -1022,13 +1062,8 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
- /*
- * Turn on the Instructions Retired free counter on machines not
- * susceptible to erratum #1054 "Instructions Retired Performance
- * Counter May Be Inaccurate".
- */
- if (cpu_has(c, X86_FEATURE_IRPERF) &&
- (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1042,10 +1077,14 @@ static void init_amd(struct cpuinfo_x86 *c)
*/
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
cpu_has(c, X86_FEATURE_AUTOIBRS))
- WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
/* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
@@ -1078,8 +1117,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
@@ -1092,26 +1131,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
- tlb_lli_2m[ENTRIES] = 1024;
+ tlb_lli_2m = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
}
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {
@@ -1179,22 +1222,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
@@ -1207,16 +1234,6 @@ void amd_check_microcode(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return;
- on_each_cpu(zenbleed_check_cpu, NULL, 1);
-}
-
-/*
- * Issue a DIV 0/1 insn to clear any division data from previous DIV
- * operations.
- */
-void noinstr amd_clear_divider(void)
-{
- asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
- :: "a" (0), "d" (0), "r" (1));
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-EXPORT_SYMBOL_GPL(amd_clear_divider);
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448f..6cf31a1649c4 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
@@ -307,7 +306,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -324,8 +323,10 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
@@ -346,10 +347,91 @@ static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- u64 freq_scale;
+ u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
@@ -411,7 +498,7 @@ void arch_scale_freq_tick(void)
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
-unsigned int arch_freq_get_on_cpu(int cpu)
+int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ab18185894df..362602b705cc 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,7 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
@@ -59,7 +59,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
-EXPORT_SYMBOL_GPL(x86_pred_cmd);
static u64 __ro_after_init x86_arch_cap_msr;
@@ -113,6 +112,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -233,7 +236,8 @@ static void x86_amd_ssb_disable(void)
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
-static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -242,6 +246,40 @@ static const char * const mds_strings[] = {
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -249,6 +287,9 @@ static void __init mds_select_mitigation(void)
return;
}
+ if (mds_mitigation == MDS_MITIGATION_AUTO)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
@@ -285,15 +326,6 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
-/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -384,14 +416,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
-enum mmio_mitigations {
- MMIO_MITIGATION_OFF,
- MMIO_MITIGATION_UCODE_NEEDED,
- MMIO_MITIGATION_VERW,
-};
-
-/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -480,16 +504,6 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Register File Data Sampling: " fmt
-enum rfds_mitigations {
- RFDS_MITIGATION_OFF,
- RFDS_MITIGATION_VERW,
- RFDS_MITIGATION_UCODE_NEEDED,
-};
-
-/* Default mitigation for Register File Data Sampling */
-static enum rfds_mitigations rfds_mitigation __ro_after_init =
- IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
-
static const char * const rfds_strings[] = {
[RFDS_MITIGATION_OFF] = "Vulnerable",
[RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
@@ -505,6 +519,9 @@ static void __init rfds_select_mitigation(void)
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
else
@@ -605,7 +622,8 @@ enum srbds_mitigations {
SRBDS_MITIGATION_HYPERVISOR,
};
-static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -731,11 +749,8 @@ enum gds_mitigations {
GDS_MITIGATION_HYPERVISOR,
};
-#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE)
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
-#else
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
-#endif
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -871,7 +886,8 @@ enum spectre_v1_mitigation {
};
static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
- SPECTRE_V1_MITIGATION_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
static const char * const spectre_v1_strings[] = {
[SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
@@ -986,7 +1002,7 @@ static const char * const retbleed_strings[] = {
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
RETBLEED_MITIGATION_NONE;
static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- RETBLEED_CMD_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
static int __ro_after_init retbleed_nosmt = false;
@@ -1115,6 +1131,22 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
break;
case RETBLEED_MITIGATION_STUFF:
@@ -1275,9 +1307,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
static enum spectre_v2_user_cmd __init
spectre_v2_parse_user_cmdline(void)
{
+ enum spectre_v2_user_cmd mode;
char arg[20];
int ret, i;
+ mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ?
+ SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return SPECTRE_V2_USER_CMD_NONE;
@@ -1290,7 +1326,7 @@ spectre_v2_parse_user_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_USER_CMD_AUTO;
+ return mode;
for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
if (match_option(arg, ret, v2_user_options[i].option)) {
@@ -1300,8 +1336,8 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_USER_CMD_AUTO;
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
+ return mode;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
@@ -1313,16 +1349,11 @@ static void __init
spectre_v2_user_select_mitigation(void)
{
enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- bool smt_possible = IS_ENABLED(CONFIG_SMP);
enum spectre_v2_user_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
- smt_possible = false;
-
cmd = spectre_v2_parse_user_cmdline();
switch (cmd) {
case SPECTRE_V2_USER_CMD_NONE:
@@ -1346,7 +1377,7 @@ spectre_v2_user_select_mitigation(void)
/* Initialize Indirect Branch Prediction Barrier */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ static_branch_enable(&switch_vcpu_ibpb);
spectre_v2_user_ibpb = mode;
switch (cmd) {
@@ -1383,7 +1414,7 @@ spectre_v2_user_select_mitigation(void)
* so allow for STIBP to be selected in those cases.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
- !smt_possible ||
+ !cpu_smt_possible() ||
(spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
!boot_cpu_has(X86_FEATURE_AUTOIBRS)))
return;
@@ -1447,17 +1478,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure)
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ enum spectre_v2_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
@@ -1467,8 +1499,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -1559,51 +1591,54 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
rrsba_disabled = true;
}
-static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
{
/*
- * Similar to context switches, there are two types of RSB attacks
- * after VM exit:
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
*
- * 1) RSB underflow
+ * Documentation/admin-guide/hw-vuln/rsb.rst
*
- * 2) Poisoned RSB entry
+ * In an overly simplified nutshell:
*
- * When retpoline is enabled, both are mitigated by filling/clearing
- * the RSB.
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
*
- * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
- * prediction isolation protections, RSB still needs to be cleared
- * because of #2. Note that SMEP provides no protection here, unlike
- * user-space-poisoned RSB entries.
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
*
- * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
- * bug is present then a LITE version of RSB protection is required,
- * just a single call needs to retire before a RET is executed.
+ * Though, depending on config, note that other alternative
+ * mitigations may end up getting used instead, e.g., IBPB on
+ * entry/vmexit, call depth tracking, or return thunks.
*/
+
switch (mode) {
case SPECTRE_V2_NONE:
- return;
+ break;
- case SPECTRE_V2_EIBRS_LFENCE:
case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
}
- return;
+ break;
- case SPECTRE_V2_EIBRS_RETPOLINE:
case SPECTRE_V2_RETPOLINE:
case SPECTRE_V2_LFENCE:
case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
- pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
- return;
- }
+ break;
- pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
- dump_stack();
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
@@ -1625,6 +1660,7 @@ static bool __init spec_ctrl_bhi_dis(void)
enum bhi_mitigations {
BHI_MITIGATION_OFF,
BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
};
static enum bhi_mitigations bhi_mitigation __ro_after_init =
@@ -1639,6 +1675,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
bhi_mitigation = BHI_MITIGATION_OFF;
else if (!strcmp(str, "on"))
bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
else
pr_err("Ignoring unknown spectre_bhi option (%s)", str);
@@ -1659,19 +1697,22 @@ static void __init bhi_select_mitigation(void)
return;
}
+ /* Mitigate in hardware if supported */
if (spec_ctrl_bhi_dis())
return;
if (!IS_ENABLED(CONFIG_X86_64))
return;
- /* Mitigate KVM by default */
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
- pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ return;
+ }
- /* Mitigate syscalls when the mitigation is forced =on */
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
- pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
}
static void __init spectre_v2_select_mitigation(void)
@@ -1791,48 +1832,7 @@ static void __init spectre_v2_select_mitigation(void)
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
- /*
- * If Spectre v2 protection has been enabled, fill the RSB during a
- * context switch. In general there are two types of RSB attacks
- * across context switches, for which the CALLs/RETs may be unbalanced.
- *
- * 1) RSB underflow
- *
- * Some Intel parts have "bottomless RSB". When the RSB is empty,
- * speculated return targets may come from the branch predictor,
- * which could have a user-poisoned BTB or BHB entry.
- *
- * AMD has it even worse: *all* returns are speculated from the BTB,
- * regardless of the state of the RSB.
- *
- * When IBRS or eIBRS is enabled, the "user -> kernel" attack
- * scenario is mitigated by the IBRS branch prediction isolation
- * properties, so the RSB buffer filling wouldn't be necessary to
- * protect against this type of attack.
- *
- * The "user -> user" attack scenario is mitigated by RSB filling.
- *
- * 2) Poisoned RSB entry
- *
- * If the 'next' in-kernel return stack is shorter than 'prev',
- * 'next' could be tricked into speculating with a user-poisoned RSB
- * entry.
- *
- * The "user -> kernel" attack scenario is mitigated by SMEP and
- * eIBRS.
- *
- * The "user -> user" scenario, also known as SpectreBHB, requires
- * RSB clearing.
- *
- * So to mitigate all cases, unconditionally fill RSB on context
- * switches.
- *
- * FIXME: Is this pointless for retbleed-affected AMD?
- */
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
- spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
+ spectre_v2_select_rsb_mitigation(mode);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
@@ -1948,6 +1948,7 @@ void cpu_bugs_smt_update(void)
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
@@ -1959,6 +1960,7 @@ void cpu_bugs_smt_update(void)
switch (taa_mitigation) {
case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
case TAA_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(TAA_MSG_SMT);
@@ -1970,6 +1972,7 @@ void cpu_bugs_smt_update(void)
switch (mmio_mitigation) {
case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
case MMIO_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(MMIO_MSG_SMT);
@@ -2015,10 +2018,12 @@ static const struct {
static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
{
- enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ enum ssb_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
@@ -2026,7 +2031,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
arg, sizeof(arg));
if (ret < 0)
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
if (!match_option(arg, ret, ssb_mitigation_options[i].option))
@@ -2037,8 +2042,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
}
@@ -2365,7 +2370,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
-enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2391,20 +2397,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -2494,6 +2500,7 @@ enum srso_mitigation {
SRSO_MITIGATION_SAFE_RET,
SRSO_MITIGATION_IBPB,
SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
};
enum srso_mitigation_cmd {
@@ -2511,7 +2518,8 @@ static const char * const srso_strings[] = {
[SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
[SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
[SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
- [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only"
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
};
static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE;
@@ -2545,13 +2553,12 @@ static void __init srso_select_mitigation(void)
{
bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
+ cpu_mitigations_off() ||
+ srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
- return;
+ goto out;
}
if (has_microcode) {
@@ -2563,7 +2570,7 @@ static void __init srso_select_mitigation(void)
*/
if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
- return;
+ goto out;
}
if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
@@ -2579,11 +2586,6 @@ static void __init srso_select_mitigation(void)
}
switch (srso_cmd) {
- case SRSO_CMD_OFF:
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
- return;
-
case SRSO_CMD_MICROCODE:
if (has_microcode) {
srso_mitigation = SRSO_MITIGATION_MICROCODE;
@@ -2592,6 +2594,9 @@ static void __init srso_select_mitigation(void)
break;
case SRSO_CMD_SAFE_RET:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
+ goto ibpb_on_vmexit;
+
if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
/*
* Enable the return thunk for generated code
@@ -2620,27 +2625,67 @@ static void __init srso_select_mitigation(void)
if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
}
break;
+ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
- if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
- if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
- pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
- }
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ }
+ break;
+ default:
break;
}
out:
- pr_info("%s\n", srso_strings[srso_mitigation]);
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation != SRSO_MITIGATION_NONE)
+ pr_info("%s\n", srso_strings[srso_mitigation]);
}
#undef pr_fmt
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..237faf7e700c
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrl(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (saved_sld_mitigate) {
+ /*
+ * misery factor #1:
+ * sleep 10ms before trying to execute split lock.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one buslocked disabled core at a time.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ }
+
+ cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 392d09c936d6..b3a520959b51 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -8,21 +8,19 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/slab.h>
#include <linux/cacheinfo.h>
+#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
-#include <linux/sched.h>
-#include <linux/capability.h>
-#include <linux/sysfs.h>
#include <linux/pci.h>
#include <linux/stop_machine.h>
+#include <linux/sysfs.h>
-#include <asm/cpufeature.h>
-#include <asm/cacheinfo.h>
#include <asm/amd_nb.h>
-#include <asm/smp.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
#include <asm/mtrr.h>
+#include <asm/smp.h>
#include <asm/tlbflush.h>
#include "cpu.h"
@@ -31,7 +29,6 @@
#define LVL_1_DATA 2
#define LVL_2 3
#define LVL_3 4
-#define LVL_TRACE 5
/* Shared last level cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
@@ -96,10 +93,6 @@ static const struct _cache_table cache_table[] =
{ 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
{ 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
{ 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -178,8 +171,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-static unsigned short num_cache_leaves;
-
/* AMD doesn't have CPUID4. Emulate it here to report the same
information to the user. This makes some assumptions about the machine:
L2 not shared, no SMT etc. that is currently true on AMD CPUs.
@@ -717,20 +708,23 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- num_cache_leaves = find_num_cache_leaves(c);
+ ci->num_leaves = find_num_cache_leaves(c);
} else if (c->extended_cpuid_level >= 0x80000006) {
if (cpuid_edx(0x80000006) & 0xf000)
- num_cache_leaves = 4;
+ ci->num_leaves = 4;
else
- num_cache_leaves = 3;
+ ci->num_leaves = 3;
}
}
void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
{
- num_cache_leaves = find_num_cache_leaves(c);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
}
void init_intel_cacheinfo(struct cpuinfo_x86 *c)
@@ -740,21 +734,21 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves(c);
- is_initialized++;
- }
+ /*
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been initialized.
+ */
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
/*
* Whenever possible use cpuid(4), deterministic cache
* parameters cpuid leaf to find the cache details
*/
- for (i = 0; i < num_cache_leaves; i++) {
+ for (i = 0; i < ci->num_leaves; i++) {
struct _cpuid4_info_regs this_leaf = {};
int retval;
@@ -786,19 +780,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
}
}
}
- /*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
- */
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
+
+ /* Don't use CPUID(2) if CPUID(4) is supported. */
+ if (!ci->num_leaves && c->cpuid_level > 1) {
/* supports eax=2 call */
int j, n;
unsigned int regs[4];
unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (num_cache_leaves != 0 && c->x86 == 15)
- only_trace = 1;
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
@@ -807,7 +795,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
@@ -819,8 +807,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
/* look up this descriptor in the table */
while (cache_table[k].descriptor != 0) {
if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
switch (cache_table[k].cache_type) {
case LVL_1_INST:
l1i += cache_table[k].size;
@@ -991,14 +977,12 @@ static void ci_leaf_init(struct cacheinfo *this_leaf,
int init_cache_level(unsigned int cpu)
{
- struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
- if (!num_cache_leaves)
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
return -ENOENT;
- if (!this_cpu_ci)
- return -EINVAL;
- this_cpu_ci->num_levels = 3;
- this_cpu_ci->num_leaves = num_cache_leaves;
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 605c26c009c8..12126adbc3a9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -29,6 +29,7 @@
#include <asm/alternative.h>
#include <asm/cmdline.h>
+#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -68,6 +69,8 @@
#include <asm/traps.h>
#include <asm/sev.h>
#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
@@ -114,17 +117,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
@@ -167,7 +170,7 @@ static void ppin_init(struct cpuinfo_x86 *c)
}
clear_ppin:
- clear_cpu_cap(c, info->feature);
+ setup_clear_cpu_cap(info->feature);
}
static void default_init(struct cpuinfo_x86 *c)
@@ -274,21 +277,13 @@ static int __init x86_noinvpcid_setup(char *s)
}
early_param("noinvpcid", x86_noinvpcid_setup);
-#ifdef CONFIG_X86_32
-static int cachesize_override = -1;
-static int disable_x86_serial_nr = 1;
-
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
-}
-__setup("cachesize=", cachesize_setup);
-
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -311,11 +306,22 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
}
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
/* Probe for the CPUID instruction */
-int have_cpuid_p(void)
+bool have_cpuid_p(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
@@ -347,10 +353,6 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
@@ -635,9 +637,9 @@ struct cpuid_dependent_feature {
static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
@@ -665,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
- x86_cap_flag(df->feature), df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -844,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-u16 __read_mostly tlb_lli_4k[NR_INFO];
-u16 __read_mostly tlb_lli_2m[NR_INFO];
-u16 __read_mostly tlb_lli_4m[NR_INFO];
-u16 __read_mostly tlb_lld_4k[NR_INFO];
-u16 __read_mostly tlb_lld_2m[NR_INFO];
-u16 __read_mostly tlb_lld_4m[NR_INFO];
-u16 __read_mostly tlb_lld_1g[NR_INFO];
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
@@ -858,15 +860,13 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
this_cpu->c_detect_tlb(c);
pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
- tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES]);
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
- tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
- tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
-static void get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -1053,18 +1053,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- bool vp_bits_from_cpuid = true;
if (!cpu_has(c, X86_FEATURE_CPUID) ||
- (c->extended_cpuid_level < 0x80000008))
- vp_bits_from_cpuid = false;
-
- if (vp_bits_from_cpuid) {
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- } else {
+ (c->extended_cpuid_level < 0x80000008)) {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1078,14 +1069,23 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
}
+
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -1106,7 +1106,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
-#endif
}
#define NO_SPECULATION BIT(0)
@@ -1125,8 +1124,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1143,32 +1142,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(TIGERLAKE, NO_MMIO),
- VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1178,9 +1177,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
@@ -1201,10 +1200,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1229,49 +1229,50 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x1a, SRSO),
{}
};
@@ -1331,8 +1332,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
!(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
@@ -1443,6 +1446,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
boot_cpu_has(X86_FEATURE_HYPERVISOR)))
setup_force_cpu_bug(X86_BUG_BHI);
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1476,66 +1482,38 @@ static void detect_nopl(void)
#endif
}
-/*
- * We parse cpu parameters early because fpu__init_system() is executed
- * before parse_early_param().
- */
-static void __init cpu_parse_early_param(void)
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
{
- char arg[128];
- char *argptr = arg, *opt;
- int arglen, taint = 0;
+ char *opt;
+ int taint = 0;
-#ifdef CONFIG_X86_32
- if (cmdline_find_option_bool(boot_command_line, "no387"))
-#ifdef CONFIG_MATH_EMULATION
- setup_clear_cpu_cap(X86_FEATURE_FPU);
-#else
- pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
-#endif
-
- if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
-#endif
-
- if (cmdline_find_option_bool(boot_command_line, "noxsave"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-
- if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
- if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
- setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
-
- arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
- if (arglen <= 0)
- return;
-
- pr_info("Clearing CPUID bits:");
-
- while (argptr) {
+ while (arg) {
bool found __maybe_unused = false;
unsigned int bit;
- opt = strsep(&argptr, ",");
+ opt = strsep(&arg, ",");
/*
* Handle naked numbers first for feature flags which don't
- * have names.
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
*/
if (!kstrtouint(opt, 10, &bit)) {
if (bit < NCAPINTS * 32) {
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
/* empty-string, i.e., ""-defined feature flags */
if (!x86_cap_flags[bit])
- pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
else
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ pr_cont(" %s\n", x86_cap_flags[bit]);
- setup_clear_cpu_cap(bit);
taint++;
}
/*
@@ -1545,27 +1523,97 @@ static void __init cpu_parse_early_param(void)
continue;
}
- for (bit = 0; bit < 32 * NCAPINTS; bit++) {
- if (!x86_cap_flag(bit))
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
continue;
- if (strcmp(x86_cap_flag(bit), opt))
+ if (strcmp(flag, opt))
continue;
- pr_cont(" %s", opt);
- setup_clear_cpu_cap(bit);
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
taint++;
found = true;
break;
}
if (!found)
- pr_cont(" (unknown: %s)", opt);
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
}
- pr_cont("\n");
- if (taint)
+ return taint;
+}
+
+
+/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ bool cpuid_taint = false;
+ char arg[128];
+ int arglen;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
+ /* Minimize the gap between FRED is available and available but disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
+
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
+
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
/*
@@ -1589,6 +1637,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (have_cpuid_p()) {
cpu_detect(c);
get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
get_cpu_address_sizes(c);
@@ -1601,6 +1650,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
@@ -1642,15 +1692,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
detect_nopl();
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
-#ifdef CONFIG_PROCESSOR_SELECT
- pr_info("KERNEL supported cpus:\n");
-#endif
-
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
@@ -1658,20 +1704,30 @@ void __init early_cpu_init(void)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
+
+void __init early_cpu_init(void)
+{
+#ifdef CONFIG_PROCESSOR_SELECT
+ unsigned int i, j;
+
+ pr_info("KERNEL supported cpus:\n");
+#endif
+
+ init_cpu_devs();
#ifdef CONFIG_PROCESSOR_SELECT
- {
- unsigned int j;
-
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- pr_info(" %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
- }
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devs[i]->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
}
-#endif
}
+#endif
+
early_identify_cpu(&boot_cpu_data);
}
@@ -1748,7 +1804,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
get_cpu_address_sizes(c);
@@ -1831,6 +1887,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
@@ -1853,6 +1911,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -1896,9 +1957,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
-#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
-#endif
}
/*
@@ -1947,9 +2006,15 @@ static __init void identify_boot_cpu(void)
lkgs_init();
}
-void identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
@@ -1960,6 +2025,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
update_gds_msr();
tsx_ap_init();
+ c->initialized = true;
}
void print_cpu_info(struct cpuinfo_x86 *c)
@@ -1990,27 +2056,40 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
- * function prevents it from becoming an environment variable for init.
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming an environment variable for
+ * init.
*/
+
static __init int setup_clearcpuid(char *arg)
{
return 1;
}
__setup("clearcpuid=", setup_clearcpuid);
-DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
- .current_task = &init_task,
- .preempt_count = INIT_PREEMPT_COUNT,
- .top_of_stack = TOP_OF_INIT_STACK,
-};
-EXPORT_PER_CPU_SYMBOL(pcpu_hot);
-EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
+static __init int setup_setcpuid(char *arg)
+{
+ return 1;
+}
+__setup("setcpuid=", setup_setcpuid);
+
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
#ifdef CONFIG_X86_64
-DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
- fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
-EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
+/*
+ * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
+ */
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
static void wrmsrl_cstar(unsigned long val)
{
@@ -2074,15 +2153,14 @@ void syscall_init(void)
if (!cpu_feature_enabled(X86_FEATURE_FRED))
idt_syscall_init();
}
-
-#else /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
-
-#endif /* CONFIG_X86_64 */
+#endif
/*
* Clear all 6 debug registers:
@@ -2170,7 +2248,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
-void cpu_init_exception_handling(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
@@ -2189,10 +2267,23 @@ void cpu_init_exception_handling(void)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
+ }
+}
+
+void __init cpu_init_replace_early_idt(void)
+{
if (cpu_feature_enabled(X86_FEATURE_FRED))
cpu_init_fred_exceptions();
else
- load_current_idt();
+ idt_setup_early_pf();
}
/*
@@ -2227,6 +2318,8 @@ void cpu_init(void)
barrier();
x2apic_setup();
+
+ intel_posted_msi_init();
}
mmgrab(&init_mm);
@@ -2365,6 +2458,15 @@ void __init arch_cpu_finalize_init(void)
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
+ unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
+
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index ea9e07d57c8d..51deb60a9d26 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,14 +33,6 @@ struct cpu_dev {
#endif
};
-struct _tlb_table {
- unsigned char descriptor;
- char tlb_type;
- unsigned int entries;
- /* unsigned int ways; */
- char info[128];
-};
-
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__section(".x86_cpu_dev.init") = \
@@ -61,9 +53,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
#else
static inline void tsx_init(void) { }
static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void init_spectral_chicken(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 946813d816bf..a2fbea0be535 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -45,6 +45,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
{ X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
{ X86_FEATURE_VAES, X86_FEATURE_AVX },
{ X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
@@ -83,7 +84,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
- { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS },
{}
};
@@ -114,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
@@ -144,3 +147,38 @@ void setup_clear_cpu_cap(unsigned int feature)
{
do_clear_cpu_cap(NULL, feature);
}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 9651275aecd1..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -153,8 +153,8 @@ static void geode_configure(void)
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
index 3baf3e435834..1976fef2dfe5 100644
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -16,14 +16,16 @@ static int cpu_debug_show(struct seq_file *m, void *p)
if (!c->initialized)
return 0;
- seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid);
- seq_printf(m, "apicid: %x\n", c->topo.apicid);
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
seq_printf(m, "die_id: %u\n", c->topo.die_id);
seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 1640ae76548f..4a4118784c13 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -188,7 +188,7 @@ update_caps:
update_sgx:
if (!(msr & FEAT_CTL_SGX_ENABLED)) {
if (enable_sgx_kvm || enable_sgx_driver)
- pr_err_once("SGX disabled by BIOS.\n");
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index c5191b06f9f2..6af4a4a90a52 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
}
static const struct cpu_dev hygon_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be30d7fa2e66..cdc9813871ef 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,68 +1,31 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/pgtable.h>
-#include <linux/string.h>
#include <linux/bitops.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/semaphore.h>
-#include <linux/thread_info.h>
#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/workqueue.h>
-#include <linux/delay.h>
-#include <linux/cpuhotplug.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
#include <asm/cpu.h>
+#include <asm/hwcap2.h>
#include <asm/intel-family.h>
#include <asm/microcode.h>
-#include <asm/hwcap2.h>
-#include <asm/elf.h>
-#include <asm/cpu_device_id.h>
-#include <asm/cmdline.h>
-#include <asm/traps.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include <asm/numa.h>
+#include <asm/resctrl.h>
#include <asm/thermal.h>
-
-#ifdef CONFIG_X86_64
-#include <linux/topology.h>
-#endif
+#include <asm/uaccess.h>
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
-
-enum split_lock_detect_state {
- sld_off = 0,
- sld_warn,
- sld_fatal,
- sld_ratelimit,
-};
-
-/*
- * Default to sld_off because most systems do not support split lock detection.
- * sld_state_setup() will switch this to sld_warn on systems that support
- * split lock/bus lock detect, unless there is a command line override.
- */
-static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
-static u64 msr_test_ctrl_cache __ro_after_init;
-
-/*
- * With a name like MSR_TEST_CTL it should go without saying, but don't touch
- * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
- * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
- */
-static bool cpu_model_supports_sld __ro_after_init;
-
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -72,19 +35,19 @@ static bool cpu_model_supports_sld __ro_after_init;
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
- switch (c->x86_model) {
- case INTEL_FAM6_CORE_YONAH:
- case INTEL_FAM6_CORE2_MEROM:
- case INTEL_FAM6_CORE2_MEROM_L:
- case INTEL_FAM6_CORE2_PENRYN:
- case INTEL_FAM6_CORE2_DUNNINGTON:
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_SANDYBRIDGE:
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
@@ -106,9 +69,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
*/
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
break;
default:
return;
@@ -134,32 +97,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
* - Release note from 20180108 microcode release
*/
struct sku_microcode {
- u8 model;
+ u32 vfm;
u8 stepping;
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
- { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
- { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
- { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL, 0x03, 0x23 },
- { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
- { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
- { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
/* Observed in the wild */
- { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
- { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
};
static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
@@ -173,11 +136,8 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
- if (c->x86 != 6)
- return false;
-
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
- if (c->x86_model == spectre_bad_microcodes[i].model &&
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
c->x86_stepping == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
@@ -190,101 +150,57 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
-#define TME_ACTIVATE_POLICY_AES_XTS_128 0
-
#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
-
-/* Values for mktme_status (SW only construct) */
-#define MKTME_ENABLED 0
-#define MKTME_DISABLED 1
-#define MKTME_UNINITIALIZED 2
-static int mktme_status = MKTME_UNINITIALIZED;
-
static void detect_tme_early(struct cpuinfo_x86 *c)
{
- u64 tme_activate, tme_policy, tme_crypto_algs;
- int keyid_bits = 0, nr_keyids = 0;
- static u64 tme_activate_cpu0 = 0;
+ u64 tme_activate;
+ int keyid_bits;
rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
- if (mktme_status != MKTME_UNINITIALIZED) {
- if (tme_activate != tme_activate_cpu0) {
- /* Broken BIOS? */
- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
- pr_err_once("x86/tme: MKTME is not usable\n");
- mktme_status = MKTME_DISABLED;
-
- /* Proceed. We may need to exclude bits from x86_phys_bits. */
- }
- } else {
- tme_activate_cpu0 = tme_activate;
- }
-
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
- mktme_status = MKTME_DISABLED;
+ clear_cpu_cap(c, X86_FEATURE_TME);
return;
}
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
- if (mktme_status != MKTME_UNINITIALIZED)
- goto detect_keyid_bits;
-
- pr_info("x86/tme: enabled by BIOS\n");
-
- tme_policy = TME_ACTIVATE_POLICY(tme_activate);
- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
- tme_crypto_algs);
- mktme_status = MKTME_DISABLED;
- }
-detect_keyid_bits:
- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
- nr_keyids = (1UL << keyid_bits) - 1;
- if (nr_keyids) {
- pr_info_once("x86/mktme: enabled by BIOS\n");
- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
- } else {
- pr_info_once("x86/mktme: disabled by BIOS\n");
- }
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (mktme_status == MKTME_UNINITIALIZED) {
- /* MKTME is usable */
- mktme_status = MKTME_ENABLED;
- }
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
+ return;
/*
- * KeyID bits effectively lower the number of physical address
- * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
*/
- c->x86_phys_bits -= keyid_bits;
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
}
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
- c->cpuid_level = cpuid_eax(0);
- get_cpu_cap(c);
- }
- }
-
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
@@ -312,7 +228,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -327,8 +243,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -337,46 +253,57 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
- if (c->x86 == 6) {
- switch (c->x86_model) {
- case INTEL_FAM6_ATOM_SALTWELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL_TABLET:
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
- case INTEL_FAM6_ATOM_AIRMONT_NP:
- set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
- break;
- default:
- break;
- }
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
+ *
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
- * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
- * clear the fast string and enhanced fast string CPU capabilities.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
+ *
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
pr_info("Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
@@ -393,7 +320,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
- if (c->x86 == 5 && c->x86_model == 9) {
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
@@ -423,9 +350,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c)
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping < 8) {
pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
@@ -442,9 +367,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
@@ -471,7 +395,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (c->x86 == 5 && c->x86_model < 9) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
@@ -486,7 +410,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
@@ -504,7 +429,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -518,27 +443,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
(c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
-
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- }
#endif
intel_smp_check(c);
@@ -594,8 +512,24 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void split_lock_init(void);
-static void bus_lock_init(void);
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
static void init_intel(struct cpuinfo_x86 *c)
{
@@ -625,19 +559,20 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
- (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
- ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -672,13 +607,11 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
+
/* Work around errata */
srat_detect_node(c);
@@ -687,7 +620,6 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_misc_features(c);
split_lock_init();
- bus_lock_init();
intel_init_thermal(c);
}
@@ -701,83 +633,103 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
- if ((c->x86 == 5) && (c->x86_model == 9))
+ if (c->x86_vfm == INTEL_QUARK_X1000)
size = 16;
return size;
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
+
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
+
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
+#define TLB_DATA_1G 0x16
+#define TLB_DATA_1G_2M_4M 0x17
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
-#define TLB_DATA_1G 0x16
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
+/*
+ * All of leaf 0x2's one-byte TLB descriptors implies the same number of
+ * entries for their respective TLB types. The 0x63 descriptor is an
+ * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries
+ * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for
+ * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the
+ * intel_tlb_table[] mapping.
+ */
+#define TLB_0x63_2M_4M_ENTRIES 32
-#define STLB_4K 0x41
-#define STLB_4K_2M 0x42
+struct _tlb_table {
+ unsigned char descriptor;
+ char tlb_type;
+ unsigned int entries;
+};
static const struct _tlb_table intel_tlb_table[] = {
- { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
- { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
- { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
- { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
- { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
- { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
- { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
- { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
- { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
- { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
- { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
- { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
- { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
- { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
- { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
+ { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */
+ { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */
+ { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */
+ { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */
+ { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */
+ { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */
+ { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */
+ { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */
+ { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */
+ { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */
+ { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */
+ { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */
+ { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */
+ { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */
+ { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */
+ { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */
+ { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */
+ { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */
+ { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */
+ { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */
+ { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */
{ 0x00, 0, 0 }
};
static void intel_tlb_lookup(const unsigned char desc)
{
+ unsigned int entries;
unsigned char k;
+
if (desc == 0)
return;
@@ -789,75 +741,58 @@ static void intel_tlb_lookup(const unsigned char desc)
if (intel_tlb_table[k].tlb_type == 0)
return;
+ entries = intel_tlb_table[k].entries;
switch (intel_tlb_table[k].tlb_type) {
case STLB_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case STLB_4K_2M:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_INST_ALL:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
break;
case TLB_INST_4M:
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_2M_4M:
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_4K_4M:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
case TLB_DATA_1G:
- if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_1g = max(tlb_lld_1g, entries);
break;
}
}
@@ -878,7 +813,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
@@ -952,394 +887,3 @@ static const struct cpu_dev intel_cpu_dev = {
};
cpu_dev_register(intel_cpu_dev);
-
-#undef pr_fmt
-#define pr_fmt(fmt) "x86/split lock detection: " fmt
-
-static const struct {
- const char *option;
- enum split_lock_detect_state state;
-} sld_options[] __initconst = {
- { "off", sld_off },
- { "warn", sld_warn },
- { "fatal", sld_fatal },
- { "ratelimit:", sld_ratelimit },
-};
-
-static struct ratelimit_state bld_ratelimit;
-
-static unsigned int sysctl_sld_mitigate = 1;
-static DEFINE_SEMAPHORE(buslock_sem, 1);
-
-#ifdef CONFIG_PROC_SYSCTL
-static struct ctl_table sld_sysctls[] = {
- {
- .procname = "split_lock_mitigate",
- .data = &sysctl_sld_mitigate,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_douintvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-};
-
-static int __init sld_mitigate_sysctl_init(void)
-{
- register_sysctl_init("kernel", sld_sysctls);
- return 0;
-}
-
-late_initcall(sld_mitigate_sysctl_init);
-#endif
-
-static inline bool match_option(const char *arg, int arglen, const char *opt)
-{
- int len = strlen(opt), ratelimit;
-
- if (strncmp(arg, opt, len))
- return false;
-
- /*
- * Min ratelimit is 1 bus lock/sec.
- * Max ratelimit is 1000 bus locks/sec.
- */
- if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
- ratelimit > 0 && ratelimit <= 1000) {
- ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
- ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
- return true;
- }
-
- return len == arglen;
-}
-
-static bool split_lock_verify_msr(bool on)
-{
- u64 ctrl, tmp;
-
- if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
- return false;
- if (on)
- ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- else
- ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
- return false;
- rdmsrl(MSR_TEST_CTRL, tmp);
- return ctrl == tmp;
-}
-
-static void __init sld_state_setup(void)
-{
- enum split_lock_detect_state state = sld_warn;
- char arg[20];
- int i, ret;
-
- if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- return;
-
- ret = cmdline_find_option(boot_command_line, "split_lock_detect",
- arg, sizeof(arg));
- if (ret >= 0) {
- for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
- if (match_option(arg, ret, sld_options[i].option)) {
- state = sld_options[i].state;
- break;
- }
- }
- }
- sld_state = state;
-}
-
-static void __init __split_lock_setup(void)
-{
- if (!split_lock_verify_msr(false)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- if (!split_lock_verify_msr(true)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- /* Restore the MSR to its cached value. */
- wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
-}
-
-/*
- * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
- * is not implemented as one thread could undo the setting of the other
- * thread immediately after dropping the lock anyway.
- */
-static void sld_update_msr(bool on)
-{
- u64 test_ctrl_val = msr_test_ctrl_cache;
-
- if (on)
- test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
-
- wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
-}
-
-static void split_lock_init(void)
-{
- /*
- * #DB for bus lock handles ratelimit and #AC for split lock is
- * disabled.
- */
- if (sld_state == sld_ratelimit) {
- split_lock_verify_msr(false);
- return;
- }
-
- if (cpu_model_supports_sld)
- split_lock_verify_msr(sld_state != sld_off);
-}
-
-static void __split_lock_reenable_unlock(struct work_struct *work)
-{
- sld_update_msr(true);
- up(&buslock_sem);
-}
-
-static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
-
-static void __split_lock_reenable(struct work_struct *work)
-{
- sld_update_msr(true);
-}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
-
-/*
- * If a CPU goes offline with pending delayed work to re-enable split lock
- * detection then the delayed work will be executed on some other CPU. That
- * handles releasing the buslock_sem, but because it executes on a
- * different CPU probably won't re-enable split lock detection. This is a
- * problem on HT systems since the sibling CPU on the same core may then be
- * left running with split lock detection disabled.
- *
- * Unconditionally re-enable detection here.
- */
-static int splitlock_cpu_offline(unsigned int cpu)
-{
- sld_update_msr(true);
-
- return 0;
-}
-
-static void split_lock_warn(unsigned long ip)
-{
- struct delayed_work *work;
- int cpu;
-
- if (!current->reported_split_lock)
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
- current->reported_split_lock = 1;
-
- if (sysctl_sld_mitigate) {
- /*
- * misery factor #1:
- * sleep 10ms before trying to execute split lock.
- */
- if (msleep_interruptible(10) > 0)
- return;
- /*
- * Misery factor #2:
- * only allow one buslocked disabled core at a time.
- */
- if (down_interruptible(&buslock_sem) == -EINTR)
- return;
- work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
- }
-
- cpu = get_cpu();
- schedule_delayed_work_on(cpu, work, 2);
-
- /* Disable split lock detection on this CPU to make progress */
- sld_update_msr(false);
- put_cpu();
-}
-
-bool handle_guest_split_lock(unsigned long ip)
-{
- if (sld_state == sld_warn) {
- split_lock_warn(ip);
- return true;
- }
-
- pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
- current->comm, current->pid,
- sld_state == sld_fatal ? "fatal" : "bogus", ip);
-
- current->thread.error_code = 0;
- current->thread.trap_nr = X86_TRAP_AC;
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- return false;
-}
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
-
-static void bus_lock_init(void)
-{
- u64 val;
-
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- return;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
-
- if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- (sld_state == sld_warn || sld_state == sld_fatal)) ||
- sld_state == sld_off) {
- /*
- * Warn and fatal are handled by #AC for split lock if #AC for
- * split lock is supported.
- */
- val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
- } else {
- val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- }
-
- wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
-}
-
-bool handle_user_split_lock(struct pt_regs *regs, long error_code)
-{
- if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
- return false;
- split_lock_warn(regs->ip);
- return true;
-}
-
-void handle_bus_lock(struct pt_regs *regs)
-{
- switch (sld_state) {
- case sld_off:
- break;
- case sld_ratelimit:
- /* Enforce no more than bld_ratelimit bus locks/sec. */
- while (!__ratelimit(&bld_ratelimit))
- msleep(20);
- /* Warn on the bus lock. */
- fallthrough;
- case sld_warn:
- pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
- current->comm, current->pid, regs->ip);
- break;
- case sld_fatal:
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- break;
- }
-}
-
-/*
- * CPU models that are known to have the per-core split-lock detection
- * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
- */
-static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
- {}
-};
-
-static void __init split_lock_setup(struct cpuinfo_x86 *c)
-{
- const struct x86_cpu_id *m;
- u64 ia32_core_caps;
-
- if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
-
- /* Check for CPUs that have support but do not enumerate it: */
- m = x86_match_cpu(split_lock_cpu_ids);
- if (m)
- goto supported;
-
- if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
- return;
-
- /*
- * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
- * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
- * it have split lock detection.
- */
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
- if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
- goto supported;
-
- /* CPU is not in the model list and does not have the MSR bit: */
- return;
-
-supported:
- cpu_model_supports_sld = true;
- __split_lock_setup();
-}
-
-static void sld_state_show(void)
-{
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
- return;
-
- switch (sld_state) {
- case sld_off:
- pr_info("disabled\n");
- break;
- case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
- "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
- pr_warn("No splitlock CPU offline handler\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: warning on user-space bus_locks\n");
- }
- break;
- case sld_fatal:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
- boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
- " from non-WB" : "");
- }
- break;
- case sld_ratelimit:
- if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
- break;
- }
-}
-
-void __init sld_setup(struct cpuinfo_x86 *c)
-{
- split_lock_setup(c);
- sld_state_setup();
- sld_state_show();
-}
-
-#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
-
-/**
- * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
- *
- * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
- * a hybrid processor. If the processor is not hybrid, returns 0.
- */
-u8 get_this_hybrid_cpu_type(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
-}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f18d35fe27a9..30b1d63b97f3 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
deleted file mode 100644
index 5be2b1790282..000000000000
--- a/arch/x86/kernel/cpu/intel_pconfig.c
+++ /dev/null
@@ -1,84 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel PCONFIG instruction support.
- *
- * Copyright (C) 2017 Intel Corporation
- *
- * Author:
- * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- */
-#include <linux/bug.h>
-#include <linux/limits.h>
-
-#include <asm/cpufeature.h>
-#include <asm/intel_pconfig.h>
-
-#define PCONFIG_CPUID 0x1b
-
-#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
-
-/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
-enum {
- PCONFIG_CPUID_SUBLEAF_INVALID = 0,
- PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
-};
-
-/* Bitmask of supported targets */
-static u64 targets_supported __read_mostly;
-
-int pconfig_target_supported(enum pconfig_target target)
-{
- /*
- * We would need to re-think the implementation once we get > 64
- * PCONFIG targets. Spec allows up to 2^32 targets.
- */
- BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
-
- if (WARN_ON_ONCE(target >= 64))
- return 0;
- return targets_supported & (1ULL << target);
-}
-
-static int __init intel_pconfig_init(void)
-{
- int subleaf;
-
- if (!boot_cpu_has(X86_FEATURE_PCONFIG))
- return 0;
-
- /*
- * Scan subleafs of PCONFIG CPUID leaf.
- *
- * Subleafs of the same type need not to be consecutive.
- *
- * Stop on the first invalid subleaf type. All subleafs after the first
- * invalid are invalid too.
- */
- for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
- struct cpuid_regs regs;
-
- cpuid_count(PCONFIG_CPUID, subleaf,
- &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
-
- switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
- case PCONFIG_CPUID_SUBLEAF_INVALID:
- /* Stop on the first invalid subleaf */
- goto out;
- case PCONFIG_CPUID_SUBLEAF_TARGETID:
- /* Mark supported PCONFIG targets */
- if (regs.ebx < 64)
- targets_supported |= (1ULL << regs.ebx);
- if (regs.ecx < 64)
- targets_supported |= (1ULL << regs.ecx);
- if (regs.edx < 64)
- targets_supported |= (1ULL << regs.edx);
- break;
- default:
- /* Unknown CPUID.PCONFIG subleaf: ignore */
- break;
- }
- }
-out:
- return 0;
-}
-arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60..6af1e8baeb0f 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -6,7 +6,35 @@
#include <linux/slab.h>
/**
- * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@@ -17,8 +45,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +53,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -39,9 +66,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match;
- m->vendor | m->family | m->model | m->steppings | m->feature;
- m++) {
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
@@ -53,39 +78,21 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
- return m;
- }
- return NULL;
-}
-EXPORT_SYMBOL(x86_match_cpu);
-
-static const struct x86_cpu_desc *
-x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- const struct x86_cpu_desc *m;
-
- for (m = match; m->x86_family | m->x86_model; m++) {
- if (c->x86_vendor != m->x86_vendor)
- continue;
- if (c->x86 != m->x86_family)
- continue;
- if (c->x86_model != m->x86_model)
- continue;
- if (c->x86_stepping != m->x86_stepping)
+ if (!x86_match_vendor_cpu_type(c, m))
continue;
return m;
}
return NULL;
}
+EXPORT_SYMBOL(x86_match_cpu);
-bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
- const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
+ const struct x86_cpu_id *res = x86_match_cpu(table);
- if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
+ if (!res || res->driver_data > boot_cpu_data.microcode)
return false;
return true;
}
-EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9a0133ef7e20..1075a90141da 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -4,8 +4,6 @@
*
* Written by Jacob Shin - AMD, Inc.
* Maintained by: Borislav Petkov <bp@alien8.de>
- *
- * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -20,7 +18,6 @@
#include <linux/smp.h>
#include <linux/string.h>
-#include <asm/amd_nb.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
@@ -221,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+};
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
/*
@@ -333,19 +356,6 @@ struct thresh_restart {
u16 old_limit;
};
-static inline bool is_shared_bank(int bank)
-{
- /*
- * Scalable MCA provides for only one core to have access to the MSRs of
- * a shared bank.
- */
- if (mce_flags.smca)
- return false;
-
- /* Bank 4 is for northbridge reporting and is thus shared */
- return (bank == 4);
-}
-
static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
@@ -381,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
return msr_high_bits & BIT(28);
}
-static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -389,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
if (apic != msr) {
@@ -399,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
- return 0;
+ return false;
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
- return 1;
+ return true;
};
/* Reprogram MCx_MISC MSR behind this threshold bank. */
@@ -778,29 +788,33 @@ bool amd_mce_usable_address(struct mce *m)
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m = &err.m;
- mce_setup(&m);
+ mce_prep_record(&err);
- m.status = status;
- m.misc = misc;
- m.bank = bank;
- m.tsc = rdtsc();
+ m->status = status;
+ m->misc = misc;
+ m->bank = bank;
+ m->tsc = rdtsc();
- if (m.status & MCI_STATUS_ADDRV) {
- m.addr = addr;
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = addr;
- smca_extract_err_addr(&m);
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
- if (m.status & MCI_STATUS_SYNDV)
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+ }
}
- mce_log(&m);
+ mce_log(&err);
}
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
@@ -1194,35 +1208,10 @@ out_free:
return err;
}
-static int __threshold_add_blocks(struct threshold_bank *b)
-{
- struct list_head *head = &b->blocks->miscj;
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- int err = 0;
-
- err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
- if (err)
- return err;
-
- list_for_each_entry_safe(pos, tmp, head, miscj) {
-
- err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
- if (err) {
- list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
- kobject_del(&pos->kobj);
-
- return err;
- }
- }
- return err;
-}
-
static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
unsigned int bank)
{
struct device *dev = this_cpu_read(mce_device);
- struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(cpu, bank, NULL);
int err = 0;
@@ -1230,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
if (!dev)
return -ENODEV;
- if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(topology_amd_node_id(cpu));
-
- /* threshold descriptor already initialized on this node? */
- if (nb && nb->bank4) {
- /* yes, use it */
- b = nb->bank4;
- err = kobject_add(b->kobj, &dev->kobj, name);
- if (err)
- goto out;
-
- bp[bank] = b;
- refcount_inc(&b->cpus);
-
- err = __threshold_add_blocks(b);
-
- goto out;
- }
- }
-
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
@@ -1263,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}
- if (is_shared_bank(bank)) {
- b->shared = 1;
- refcount_set(&b->cpus, 1);
-
- /* nb is already initialized, see above */
- if (nb) {
- WARN_ON(nb->bank4);
- nb->bank4 = b;
- }
- }
-
err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1306,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank)
kobject_put(&bank->blocks->kobj);
}
-static void __threshold_remove_blocks(struct threshold_bank *b)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
-
- kobject_put(b->kobj);
-
- list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
- kobject_put(b->kobj);
-}
-
static void threshold_remove_bank(struct threshold_bank *bank)
{
- struct amd_northbridge *nb;
-
if (!bank->blocks)
goto out_free;
- if (!bank->shared)
- goto out_dealloc;
-
- if (!refcount_dec_and_test(&bank->cpus)) {
- __threshold_remove_blocks(bank);
- return;
- } else {
- /*
- * The last CPU on this node using the shared bank is going
- * away, remove that bank now.
- */
- nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id()));
- nb->bank4 = NULL;
- }
-
-out_dealloc:
deallocate_threshold_blocks(bank);
out_free:
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 7f7309ff67d0..0a89947e47bc 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -28,7 +28,8 @@
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
int lsb;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
@@ -44,30 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
else
lsb = PAGE_SHIFT;
- mce_setup(&m);
- m.bank = -1;
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
- m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
if (severity >= GHES_SEV_RECOVERABLE)
- m.status |= MCI_STATUS_UC;
+ m->status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC) {
- m.status |= MCI_STATUS_PCC;
- m.tsc = rdtsc();
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
}
- m.addr = mem_err->physical_addr;
- mce_log(&m);
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
{
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
- unsigned int cpu;
- struct mce m;
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
if (!boot_cpu_has(X86_FEATURE_SMCA))
return -EINVAL;
@@ -85,41 +89,86 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
return -EINVAL;
/*
- * The register array size must be large enough to include all the
- * SMCA registers which need to be extracted.
- *
* The number of registers in the register array is determined by
* Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
- * The register layout is fixed and currently the raw data in the
- * register array includes 6 SMCA registers which the kernel can
- * extract.
+ * Sanity-check registers array size.
*/
- if (ctx_info->reg_arr_size < 48)
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
return -EINVAL;
- mce_setup(&m);
-
- m.extcpu = -1;
- m.socketid = -1;
-
for_each_possible_cpu(cpu) {
if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
- m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).topo.pkg_id;
+ apicid_found = true;
break;
}
}
- m.apicid = lapic_id;
- m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
- m.status = *i_mce;
- m.addr = *(i_mce + 1);
- m.misc = *(i_mce + 2);
- /* Skipping MCA_CONFIG */
- m.ipid = *(i_mce + 4);
- m.synd = *(i_mce + 5);
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers to expected max (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
- mce_log(&m);
+ mce_log(&err);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 84d41be6d06b..f6fd71b64b66 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -47,7 +47,7 @@
#include <linux/kexec.h>
#include <asm/fred.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-static DEFINE_PER_CPU(struct mce, mces_seen);
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
static unsigned long mce_need_notify;
/*
@@ -117,28 +117,41 @@ static struct irq_work mce_irq_work;
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+void mce_prep_record_common(struct mce *m)
{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
- m->time = __ktime_get_real_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).topo.pkg_id;
- m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
- m->ppin = cpu_data(m->extcpu).ppin;
- m->microcode = boot_cpu_data.microcode;
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
}
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-void mce_log(struct mce *m)
+void mce_log(struct mce_hw_err *err)
{
- if (!mce_gen_pool_add(m))
+ if (mce_gen_pool_add(err))
irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);
@@ -159,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static void __print_mce(struct mce *m)
+static void __print_mce(struct mce_hw_err *err)
{
+ struct mce *m = &err->m;
+
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
m->extcpu,
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
@@ -187,6 +202,10 @@ static void __print_mce(struct mce *m)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -202,9 +221,11 @@ static void __print_mce(struct mce *m)
m->microcode);
}
-static void print_mce(struct mce *m)
+static void print_mce(struct mce_hw_err *err)
{
- __print_mce(m);
+ struct mce *m = &err->m;
+
+ __print_mce(err);
if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
@@ -239,7 +260,7 @@ static const char *mce_dump_aux_info(struct mce *m)
return NULL;
}
-static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
@@ -270,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
}
/* Now print uncorrected but with the final one last */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || mce_cmp(m, final)) {
- print_mce(m);
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
@@ -291,12 +314,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (final) {
print_mce(final);
if (!apei_err)
- apei_err = apei_write_mce(final);
+ apei_err = apei_write_mce(&final->m);
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
- memmsg = mce_dump_aux_info(final);
+ memmsg = mce_dump_aux_info(&final->m);
if (memmsg)
pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
@@ -311,9 +334,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
* panic.
*/
if (kexec_crash_loaded()) {
- if (final && (final->status & MCI_STATUS_ADDRV)) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
struct page *p;
- p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
if (p)
SetPageHWPoison(p);
}
@@ -433,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
{
+ struct mce *m;
/*
- * Enable instrumentation around mce_setup() which calls external
+ * Enable instrumentation around mce_prep_record() which calls external
* facilities.
*/
instrumentation_begin();
- mce_setup(m);
+ mce_prep_record(err);
instrumentation_end();
+ m = &err->m;
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
/*
@@ -467,10 +492,10 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
}
-int mce_available(struct cpuinfo_x86 *c)
+bool mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
- return 0;
+ return false;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -559,16 +584,38 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
/* Emit the trace record: */
- trace_mce_record(m);
+ trace_mce_record(err);
set_bit(0, &mce_need_notify);
@@ -612,13 +659,13 @@ static struct notifier_block mce_uc_nb = {
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
- if (mca_cfg.print_all || !m->kflags)
- __print_mce(m);
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
return NOTIFY_DONE;
}
@@ -632,8 +679,10 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static noinstr void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
{
+ struct mce *m = &err->m;
+
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
@@ -655,8 +704,11 @@ static noinstr void mce_read_aux(struct mce *m, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}
@@ -677,30 +729,31 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- bool error_seen = false;
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
int i;
this_cpu_inc(mce_poll_count);
- mce_gather_info(&m, NULL);
+ mce_gather_info(&err, NULL);
+ m = &err.m;
if (flags & MCP_TIMESTAMP)
- m.tsc = rdtsc();
+ m->tsc = rdtsc();
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
barrier();
- m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/*
* Update storm tracking here, before checking for the
@@ -710,17 +763,17 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* storm status.
*/
if (!mca_cfg.cmci_disabled)
- mce_track_storm(&m);
+ mce_track_storm(m);
/* If this entry is not valid, ignore it */
- if (!(m.status & MCI_STATUS_VAL))
+ if (!(m->status & MCI_STATUS_VAL))
continue;
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
*/
- if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
goto log_it;
/*
@@ -730,20 +783,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* everything else.
*/
if (!mca_cfg.ser) {
- if (m.status & MCI_STATUS_UC)
+ if (m->status & MCI_STATUS_UC)
continue;
goto log_it;
}
/* Log "not enabled" (speculative) errors */
- if (!(m.status & MCI_STATUS_EN))
+ if (!(m->status & MCI_STATUS_EN))
goto log_it;
/*
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
* UC == 1 && PCC == 0 && S == 0
*/
- if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
goto log_it;
/*
@@ -754,25 +807,23 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
continue;
log_it:
- error_seen = true;
-
if (flags & MCP_DONTLOG)
goto clear_it;
- mce_read_aux(&m, i);
- m.severity = mce_severity(&m, NULL, NULL, false);
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
goto clear_it;
if (flags & MCP_QUEUE_LOG)
- mce_gen_pool_add(&m);
+ mce_gen_pool_add(&err);
else
- mce_log(&m);
+ mce_log(&err);
clear_it:
/*
@@ -787,8 +838,6 @@ clear_it:
*/
sync_core();
-
- return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -898,9 +947,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
+ struct mce *m = &err->m;
char *tmp = *msg;
int i;
@@ -918,7 +968,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
*msg = tmp;
return 1;
}
@@ -1009,10 +1059,11 @@ out:
*/
static void mce_reign(void)
{
- int cpu;
+ struct mce_hw_err *err = NULL;
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
+ int cpu;
/*
* This CPU is the Monarch and the other CPUs have run
@@ -1020,11 +1071,13 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- struct mce *mtmp = &per_cpu(mces_seen, cpu);
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
if (mtmp->severity > global_worst) {
global_worst = mtmp->severity;
- m = &per_cpu(mces_seen, cpu);
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
}
}
@@ -1036,7 +1089,7 @@ static void mce_reign(void)
if (m && global_worst >= MCE_PANIC_SEVERITY) {
/* call mce_severity() to get "msg" for panic */
mce_severity(m, NULL, &msg, true);
- mce_panic("Fatal machine check", m, msg);
+ mce_panic("Fatal machine check", err, msg);
}
/*
@@ -1053,11 +1106,11 @@ static void mce_reign(void)
mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
- * Now clear all the mces_seen so that they don't reappear on
+ * Now clear all the hw_errs_seen so that they don't reappear on
* the next mce.
*/
for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
}
static atomic_t global_nwo;
@@ -1261,13 +1314,14 @@ static noinstr bool mce_check_crashing_cpu(void)
}
static __always_inline int
-__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
- int *worst)
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
int severity, i, taint = 0;
+ struct mce *m = &err->m;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
arch___clear_bit(i, toclear);
@@ -1312,7 +1366,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
if (severity == MCE_NO_SEVERITY)
continue;
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
/* assuming valid severity level != 0 */
m->severity = severity;
@@ -1322,17 +1376,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
* done in #MC context, where instrumentation is disabled.
*/
instrumentation_begin();
- mce_log(m);
+ mce_log(err);
instrumentation_end();
if (severity > *worst) {
- *final = *m;
+ *final = *err;
*worst = severity;
}
}
/* mce_clear_state will clear *final, save locally for use later */
- *m = *final;
+ *err = *final;
return taint;
}
@@ -1392,9 +1446,10 @@ static void kill_me_never(struct callback_head *cb)
set_mce_nospec(pfn);
}
-static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
+ struct mce *m = &err->m;
/* First call, save all the details */
if (count == 1) {
@@ -1407,11 +1462,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
if (count > 10)
- mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
/* Second or later call, make sure page address matches the one from first call */
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
- mce_panic("Consecutive machine checks to different user pages", m, msg);
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
/* Do not call task_work_add() more than once */
if (count > 1)
@@ -1460,8 +1516,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
- struct mce m, *final;
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
char *msg = NULL;
+ struct mce *m;
if (unlikely(mce_flags.p5))
return pentium_machine_check(regs);
@@ -1499,13 +1557,14 @@ noinstr void do_machine_check(struct pt_regs *regs)
this_cpu_inc(mce_exception_count);
- mce_gather_info(&m, regs);
- m.tsc = rdtsc();
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
- final = this_cpu_ptr(&mces_seen);
- *final = m;
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
- no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
barrier();
@@ -1514,15 +1573,15 @@ noinstr void do_machine_check(struct pt_regs *regs)
* Assume the worst for now, but if we find the
* severity is MCE_AR_SEVERITY we have other options.
*/
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
kill_current_task = 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL ||
- m.cpuvendor == X86_VENDOR_ZHAOXIN)
- lmce = m.mcgstatus & MCG_STATUS_LMCES;
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
/*
* Local machine check may already know that we have to panic.
@@ -1533,12 +1592,12 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (lmce) {
if (no_way_out)
- mce_panic("Fatal local machine check", &m, msg);
+ mce_panic("Fatal local machine check", &err, msg);
} else {
order = mce_start(&no_way_out);
}
- taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1553,7 +1612,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
no_way_out = worst >= MCE_PANIC_SEVERITY;
if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
+ mce_panic("Fatal machine check on current CPU", &err, msg);
}
} else {
/*
@@ -1565,8 +1624,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
* make sure we have the right "msg".
*/
if (worst >= MCE_PANIC_SEVERITY) {
- mce_severity(&m, regs, &msg, true);
- mce_panic("Local fatal machine check!", &m, msg);
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
}
}
@@ -1584,15 +1643,33 @@ noinstr void do_machine_check(struct pt_regs *regs)
goto out;
/* Fault was in user mode and we need to take some action */
- if ((m.cs & 3) == 3) {
+ if ((m->cs & 3) == 3) {
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (!mce_usable_address(&m))
- queue_task_work(&m, msg, kill_me_now);
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
else
- queue_task_work(&m, msg, kill_me_maybe);
+ queue_task_work(&err, msg, kill_me_maybe);
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1603,13 +1680,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
* corresponding exception handler which would do that is the
* proper one.
*/
- if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ mce_panic("Failed kernel mode recovery", &err, msg);
}
- if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, msg, kill_me_never);
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
}
out:
@@ -1709,37 +1786,15 @@ void mce_timer_kick(bool storm)
__this_cpu_write(mce_next_interval, check_interval * HZ);
}
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
int cpu;
for_each_online_cpu(cpu)
- del_timer_sync(&per_cpu(mce_timer, cpu));
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
}
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_work_trigger();
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
static void __mcheck_cpu_mce_banks_init(void)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
@@ -1855,101 +1910,120 @@ static void __mcheck_cpu_check_banks(void)
}
}
-/* Add per CPU specific workarounds here */
-static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- struct mca_config *cfg = &mca_cfg;
-
- if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("unknown CPU type - not enabling MCE support\n");
- return -EOPNOTSUPP;
- }
/* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
- if (c->x86 < 0x11 && cfg->bootlog < 0) {
- /*
- * Lots of broken BIOS around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- cfg->bootlog = 0;
- }
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
*/
- if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].ctl = 0;
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
- * overflow_recov is supported for F15h Models 00h-0fh
- * even though we don't have a CPUID bit for it.
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
*/
- if (c->x86 == 0x15 && c->x86_model <= 0xf)
- mce_flags.overflow_recov = 1;
+ mca_cfg.bootlog = 0;
+ }
- if (c->x86 >= 0x17 && c->x86 <= 0x1A)
- mce_flags.zen_ifu_quirk = 1;
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
- }
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
- if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].init = false;
+static void apply_quirks_intel(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
- /*
- * There are also broken BIOSes on some Pentium M and
- * earlier systems:
- */
- if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
- cfg->bootlog = 0;
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ mce_banks[0].init = false;
- if (c->x86 == 6 && c->x86_model == 45)
- mce_flags.snb_ifu_quirk = 1;
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
- /*
- * Skylake, Cascacde Lake and Cooper Lake require a quirk on
- * rep movs.
- */
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
- mce_flags.skx_repmov_quirk = 1;
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
}
+}
- if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
- /*
- * All newer Zhaoxin CPUs support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
- }
+/* Add per CPU specific workarounds here */
+static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_UNKNOWN:
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return false;
+ case X86_VENDOR_AMD:
+ apply_quirks_amd(c);
+ break;
+ case X86_VENDOR_INTEL:
+ apply_quirks_intel(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ apply_quirks_zhaoxin(c);
+ break;
}
if (cfg->monarch_timeout < 0)
@@ -1957,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;
- return 0;
+ return true;
}
-static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return 0;
+ return false;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
mce_flags.p5 = 1;
- return 1;
+ return true;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
mce_flags.winchip = 1;
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
- return 0;
+ return false;
}
/*
@@ -2044,13 +2118,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
break;
- case X86_VENDOR_AMD: {
- mce_amd_feature_init(c);
- break;
- }
-
+ case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
- mce_hygon_feature_init(c);
+ mce_amd_feature_init(c);
break;
case X86_VENDOR_CENTAUR:
@@ -2224,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
- if (__mcheck_cpu_apply_quirks(c) < 0) {
+ if (!__mcheck_cpu_apply_quirks(c)) {
mca_cfg.disabled = 1;
return;
}
- if (mce_gen_pool_init()) {
+ if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
@@ -2750,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu)
struct timer_list *t = this_cpu_ptr(&mce_timer);
mce_disable_cpu();
- del_timer_sync(t);
+ timer_delete_sync(t);
mce_threshold_remove_device(cpu);
mce_device_remove(cpu);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index a05ac0716ecf..8d023239ce18 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(mcelog->len, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog->flags;
- } while (cmpxchg(&mcelog->flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
default:
return -ENOTTY;
}
@@ -314,7 +307,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
/*
* Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * so do it a jiffy or two later everywhere.
*/
schedule_timeout(2);
@@ -331,7 +324,6 @@ static const struct file_operations mce_chrdev_ops = {
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
- .llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c3413..3ca9c007a666 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ];
*/
static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
{
+ struct mce_hw_err *err1, *err2;
struct mce_evt_llist *node;
- struct mce *m1, *m2;
- m1 = &t->mce;
+ err1 = &t->err;
llist_for_each_entry(node, &l->llnode, llnode) {
- m2 = &node->mce;
+ err2 = &node->err;
- if (!mce_cmp(m1, m2))
+ if (!mce_cmp(&err1->m, &err2->m))
return true;
}
return false;
@@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void)
void mce_gen_pool_process(struct work_struct *__unused)
{
- struct llist_node *head;
struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
struct mce *mce;
head = llist_del_all(&mce_event_llist);
@@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
head = llist_reverse_order(head);
llist_for_each_entry_safe(node, tmp, head, llnode) {
- mce = &node->mce;
+ mce = &node->err.m;
blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
}
@@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void)
return llist_empty(&mce_event_llist);
}
-int mce_gen_pool_add(struct mce *mce)
+bool mce_gen_pool_add(struct mce_hw_err *err)
{
struct mce_evt_llist *node;
- if (filter_mce(mce))
- return -EINVAL;
+ if (filter_mce(&err->m))
+ return false;
if (!mce_evt_pool)
- return -EINVAL;
+ return false;
node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
- return -ENOMEM;
+ return false;
}
- memcpy(&node->mce, mce, sizeof(*mce));
+ memcpy(&node->err, err, sizeof(*err));
llist_add(&node->llnode, &mce_event_llist);
- return 0;
+ return true;
}
-static int mce_gen_pool_create(void)
+static bool mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
- int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
- if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
- return ret;
+ return true;
}
-int mce_gen_pool_init(void)
+bool mce_gen_pool_init(void)
{
/* Just init mce_gen_pool once. */
if (mce_evt_pool)
- return 0;
+ return true;
return mce_gen_pool_create();
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 94953d749475..06e3cf7229ce 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -229,7 +229,6 @@ static int raise_local(void)
} else if (m->status) {
pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
- mce_notify_irq();
pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
@@ -487,19 +486,24 @@ static void prepare_msrs(void *info)
wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
} else {
wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+
+ if (m.misc)
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
}
static void do_inject(void)
{
- u64 mcg_status = 0;
unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
u8 b = i_mce.bank;
i_mce.tsc = rdtsc_ordered();
@@ -513,7 +517,8 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_log(&i_mce);
+ err.m = i_mce;
+ mce_log(&err);
return;
}
@@ -795,4 +800,5 @@ static void __exit inject_exit(void)
module_init(inject_init);
module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 399b62e223d2..f863df0ff42c 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS];
*/
#define CMCI_STORM_THRESHOLD 32749
-static int cmci_supported(int *banks)
+static bool cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
- return 0;
+ return false;
/*
* Vendor check is not strictly needed, but the initial
@@ -89,12 +89,13 @@ static int cmci_supported(int *banks)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 0;
+ return false;
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
- return 0;
+ return false;
+
rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);
}
@@ -455,10 +456,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
@@ -484,12 +485,11 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 01f8f03969e6..95a504ece43e 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain;
struct mce_evt_llist {
struct llist_node llnode;
- struct mce mce;
+ struct mce_hw_err err;
};
void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
-int mce_gen_pool_add(struct mce *mce);
-int mce_gen_pool_init(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
@@ -261,6 +261,8 @@ enum mca_msr {
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -288,14 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
@@ -386,7 +396,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 89e31e1e5c9c..f4a007616468 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -90,7 +90,7 @@ void cmci_storm_end(unsigned int bank)
storm->banks[bank].in_storm_mode = false;
/* If no banks left in storm mode, stop polling. */
- if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
+ if (!--storm->stormy_bank_count)
mce_timer_kick(false);
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 13b45b9c806d..4a10d35e70aa 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -23,17 +23,22 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/bsearch.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <crypto/sha2.h>
+
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#include <asm/setup.h>
#include <asm/cpu.h>
#include <asm/msr.h>
+#include <asm/tlb.h>
#include "internal.h"
@@ -84,13 +89,36 @@ struct microcode_amd {
unsigned int mpb[];
};
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
} equiv_table;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd/builtin before jettisoning its contents. @mc is the
@@ -98,7 +126,6 @@ static struct equiv_cpu_table {
*/
struct cont_desc {
struct microcode_amd *mc;
- u32 cpuid_1_eax;
u32 psize;
u8 *data;
size_t size;
@@ -111,10 +138,154 @@ struct cont_desc {
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ * already contains the f/m/s for which the microcode is destined
+ * for.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ switch (cur_rev >> 8) {
+ case 0x80012: return cur_rev <= 0x800126f; break;
+ case 0x80082: return cur_rev <= 0x800820f; break;
+ case 0x83010: return cur_rev <= 0x830107c; break;
+ case 0x86001: return cur_rev <= 0x860010e; break;
+ case 0x86081: return cur_rev <= 0x8608108; break;
+ case 0x87010: return cur_rev <= 0x8701034; break;
+ case 0x8a000: return cur_rev <= 0x8a0000a; break;
+ case 0xa0010: return cur_rev <= 0xa00107a; break;
+ case 0xa0011: return cur_rev <= 0xa0011da; break;
+ case 0xa0012: return cur_rev <= 0xa001243; break;
+ case 0xa0082: return cur_rev <= 0xa00820e; break;
+ case 0xa1011: return cur_rev <= 0xa101153; break;
+ case 0xa1012: return cur_rev <= 0xa10124e; break;
+ case 0xa1081: return cur_rev <= 0xa108109; break;
+ case 0xa2010: return cur_rev <= 0xa20102f; break;
+ case 0xa2012: return cur_rev <= 0xa201212; break;
+ case 0xa4041: return cur_rev <= 0xa404109; break;
+ case 0xa5000: return cur_rev <= 0xa500013; break;
+ case 0xa6012: return cur_rev <= 0xa60120a; break;
+ case 0xa7041: return cur_rev <= 0xa704109; break;
+ case 0xa7052: return cur_rev <= 0xa705208; break;
+ case 0xa7080: return cur_rev <= 0xa708009; break;
+ case 0xa70c0: return cur_rev <= 0xa70C009; break;
+ case 0xaa001: return cur_rev <= 0xaa00116; break;
+ case 0xaa002: return cur_rev <= 0xaa00218; break;
+ case 0xb0021: return cur_rev <= 0xb002146; break;
+ case 0xb1010: return cur_rev <= 0xb101046; break;
+ case 0xb2040: return cur_rev <= 0xb204031; break;
+ case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb6000: return cur_rev <= 0xb600031; break;
+ case 0xb7000: return cur_rev <= 0xb700031; break;
+ default: break;
+ }
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state s;
+ int i;
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256_init(&s);
+ sha256_update(&s, data, len);
+ sha256_final(&s, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
if (!et || !et->num_entries)
return 0;
@@ -161,6 +332,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
if (!verify_container(buf, buf_size))
return false;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
pr_debug("Wrong microcode container equivalence table type: %u.\n",
@@ -187,8 +362,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
-static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
@@ -224,12 +398,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
* exceed the per-family maximum). @sh_psize is the size read from the section
* header.
*/
-static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size)
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
u32 max_size;
if (family >= 0x15)
- return min_t(u32, sh_psize, buf_size);
+ goto ret;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -243,13 +418,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
break;
default:
WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
- return 0;
+ return false;
}
- if (sh_psize > min_t(u32, buf_size, max_size))
- return 0;
+ if (sh_psize > max_size)
+ return false;
- return sh_psize;
+ret:
+ /* Working with the whole buffer so < is ok. */
+ return sh_psize <= buf_size;
}
/*
@@ -260,11 +437,10 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* positive: patch is not for this family, skip it
* 0: success
*/
-static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
- unsigned int ret;
u32 sh_psize;
u16 proc_id;
u8 patch_fam;
@@ -288,8 +464,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return -1;
}
- ret = __verify_patch_size(family, sh_psize, buf_size);
- if (!ret) {
+ if (!__verify_patch_size(sh_psize, buf_size)) {
pr_debug("Per-family patch size mismatch.\n");
return -1;
}
@@ -310,10 +485,19 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return 0;
}
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
/*
* This scans the ucode blob for the proper container as we can have multiple
- * containers glued together. Returns the equivalence ID from the equivalence
- * table or 0 if none found.
+ * containers glued together.
+ *
* Returns the amount of bytes consumed while scanning. @desc contains all the
* data we're going to use in later stages of the application.
*/
@@ -338,7 +522,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
* doesn't contain a patch for the CPU, scan through the whole container
* so that it can be skipped in case there are other containers appended.
*/
- eq_id = find_equiv_id(&table, desc->cpuid_1_eax);
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
buf += hdr[2] + CONTAINER_HDR_SZ;
size -= hdr[2] + CONTAINER_HDR_SZ;
@@ -352,7 +536,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
+ ret = verify_patch(buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -365,7 +549,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id) {
+ if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
}
@@ -415,59 +599,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
}
}
-static int __apply_microcode_amd(struct microcode_amd *mc)
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
{
- u32 rev, dummy;
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc->hdr.patch_id)
- return -1;
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr);
- return 0;
-}
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- *
- * Returns true if container found (sets @desc), false otherwise.
- */
-static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
-{
- struct cont_desc desc = { 0 };
- struct microcode_amd *mc;
- bool ret = false;
-
- desc.cpuid_1_eax = cpuid_1_eax;
+ invlpg(p_addr);
- scan_containers(ucode, size, &desc);
-
- mc = desc.mc;
- if (!mc)
- return ret;
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
+ }
- /*
- * Allow application of the same revision to pick up SMT-specific
- * changes even if the revision of the other SMT thread is already
- * up-to-date.
- */
- if (old_rev > mc->hdr.patch_id)
- return ret;
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
- return !__apply_microcode_amd(mc);
+ return true;
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct firmware fw;
if (IS_ENABLED(CONFIG_X86_32))
@@ -486,85 +652,144 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
{
struct cpio_data cp;
+ bool found;
- if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ if (!get_builtin_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
- *ret = cp;
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
+
+ return found;
}
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
struct cpio_data cp = { };
- u32 dummy;
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
+
+ bsp_cpuid_1_eax = cpuid_1_eax;
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
+ rev = get_patch_level();
+ ed->old_rev = rev;
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
- find_blobs_in_containers(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
+ if (!find_blobs_in_containers(&cp))
return;
- if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
-}
-
-static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
-
-static int __init save_microcode_in_initrd(void)
-{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
- struct cpuinfo_x86 *c = &boot_cpu_data;
- struct cont_desc desc = { 0 };
- enum ucode_state ret;
- struct cpio_data cp;
-
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
- return 0;
+ scan_containers(cp.data, cp.size, &desc);
- find_blobs_in_containers(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
- return -EINVAL;
+ mc = desc.mc;
+ if (!mc)
+ return;
- desc.cpuid_1_eax = cpuid_1_eax;
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
- scan_containers(cp.data, cp.size, &desc);
- if (!desc.mc)
- return -EINVAL;
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
+}
- ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
- if (ret > UCODE_UPDATED)
- return -EINVAL;
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
- return 0;
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
}
-early_initcall(save_microcode_in_initrd);
/*
* a small, trivial cache of per-family ucode patches
*/
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
{
struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ WARN_ON_ONCE(!n.patch_id);
list_for_each_entry(p, &microcode_cache, plist)
- if (p->equiv_cpu == equiv_cpu)
+ if (patch_cpus_equivalent(p, &n, false))
return p;
+
return NULL;
}
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
+ int ret;
list_for_each_entry(p, &microcode_cache, plist) {
- if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
@@ -595,13 +820,17 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u16 equiv_id;
+ u16 equiv_id = 0;
- equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
- if (!equiv_id)
- return NULL;
+ uci->cpu_sig.rev = get_patch_level();
- return cache_find_patch(equiv_id);
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
+
+ return cache_find_patch(uci, equiv_id);
}
void reload_ucode_amd(unsigned int cpu)
@@ -616,22 +845,20 @@ void reload_ucode_amd(unsigned int cpu)
mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
+ rev = get_patch_level();
if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc))
- pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
}
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
- csig->rev = c->microcode;
+ csig->rev = get_patch_level();
/*
* a patch could have been loaded early, set uci->mc so that
@@ -651,7 +878,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy __always_unused;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -661,18 +888,18 @@ static enum ucode_state apply_microcode_amd(int cpu)
if (!p)
return UCODE_NFOUND;
+ rev = uci->cpu_sig.rev;
+
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/* need to apply patch? */
if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
goto out;
}
- if (__apply_microcode_amd(mc_amd)) {
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return UCODE_ERROR;
@@ -711,6 +938,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
hdr = (const u32 *)buf;
equiv_tbl_len = hdr[2];
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
equiv_table.entry = vmalloc(equiv_tbl_len);
if (!equiv_table.entry) {
pr_err("failed to allocate equivalent CPU table\n");
@@ -720,12 +951,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+out:
/* add header length */
return equiv_tbl_len + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
vfree(equiv_table.entry);
memset(&equiv_table, 0, sizeof(equiv_table));
}
@@ -751,7 +986,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size);
+ ret = verify_patch(fw, leftover, patch_size);
if (ret)
return ret;
@@ -776,7 +1011,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
@@ -786,8 +1021,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
}
/* Scan the blob in @data and add microcode patches to the cache. */
-static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
- size_t size)
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
{
u8 *fw = (u8 *)data;
size_t offset;
@@ -820,23 +1054,32 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
{
- struct cpuinfo_x86 *c;
- unsigned int nid, cpu;
- struct ucode_patch *p;
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
ret = __load_microcode_amd(family, data, size);
- if (ret != UCODE_OK) {
+ if (ret != UCODE_OK)
cleanup();
+
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
return ret;
- }
- for_each_node(nid) {
+ for_each_node_with_cpus(nid) {
cpu = cpumask_first(cpumask_of_node(nid));
c = &cpu_data(cpu);
@@ -853,6 +1096,32 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
return ret;
}
+static int __init save_microcode_in_initrd(void)
+{
+ unsigned int cpuid_1_eax = native_cpuid_eax(1);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
/*
* AMD microcode firmware naming convention, up to family 15h they are in
* the legacy file:
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..2a1655b1fdd8
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,444 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 232026a239a6..b3658d11e7b6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
*/
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
/*
* Those patch levels cannot be updated to newer ones and thus should be final.
*/
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5f0414452b67..819199bc0119 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -21,7 +21,7 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig)
sig->pf = 0;
sig->rev = intel_get_microcode_revision();
- if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) {
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
unsigned int val[2];
/* get processor flags from MSR 0x17 */
@@ -319,12 +319,6 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
return UCODE_OK;
}
- /*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
- */
- native_wbinvd();
-
/* write microcode via MSR 0x79 */
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
@@ -574,15 +568,14 @@ static bool is_blacklisted(unsigned int cpu)
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
* and LLC size per core bigger than 2.5MB may result in a system hang.
- * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
- pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
return true;
}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 21776c529fa9..5df621752fef 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -100,14 +100,12 @@ extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
-int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
-static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
static inline void exit_amd_microcode(void) { }
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 1db560ed2ca3..68f537347466 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -30,8 +30,7 @@ dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
- [ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" = '""' ] && continue
+ [ ! "$VALUE" ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e0fd57a8ba84..3e2533954675 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -16,11 +16,10 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
-#include <linux/i8253.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idtentry.h>
@@ -34,8 +33,6 @@
#include <asm/numa.h>
#include <asm/svm.h>
-/* Is Linux running as the root partition? */
-bool hv_root_partition;
/* Is Linux running on nested Microsoft Hypervisor */
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
@@ -110,6 +107,7 @@ void hv_set_msr(unsigned int reg, u64 value)
}
EXPORT_SYMBOL_GPL(hv_set_msr);
+static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -120,6 +118,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
if (vmbus_handler)
vmbus_handler();
@@ -129,6 +130,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@@ -199,8 +205,8 @@ static void hv_machine_shutdown(void)
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
- if (kexec_in_progress && hyperv_init_cpuhp > 0)
- cpuhp_remove_state(hyperv_init_cpuhp);
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
@@ -224,6 +230,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
hyperv_cleanup();
}
#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
@@ -366,6 +429,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
@@ -380,13 +444,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -395,25 +461,7 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
- /*
- * Check CPU management privilege.
- *
- * To mirror what Windows does we should extract CPU management
- * features and use the ReservedIdentityBit to detect if Linux is the
- * root partition. But that requires negotiating CPU management
- * interface (a process to be finalized). For now, use the privilege
- * flag as the indicator for running as root.
- *
- * Hyper-V should never specify running as root and as a Confidential
- * VM. But to protect against a compromised/malicious Hyper-V trying
- * to exploit root behavior to expose Confidential VM memory, ignore
- * the root partition setting if also a Confidential VM.
- */
- if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
- !(ms_hyperv.priv_high & HV_ISOLATION)) {
- hv_root_partition = true;
- pr_info("Hyper-V: running as root partition\n");
- }
+ hv_identify_partition_type();
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
@@ -424,6 +472,7 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
@@ -449,9 +498,23 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) {
- /* To be supported: more work is required. */
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
/* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
@@ -522,16 +585,6 @@ static void __init ms_hyperv_init_platform(void)
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
- /*
- * Hyper-V VMs have a PIT emulation quirk such that zeroing the
- * counter register during PIT shutdown restarts the PIT. So it
- * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
- * to false tells pit_shutdown() not to zero the counter so that
- * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
- * and setting this value has no effect.
- */
- i8253_clear_counter_on_shutdown = false;
-
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
ms_hyperv.paravisor_present)
@@ -557,7 +610,7 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition ||
+ if (hv_root_partition() ||
(!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
@@ -575,6 +628,7 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
hv_vtl_init_platform();
#endif
/*
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7b29ebda024f..e2c6b471d230 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,9 +9,11 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
@@ -423,7 +425,7 @@ void __init mtrr_copy_map(void)
}
/**
- * mtrr_overwrite_state - set static MTRR state
+ * guest_force_mtrr_state - set static MTRR state for a guest
*
* Used to set MTRR state via different means (e.g. with data obtained from
* a hypervisor).
@@ -436,8 +438,8 @@ void __init mtrr_copy_map(void)
* @num_var: length of the @var array
* @def_type: default caching type
*/
-void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
- mtrr_type def_type)
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
{
unsigned int i;
@@ -646,10 +648,10 @@ static void __init print_mtrr_state(void)
pr_info("MTRR default type: %s\n",
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_info("MTRR fixed ranges %sabled:\n",
- ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
- (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
- "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -661,8 +663,8 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_info("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -1025,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a5c506f6da7f..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
memset(line, 0, LINE_SIZE);
@@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
if (length < 0)
return length;
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..ecbda0341a8a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -55,6 +55,12 @@
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
@@ -609,7 +615,7 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
@@ -619,7 +625,7 @@ void mtrr_save_state(void)
static int __init mtrr_init_finalize(void)
{
/*
- * Map might exist if mtrr_overwrite_state() has been called or if
+ * Map might exist if guest_force_mtrr_state() has been called or if
* mtrr_enabled() returns true.
*/
mtrr_copy_map();
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e65fae63660e..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: yes\n",
- boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
c->cpuid_level);
}
#else
@@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = arch_freq_get_on_cpu(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 4a06c37b9cf1..0c13b0befd8a 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 83e40341583e..cf29681d01e0 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -19,10 +19,9 @@
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include "internal.h"
@@ -45,39 +44,29 @@ static DEFINE_MUTEX(domain_list_lock);
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
/*
- * Used to store the max resource name width and max resource data width
- * to display the schemata in a tabular format
- */
-int max_name_width, max_data_width;
-
-/*
* Global boolean for rdt_alloc which is true if any
* resource allocation is enabled.
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
-struct rdt_hw_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
.rid = RDT_RESOURCE_L3,
.name = "L3",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_L3),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -87,11 +76,9 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_L2,
.name = "L2",
- .cache_level = 2,
- .domains = domain_init(RDT_RESOURCE_L2),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -101,11 +88,9 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_MBA,
.name = "MB",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_MBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
[RDT_RESOURCE_SMBA] =
@@ -113,15 +98,29 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_SMBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -156,7 +155,6 @@ static inline void cache_alloc_hsw_probe(void)
return;
hw_res->num_closid = 4;
- r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
@@ -169,7 +167,7 @@ static inline void cache_alloc_hsw_probe(void)
bool is_mba_sc(struct rdt_resource *r)
{
if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
+ r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
/*
* The software controller support is only applicable to MBA resource.
@@ -202,7 +200,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r)
return false;
}
-static bool __get_mem_config_intel(struct rdt_resource *r)
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
@@ -212,7 +210,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
- r->default_ctrl = MAX_MBA_BW;
+ r->membw.max_bw = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
@@ -223,20 +221,18 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
return false;
r->membw.arch_needs_linear = false;
}
- r->data_width = 3;
if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- thread_throttle_mode_init();
r->alloc_capable = true;
return true;
}
-static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
u32 eax, ebx, ecx, edx, subleaf;
@@ -249,7 +245,7 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
hw_res->num_closid = edx + 1;
- r->default_ctrl = 1 << eax;
+ r->membw.max_bw = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -262,8 +258,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
- /* Max value is 2048, Data width should be 4 in decimal */
- r->data_width = 4;
r->alloc_capable = true;
@@ -276,14 +270,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
union cpuid_0x10_1_eax eax;
union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx;
+ u32 ebx, default_ctrl;
cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
- r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
- r->cache.shareable_bits = ebx & r->default_ctrl;
- r->data_width = (r->cache.cbm_len + 3) / 4;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
@@ -309,12 +302,11 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -331,46 +323,30 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return MAX_MBA_BW - bw;
pr_warn_once("Non Linear delay-bw map not supported but queried\n");
- return r->default_ctrl;
+ return MAX_MBA_BW;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_domain *d;
-
- list_for_each_entry(d, &r->domains, list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
- return d;
- }
-
- return NULL;
-}
-
u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
{
return resctrl_to_arch_res(r)->num_closid;
@@ -378,52 +354,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
-
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
-}
-
-/*
- * rdt_find_domain - Find a domain in a resource that matches input resource id
- *
- * Search resource r's domain list to find the resource id. If the resource
- * id is found in a domain, return the domain. Otherwise, if requested by
- * caller, return the first domain whose id is bigger than the input id.
- * The domain list is sorted by id in ascending order.
- */
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
-{
- struct rdt_domain *d;
- struct list_head *l;
-
- if (id < 0)
- return ERR_PTR(-ENODEV);
-
- list_for_each(l, &r->domains) {
- d = list_entry(l, struct rdt_domain, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
- return NULL;
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
@@ -437,21 +372,26 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
* For Memory Allocation: Set b/w requested to 100%
*/
for (i = 0; i < hw_res->num_closid; i++, dc++)
- *dc = r->default_ctrl;
+ *dc = resctrl_get_default_ctrl(r);
}
-static void domain_free(struct rdt_hw_domain *hw_dom)
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
{
kfree(hw_dom->arch_mbm_total);
kfree(hw_dom->arch_mbm_local);
- kfree(hw_dom->ctrl_val);
kfree(hw_dom);
}
-static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
u32 *dc;
@@ -463,9 +403,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
hw_dom->ctrl_val = dc;
setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
@@ -474,17 +416,17 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
* @num_rmid: The size of the MBM counter array
* @hw_dom: The domain that owns the allocated arrays
*/
-static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_total);
hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_total)
return -ENOMEM;
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*hw_dom->arch_mbm_local);
hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
if (!hw_dom->arch_mbm_local) {
@@ -497,37 +439,45 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
return 0;
}
-/*
- * domain_add_cpu - Add a cpu to a resource's domain list.
- *
- * If an existing domain in the resource r's domain list matches the cpu's
- * resource id, add the cpu in the domain.
- *
- * Otherwise, a new domain is allocated and inserted into the right position
- * in the domain list sorted by id in ascending order.
- *
- * The order in the domain list is visible to users when we print entries
- * in the schemata file and schemata input is validated to have the same order
- * as this list.
- */
-static void domain_add_cpu(int cpu, struct rdt_resource *r)
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
struct list_head *add_pos = NULL;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
int err;
lockdep_assert_held(&domain_list_lock);
- d = rdt_find_domain(r, id, &add_pos);
- if (IS_ERR(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
return;
}
- if (d) {
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
if (r->cache.arch_has_per_cpu_cfg)
rdt_domain_reconfigure_cdp(r);
return;
@@ -538,64 +488,187 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d = &hw_dom->d_resctrl;
- d->id = id;
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
rdt_domain_reconfigure_cdp(r);
- if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- domain_free(hw_dom);
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!d->ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
return;
}
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ arch_mon_domain_online(r, d);
- if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
- domain_free(hw_dom);
+ if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
return;
}
- list_add_tail_rcu(&d->list, add_pos);
+ list_add_tail_rcu(&d->hdr.list, add_pos);
- err = resctrl_online_domain(r, d);
+ err = resctrl_online_mon_domain(r, d);
if (err) {
- list_del_rcu(&d->list);
+ list_del_rcu(&d->hdr.list);
synchronize_rcu();
- domain_free(hw_dom);
+ mon_domain_free(hw_dom);
}
}
-static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
lockdep_assert_held(&domain_list_lock);
- d = rdt_find_domain(r, id, NULL);
- if (IS_ERR_OR_NULL(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
return;
}
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_clear_cpu(cpu, &d->cpu_mask);
- if (cpumask_empty(&d->cpu_mask)) {
- resctrl_offline_domain(r, d);
- list_del_rcu(&d->list);
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
synchronize_rcu();
/*
- * rdt_domain "d" is going to be freed below, so clear
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
* its pointer from pseudo_lock_region struct.
*/
if (d->plr)
d->plr->d = NULL;
- domain_free(hw_dom);
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
return;
}
}
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
static void clear_closid_rmid(int cpu)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
@@ -639,20 +712,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
return 0;
}
-/*
- * Choose a width for the resource name and resource data based on the
- * resource that has widest name and cbm.
- */
-static __init void rdt_init_padding(void)
-{
- struct rdt_resource *r;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (r->data_width > max_data_width)
- max_data_width = r->data_width;
- }
-}
-
enum {
RDT_FLAG_CMT,
RDT_FLAG_MBM_TOTAL,
@@ -738,6 +797,21 @@ bool __init rdt_cpu_has(int flag)
return ret;
}
+__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
static __init bool get_mem_config(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
@@ -821,18 +895,18 @@ static __init bool get_rdt_mon_resources(void)
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
@@ -934,7 +1008,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
}
}
-static int __init resctrl_late_init(void)
+static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
int state, ret;
@@ -950,8 +1024,6 @@ static int __init resctrl_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
- rdt_init_padding();
-
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
resctrl_arch_online_cpu,
@@ -959,7 +1031,7 @@ static int __init resctrl_late_init(void)
if (state < 0)
return state;
- ret = rdtgroup_init();
+ ret = resctrl_init();
if (ret) {
cpuhp_remove_state(state);
return ret;
@@ -975,18 +1047,13 @@ static int __init resctrl_late_init(void)
return 0;
}
-late_initcall(resctrl_late_init);
+late_initcall(resctrl_arch_late_init);
-static void __exit resctrl_exit(void)
+static void __exit resctrl_arch_exit(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
cpuhp_remove_state(rdt_online);
- rdtgroup_exit();
-
- if (r->mon_capable)
- rdt_put_mon_l3_config();
+ resctrl_exit();
}
-__exitcall(resctrl_exit);
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 7997b47743a2..0a0ac5f6112e 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -23,16 +23,25 @@
#include "internal.h"
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
+ struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d);
+
/*
* Check whether MBA bandwidth percentage value is correct. The value is
* checked against the minimum and max bandwidth values specified by the
* hardware. The allocated bandwidth percentage is rounded to the next
* control step available on the hardware.
*/
-static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
{
- unsigned long bw;
int ret;
+ u32 bw;
/*
* Only linear delay values is supported for current Intel SKUs.
@@ -42,16 +51,21 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return false;
}
- ret = kstrtoul(buf, 10, &bw);
+ ret = kstrtou32(buf, 10, &bw);
if (ret) {
- rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
+ rdt_last_cmd_printf("Invalid MB value %s\n", buf);
return false;
}
- if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
- !is_mba_sc(r)) {
- rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
- r->membw.min_bw, r->default_ctrl);
+ /* Nothing else to do if software controller is enabled. */
+ if (is_mba_sc(r)) {
+ *data = bw;
+ return true;
+ }
+
+ if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
+ rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
+ bw, r->membw.min_bw, r->membw.max_bw);
return false;
}
@@ -59,17 +73,17 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct resctrl_staged_config *cfg;
u32 closid = data->rdtgrp->closid;
struct rdt_resource *r = s->res;
- unsigned long bw_val;
+ u32 bw_val;
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -99,8 +113,9 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
*/
static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
{
- unsigned long first_bit, zero_bit, val;
+ u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit, val;
int ret;
ret = kstrtoul(buf, 16, &val);
@@ -109,7 +124,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return false;
}
- if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
@@ -138,8 +153,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
struct resctrl_staged_config *cfg;
@@ -148,7 +163,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -205,16 +220,29 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct rdtgroup *rdtgrp)
{
enum resctrl_conf_type t = s->conf_type;
+ ctrlval_parser_t *parse_ctrlval = NULL;
struct resctrl_staged_config *cfg;
struct rdt_resource *r = s->res;
struct rdt_parse_data data;
+ struct rdt_ctrl_domain *d;
char *dom = NULL, *id;
- struct rdt_domain *d;
unsigned long dom_id;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ parse_ctrlval = &parse_cbm;
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ parse_ctrlval = &parse_bw;
+ break;
+ }
+
+ if (WARN_ON_ONCE(!parse_ctrlval))
+ return -EINVAL;
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
@@ -231,11 +259,11 @@ next:
return -EINVAL;
}
dom = strim(dom);
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, s, d))
+ if (parse_ctrlval(&data, s, d))
return -EINVAL;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
cfg = &d->staged_config[t];
@@ -259,52 +287,24 @@ next:
return -EINVAL;
}
-static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
-{
- switch (type) {
- default:
- case CDP_NONE:
- return closid;
- case CDP_CODE:
- return closid * 2 + 1;
- case CDP_DATA:
- return closid * 2;
- }
-}
-
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
-
- if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- hw_dom->ctrl_val[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
-int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- u32 idx = get_config_index(closid, t);
+ u32 idx = resctrl_get_config_index(closid, t);
struct msr_param msr_param;
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
return -EINVAL;
hw_dom->ctrl_val[idx] = cfg_val;
msr_param.res = r;
+ msr_param.dom = d;
msr_param.low = idx;
msr_param.high = idx + 1;
- hw_res->msr_update(d, &msr_param, r);
+ hw_res->msr_update(&msr_param);
return 0;
}
@@ -312,51 +312,42 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
u32 idx;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- msr_param.res = NULL;
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
- idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask))
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- if (cpumask_empty(cpu_mask))
- goto done;
-
- /* Update resource control msr on all the CPUs. */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
@@ -454,11 +445,11 @@ out:
return ret ?: nbytes;
}
-u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- u32 idx = get_config_index(closid, type);
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
}
@@ -466,7 +457,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
{
struct rdt_resource *r = schema->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
bool sep = false;
u32 ctrl_val;
@@ -474,7 +465,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
lockdep_assert_cpus_held();
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
@@ -484,8 +475,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
ctrl_val = resctrl_arch_get_config(r, dom, closid,
schema->conf_type);
- seq_printf(s, r->format_str, dom->id, max_data_width,
- ctrl_val);
+ seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
sep = true;
}
seq_puts(s, "\n");
@@ -513,7 +503,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
} else {
seq_printf(s, "%s:%d=%x\n",
rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->id,
+ rdtgrp->plr->d->hdr.id,
rdtgrp->plr->cbm);
}
} else {
@@ -537,9 +527,101 @@ static int smp_mon_event_count(void *arg)
return 0;
}
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (!strcmp(buf, "mbm_local_bytes")) {
+ if (resctrl_arch_is_mbm_local_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else if (!strcmp(buf, "mbm_total_bytes")) {
+ if (resctrl_arch_is_mbm_total_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp) {
+ switch (rdtgrp->mba_mbps_event) {
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ seq_puts(s, "mbm_local_bytes\n");
+ break;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ seq_puts(s, "mbm_total_bytes\n");
+ break;
+ default:
+ pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
+ ret = -EINVAL;
+ break;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain_hdr *d;
+ struct list_head *l;
+
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first)
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first)
{
int cpu;
@@ -553,7 +635,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->evtid = evtid;
rr->r = r;
rr->d = d;
- rr->val = 0;
rr->first = first;
rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
if (IS_ERR(rr->arch_mon_ctx)) {
@@ -561,7 +642,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
return;
}
- cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
+ cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
/*
* cpumask_any_housekeeping() prefers housekeeping CPUs, but
@@ -570,7 +651,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
* counters on some platforms if its called in IRQ context.
*/
if (tick_nohz_full_cpu(cpu))
- smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+ smp_call_function_any(cpumask, mon_event_count, rr, 1);
else
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
@@ -580,12 +661,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
+ struct rdt_domain_hdr *hdr;
+ struct rmid_read rr = {0};
+ struct rdt_mon_domain *d;
u32 resid, evtid, domid;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
union mon_data_bits md;
- struct rdt_domain *d;
- struct rmid_read rr;
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
@@ -598,15 +680,40 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
resid = md.u.rid;
domid = md.u.domid;
evtid = md.u.evtid;
+ r = resctrl_arch_get_resource(resid);
- r = &rdt_resources_all[resid].r_resctrl;
- d = rdt_find_domain(r, domid, NULL);
- if (IS_ERR_OR_NULL(d)) {
+ if (md.u.sum) {
+ /*
+ * This file requires summing across all domains that share
+ * the L3 cache id that was provided in the "domid" field of the
+ * mon_data_bits union. Search all domains in the resource for
+ * one that matches this cache id.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->ci->id == domid) {
+ rr.ci = d->ci;
+ mon_event_read(&rr, r, NULL, rdtgrp,
+ &d->ci->shared_cpu_map, evtid, false);
+ goto checkresult;
+ }
+ }
ret = -ENOENT;
goto out;
+ } else {
+ /*
+ * This file provides data from a single domain. Search
+ * the resource to find the domain with "domid".
+ */
+ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
+ if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
}
- mon_event_read(&rr, r, d, rdtgrp, evtid, false);
+checkresult:
if (rr.err == -EIO)
seq_puts(m, "Error\n");
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1a8687f8073a..eaae99602b61 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -32,30 +32,6 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
-/* Reads to Local DRAM Memory */
-#define READS_TO_LOCAL_MEM BIT(0)
-
-/* Reads to Remote DRAM Memory */
-#define READS_TO_REMOTE_MEM BIT(1)
-
-/* Non-Temporal Writes to Local Memory */
-#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
-
-/* Non-Temporal Writes to Remote Memory */
-#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
-
-/* Reads to Local Memory the system identifies as "Slow Memory" */
-#define READS_TO_LOCAL_S_MEM BIT(4)
-
-/* Reads to Remote Memory the system identifies as "Slow Memory" */
-#define READS_TO_REMOTE_S_MEM BIT(5)
-
-/* Dirty Victims to All Types of Memory */
-#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
-
-/* Max event bits supported */
-#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
-
/**
* cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
* aren't marked nohz_full
@@ -127,35 +103,59 @@ struct mon_evt {
};
/**
- * union mon_data_bits - Monitoring details for each event file
+ * union mon_data_bits - Monitoring details for each event file.
* @priv: Used to store monitoring event data in @u
- * as kernfs private data
- * @rid: Resource id associated with the event file
- * @evtid: Event id associated with the event file
- * @domid: The domain to which the event file belongs
- * @u: Name of the bit fields struct
+ * as kernfs private data.
+ * @u.rid: Resource id associated with the event file.
+ * @u.evtid: Event id associated with the event file.
+ * @u.sum: Set when event must be summed across multiple
+ * domains.
+ * @u.domid: When @u.sum is zero this is the domain to which
+ * the event file belongs. When @sum is one this
+ * is the id of the L3 cache that all domains to be
+ * summed share.
+ * @u: Name of the bit fields struct.
*/
union mon_data_bits {
void *priv;
struct {
unsigned int rid : 10;
- enum resctrl_event_id evtid : 8;
+ enum resctrl_event_id evtid : 7;
+ unsigned int sum : 1;
unsigned int domid : 14;
} u;
};
+/**
+ * struct rmid_read - Data passed across smp_call*() to read event count.
+ * @rgrp: Resource group for which the counter is being read. If it is a parent
+ * resource group then its event count is summed with the count from all
+ * its child resource groups.
+ * @r: Resource describing the properties of the event being read.
+ * @d: Domain that the counter should be read from. If NULL then sum all
+ * domains in @r sharing L3 @ci.id
+ * @evtid: Which monitor event to read.
+ * @first: Initialize MBM counter when true.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @err: Error encountered when reading counter.
+ * @val: Returned value of event counter. If @rgrp is a parent resource group,
+ * @val includes the sum of event counts from its child resource groups.
+ * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
+ * (summed across child resource groups if @rgrp is a parent resource group).
+ * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
+ */
struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
+ struct cacheinfo *ci;
int err;
u64 val;
void *arch_mon_ctx;
};
-extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all;
extern bool resctrl_mounted;
@@ -209,43 +209,6 @@ struct mongroup {
};
/**
- * struct pseudo_lock_region - pseudo-lock region information
- * @s: Resctrl schema for the resource to which this
- * pseudo-locked region belongs
- * @d: RDT domain to which this pseudo-locked region
- * belongs
- * @cbm: bitmask of the pseudo-locked region
- * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
- * completion
- * @thread_done: variable used by waitqueue to test if pseudo-locking
- * thread completed
- * @cpu: core associated with the cache on which the setup code
- * will be run
- * @line_size: size of the cache lines
- * @size: size of pseudo-locked region in bytes
- * @kmem: the kernel memory associated with pseudo-locked region
- * @minor: minor number of character device associated with this
- * region
- * @debugfs_dir: pointer to this region's directory in the debugfs
- * filesystem
- * @pm_reqs: Power management QoS requests related to this region
- */
-struct pseudo_lock_region {
- struct resctrl_schema *s;
- struct rdt_domain *d;
- u32 cbm;
- wait_queue_head_t lock_thread_wq;
- int thread_done;
- int cpu;
- unsigned int line_size;
- unsigned int size;
- void *kmem;
- unsigned int minor;
- struct dentry *debugfs_dir;
- struct list_head pm_reqs;
-};
-
-/**
* struct rdtgroup - store rdtgroup's data in resctrl file system.
* @kn: kernfs node
* @rdtgroup_list: linked list for all rdtgroups
@@ -258,6 +221,7 @@ struct pseudo_lock_region {
* monitor only or ctrl_mon group
* @mon: mongroup related data
* @mode: mode of resource group
+ * @mba_mbps_event: input monitoring event id when mba_sc is enabled
* @plr: pseudo-locked region
*/
struct rdtgroup {
@@ -270,6 +234,7 @@ struct rdtgroup {
enum rdt_group_type type;
struct mongroup mon;
enum rdtgrp_mode mode;
+ enum resctrl_event_id mba_mbps_event;
struct pseudo_lock_region *plr;
};
@@ -299,10 +264,7 @@ struct rdtgroup {
/* List of all resource groups */
extern struct list_head rdt_all_groups;
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-void __exit rdtgroup_exit(void);
+extern int max_name_width;
/**
* struct rftype - describe each file in the resctrl file system
@@ -355,70 +317,57 @@ struct arch_mbm_state {
};
/**
- * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
- * a resource
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
* @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
* @arch_mbm_total: arch private state for MBM total bandwidth
* @arch_mbm_local: arch private state for MBM local bandwidth
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
-struct rdt_hw_domain {
- struct rdt_domain d_resctrl;
- u32 *ctrl_val;
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
struct arch_mbm_state *arch_mbm_total;
struct arch_mbm_state *arch_mbm_local;
};
-static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
{
- return container_of(r, struct rdt_hw_domain, d_resctrl);
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
}
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
u32 low;
u32 high;
};
-static inline bool is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
-static inline bool is_mbm_enabled(void)
-{
- return (is_mbm_total_enabled() || is_mbm_local_enabled());
-}
-
-static inline bool is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/**
* struct rdt_hw_resource - arch private attributes of a resctrl resource
* @r_resctrl: Attributes of the resource used directly by resctrl.
@@ -431,8 +380,6 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
- * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
- * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -443,11 +390,9 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
- unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -456,34 +401,17 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
-
extern struct mutex rdtgroup_mutex;
+static inline const char *rdt_kn_name(const struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
+}
+
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
extern struct dentry *debugfs_resctrl;
-
-enum resctrl_res_level {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
- RDT_RESOURCE_SMBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
-
- hw_res++;
- return &hw_res->r_resctrl;
-}
+extern enum resctrl_event_id mba_mbps_default_event;
static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
@@ -492,26 +420,7 @@ static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
-/*
- * To return the common struct rdt_resource, which is contained in struct
- * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
- */
-#define for_each_rdt_resource(r) \
- for (r = &rdt_resources_all[0].r_resctrl; \
- r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
- r = resctrl_inc(r))
-
-#define for_each_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable || r->mon_capable)
-
-#define for_each_alloc_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable)
-
-#define for_each_mon_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_capable)
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
@@ -557,55 +466,88 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(u32 closid);
void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
-void __exit rdt_put_mon_l3_config(void);
+void resctrl_mon_resource_exit(void);
bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first);
+int __init resctrl_mon_resource_init(void);
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
unsigned long delay_ms,
int exclude_cpu);
void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu);
void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_domain *d);
-void __check_limbo(struct rdt_domain *d, bool force_free);
+bool has_busy_rmid(struct rdt_mon_domain *d);
+void __check_limbo(struct rdt_mon_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void __init thread_throttle_mode_init(void);
-void __init mbm_config_rftype_init(const char *config);
+void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);
+#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+#else
+static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ return false;
+}
+
+static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+ return false;
+}
+
+static inline int rdt_pseudo_lock_init(void) { return 0; }
+static inline void rdt_pseudo_lock_release(void) { }
+static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
+#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c34a35ec0f03..a93ed7d2a160 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,6 +15,8 @@
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
+#define pr_fmt(fmt) "resctrl: " fmt
+
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
@@ -24,6 +26,7 @@
#include <asm/resctrl.h>
#include "internal.h"
+#include "trace.h"
/**
* struct rmid_entry - dirty tracking for all RMID.
@@ -96,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit;
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+static int snc_nodes_per_l3_cache = 1;
+
/*
* The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
@@ -184,7 +189,43 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
return entry;
}
-static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+}
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
u64 msr_val;
@@ -196,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
rdmsrl(MSR_IA32_QM_CTR, msr_val);
if (msr_val & RMID_VAL_ERROR)
@@ -208,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
return 0;
}
-static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
u32 rmid,
enum resctrl_event_id eventid)
{
@@ -227,19 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
return NULL;
}
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid,
enum resctrl_event_id eventid)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
+ u32 prmid;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
memset(am, 0, sizeof(*am));
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
/* Record any initial, non-zero count value. */
- __rmid_read(rmid, eventid, &am->prev_msr);
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
}
}
@@ -247,15 +291,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
* Assumes that hardware counters are also reset and thus that there is
* no need to record initial non-zero counts.
*/
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
memset(hw_dom->arch_mbm_local, 0,
sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
@@ -268,22 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
return chunks >> shift;
}
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid, enum resctrl_event_id eventid,
u64 *val, void *ignored)
{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
u64 msr_val, chunks;
+ u32 prmid;
int ret;
resctrl_arch_rmid_read_context_check();
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
- return -EINVAL;
-
- ret = __rmid_read(rmid, eventid, &msr_val);
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
if (ret)
return ret;
@@ -319,9 +363,9 @@ static void limbo_release_entry(struct rmid_entry *entry)
* decrement the count. If the busy count gets to zero on an RMID, we
* free the RMID
*/
-void __check_limbo(struct rdt_domain *d, bool force_free)
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
struct rmid_entry *entry;
u32 idx, cur_idx = 1;
@@ -354,6 +398,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
}
if (force_free || !rmid_dirty) {
@@ -367,7 +421,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}
-bool has_busy_rmid(struct rdt_domain *d)
+bool has_busy_rmid(struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
@@ -467,8 +521,8 @@ int alloc_rmid(u32 closid)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_domain *d;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
u32 idx;
lockdep_assert_held(&rdtgroup_mutex);
@@ -479,7 +533,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
entry->busy = 0;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
/*
* For the first limbo RMID in the domain,
* setup up the limbo worker.
@@ -508,19 +562,20 @@ void free_rmid(u32 closid, u32 rmid)
* allows architectures that ignore the closid parameter to avoid an
* unnecessary check.
*/
- if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ if (!resctrl_arch_mon_capable() ||
+ idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
RESCTRL_RESERVED_RMID))
return;
entry = __rmid_entry(idx);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
add_rmid_to_limbo(entry);
else
list_add_tail(&entry->list, &rmid_free_lru);
}
-static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
u32 rmid, enum resctrl_event_id evtid)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
@@ -537,7 +592,10 @@ static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
+ int cpu = smp_processor_id();
+ struct rdt_mon_domain *d;
struct mbm_state *m;
+ int err, ret;
u64 tval = 0;
if (rr->first) {
@@ -548,14 +606,47 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
return 0;
}
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
- &tval, rr->arch_mon_ctx);
- if (rr->err)
- return rr->err;
+ if (rr->d) {
+ /* Reading a single domain, must be on a CPU in that domain. */
+ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+ return -EINVAL;
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->err)
+ return rr->err;
- rr->val += tval;
+ rr->val += tval;
- return 0;
+ return 0;
+ }
+
+ /* Summing domains that share a cache, must be on a CPU for that cache. */
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+ return -EINVAL;
+
+ /*
+ * Legacy files must report the sum of an event across all
+ * domains that share the same L3 cache instance.
+ * Report success if a read from any domain succeeds, -EINVAL
+ * (translated to "Unavailable" for user space) if reading from
+ * all domains fail for any reason.
+ */
+ ret = -EINVAL;
+ list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+ if (d->ci->id != rr->ci->id)
+ continue;
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (!err) {
+ rr->val += tval;
+ ret = 0;
+ }
+ }
+
+ if (ret)
+ rr->err = ret;
+
+ return ret;
}
/*
@@ -572,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
- u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
- struct mbm_state *m = &rr->d->mbm_local[idx];
u64 cur_bw, bytes, cur_bytes;
+ struct mbm_state *m;
+
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+ if (WARN_ON_ONCE(!m))
+ return;
cur_bytes = rr->val;
bytes = cur_bytes - m->prev_bw_bytes;
@@ -624,6 +718,22 @@ void mon_event_count(void *info)
rr->err = 0;
}
+static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
/*
* Feedback loop for MBA software controller (mba_sc)
*
@@ -656,27 +766,27 @@ void mon_event_count(void *info)
* throttle MSRs already have low percentage values. To avoid
* unnecessarily restricting such rdtgroups, we also increase the bandwidth.
*/
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
+ struct rdt_ctrl_domain *dom_mba;
+ enum resctrl_event_id evt_id;
struct rdt_resource *r_mba;
- struct rdt_domain *dom_mba;
- u32 cur_bw, user_bw, idx;
struct list_head *head;
struct rdtgroup *entry;
+ u32 cur_bw, user_bw;
- if (!is_mbm_local_enabled())
- return;
-
- r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ evt_id = rgrp->mba_mbps_event;
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
- idx = resctrl_arch_rmid_idx_encode(closid, rmid);
- pmbm_data = &dom_mbm->mbm_local[idx];
+ pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
+ if (WARN_ON_ONCE(!pmbm_data))
+ return;
- dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+ dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {
pr_warn_once("Failure to get domain for MBA update\n");
return;
@@ -693,7 +803,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
*/
head = &rgrp->mon.crdtgrp_list;
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
+ if (WARN_ON_ONCE(!cmbm_data))
+ return;
cur_bw += cmbm_data->prev_bw;
}
@@ -722,55 +834,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
- u32 closid, u32 rmid)
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
- struct rmid_read rr;
+ struct rmid_read rr = {0};
- rr.first = false;
rr.r = r;
rr.d = d;
+ rr.evtid = evtid;
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
+
+ __mon_event_count(closid, rmid, &rr);
/*
- * This is protected from concurrent reads from user
- * as both the user and we hold the global mutex.
+ * If the software controller is enabled, compute the
+ * bandwidth for this event id.
*/
- if (is_mbm_total_enabled()) {
- rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
- rr.val = 0;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
- }
-
- __mon_event_count(closid, rmid, &rr);
+ if (is_mba_sc(NULL))
+ mbm_bw_count(closid, rmid, &rr);
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
- }
- if (is_mbm_local_enabled()) {
- rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
- rr.val = 0;
- rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
- if (IS_ERR(rr.arch_mon_ctx)) {
- pr_warn_ratelimited("Failed to allocate monitor context: %ld",
- PTR_ERR(rr.arch_mon_ctx));
- return;
- }
-
- __mon_event_count(closid, rmid, &rr);
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
- /*
- * Call the MBA software controller only for the
- * control groups and when user has enabled
- * the software controller explicitly.
- */
- if (is_mba_sc(NULL))
- mbm_bw_count(closid, rmid, &rr);
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid)
+{
+ /*
+ * This is protected from concurrent reads from user as both
+ * the user and overflow handler hold the global mutex.
+ */
+ if (resctrl_arch_is_mbm_total_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
- resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
- }
+ if (resctrl_arch_is_mbm_local_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}
/*
@@ -780,17 +882,17 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
void cqm_handle_limbo(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- d = container_of(work, struct rdt_domain, cqm_limbo.work);
+ d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
__check_limbo(d, false);
if (has_busy_rmid(d)) {
- d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
delay);
@@ -808,13 +910,13 @@ void cqm_handle_limbo(struct work_struct *work)
* @exclude_cpu: Which CPU the handler should not run on,
* RESCTRL_PICK_ANY_CPU to pick any CPU.
*/
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->cqm_work_cpu = cpu;
if (cpu < nr_cpu_ids)
@@ -825,9 +927,9 @@ void mbm_handle_overflow(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
struct rdtgroup *prgrp, *crgrp;
+ struct rdt_mon_domain *d;
struct list_head *head;
struct rdt_resource *r;
- struct rdt_domain *d;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
@@ -839,8 +941,8 @@ void mbm_handle_overflow(struct work_struct *work)
if (!resctrl_mounted || !resctrl_arch_mon_capable())
goto out_unlock;
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, mbm_over.work);
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
@@ -857,7 +959,7 @@ void mbm_handle_overflow(struct work_struct *work)
* Re-check for housekeeping CPUs. This allows the overflow handler to
* move off a nohz_full CPU quickly.
*/
- d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
@@ -874,7 +976,7 @@ out_unlock:
* @exclude_cpu: Which CPU the handler should not run on,
* RESCTRL_PICK_ANY_CPU to pick any CPU.
*/
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
@@ -886,7 +988,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
*/
if (!resctrl_mounted || !resctrl_arch_mon_capable())
return;
- cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->mbm_work_cpu = cpu;
if (cpu < nr_cpu_ids)
@@ -941,7 +1043,7 @@ static int dom_data_init(struct rdt_resource *r)
/*
* RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
* are always allocated. These are used for the rdtgroup_default
- * control group, which will be setup later in rdtgroup_init().
+ * control group, which will be setup later in resctrl_init().
*/
idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
RESCTRL_RESERVED_RMID);
@@ -954,10 +1056,13 @@ out_unlock:
return err;
}
-static void __exit dom_data_exit(void)
+static void dom_data_exit(struct rdt_resource *r)
{
mutex_lock(&rdtgroup_mutex);
+ if (!r->mon_capable)
+ goto out_unlock;
+
if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
kfree(closid_num_dirty_rmid);
closid_num_dirty_rmid = NULL;
@@ -966,6 +1071,7 @@ static void __exit dom_data_exit(void)
kfree(rmid_ptrs);
rmid_ptrs = NULL;
+out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
@@ -995,24 +1101,153 @@ static void l3_mon_evt_init(struct rdt_resource *r)
{
INIT_LIST_HEAD(&r->evt_list);
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (is_mbm_total_enabled())
+ if (resctrl_arch_is_mbm_total_enabled())
list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (is_mbm_local_enabled())
+ if (resctrl_arch_is_mbm_local_enabled())
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int __init resctrl_mon_resource_init(void)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ int ret;
+
+ if (!r->mon_capable)
+ return 0;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+ mbm_total_event.configurable = true;
+ resctrl_file_fflags_init("mbm_total_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+ mbm_local_event.configurable = true;
+ resctrl_file_fflags_init("mbm_local_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+
+ if (resctrl_arch_is_mbm_local_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else if (resctrl_arch_is_mbm_total_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+ return 0;
+}
+
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int threshold;
- int ret;
+
+ snc_nodes_per_l3_cache = snc_get_config();
resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
- hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
- r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
@@ -1036,37 +1271,24 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
*/
resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- ret = dom_data_init(r);
- if (ret)
- return ret;
-
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
u32 eax, ebx, ecx, edx;
/* Detect list of bandwidth sources that can be tracked */
cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
- hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
-
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
- mbm_total_event.configurable = true;
- mbm_config_rftype_init("mbm_total_bytes_config");
- }
- if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
- mbm_local_event.configurable = true;
- mbm_config_rftype_init("mbm_local_bytes_config");
- }
+ r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
}
- l3_mon_evt_init(r);
-
r->mon_capable = true;
return 0;
}
-void __exit rdt_put_mon_l3_config(void)
+void resctrl_mon_resource_exit(void)
{
- dom_data_exit();
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ dom_data_exit(r);
}
void __init intel_rdt_mbm_apply_quirk(void)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 884b88e25141..92ea1472bde9 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
@@ -23,7 +22,7 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
@@ -31,7 +30,7 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+#include "trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -53,7 +52,8 @@ static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
rdtgrp = dev_get_drvdata(dev);
if (mode)
*mode = 0600;
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+ guard(mutex)(&rdtgroup_mutex);
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
}
static const struct class pseudo_lock_class = {
@@ -62,7 +62,8 @@ static const struct class pseudo_lock_class = {
};
/**
- * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
* @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
@@ -76,20 +77,22 @@ static const struct class pseudo_lock_class = {
* in the SDM.
*
* When adding a platform here also add support for its cache events to
- * measure_cycles_perf_fn()
+ * resctrl_arch_measure_l*_residency()
*
* Return:
* If platform is supported, the bits to disable hardware prefetchers, 0
* if platform is not supported.
*/
-static u64 get_prefetch_disable_bits(void)
+u64 resctrl_arch_get_prefetch_disable_bits(void)
{
+ prefetch_disable_bits = 0;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -99,9 +102,10 @@ static u64 get_prefetch_disable_bits(void)
* 3 DCU IP Prefetcher Disable (R/W)
* 63:4 Reserved
*/
- return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -110,10 +114,11 @@ static u64 get_prefetch_disable_bits(void)
* 2 DCU Hardware Prefetcher Disable (R/W)
* 63:3 Reserved
*/
- return 0x5;
+ prefetch_disable_bits = 0x5;
+ break;
}
- return 0;
+ return prefetch_disable_bits;
}
/**
@@ -221,7 +226,7 @@ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
int cpu;
int ret;
- for_each_cpu(cpu, &plr->d->cpu_mask) {
+ for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
if (!pm_req) {
rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
@@ -292,12 +297,15 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
*/
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
- struct cpu_cacheinfo *ci;
+ enum resctrl_scope scope = plr->s->res->ctrl_scope;
+ struct cacheinfo *ci;
int ret;
- int i;
+
+ if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+ return -ENODEV;
/* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->cpu_mask);
+ plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(plr->cpu)) {
rdt_last_cmd_printf("CPU %u associated with cache not online\n",
@@ -306,15 +314,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
goto out_region;
}
- ci = get_cpu_cacheinfo(plr->cpu);
-
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == plr->s->res->cache_level) {
- plr->line_size = ci->info_list[i].coherency_line_size;
- return 0;
- }
+ ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+ if (ci) {
+ plr->line_size = ci->coherency_line_size;
+ plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+ return 0;
}
ret = -1;
@@ -410,8 +414,8 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
}
/**
- * pseudo_lock_fn - Load kernel memory into cache
- * @_rdtgrp: resource group to which pseudo-lock region belongs
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
*
* This is the core pseudo-locking flow.
*
@@ -428,10 +432,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int pseudo_lock_fn(void *_rdtgrp)
+int resctrl_arch_pseudo_lock_fn(void *_plr)
{
- struct rdtgroup *rdtgrp = _rdtgrp;
- struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct pseudo_lock_region *plr = _plr;
u32 rmid_p, closid_p;
unsigned long i;
u64 saved_msr;
@@ -461,7 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* increase likelihood that allocated cache portion will be filled
* with associated memory.
*/
- native_wbinvd();
+ wbinvd();
/*
* Always called with interrupts enabled. By disabling interrupts
@@ -491,7 +494,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -714,8 +718,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* Not knowing the bits to disable prefetching implies that this
* platform does not support Cache Pseudo-Locking.
*/
- prefetch_disable_bits = get_prefetch_disable_bits();
- if (prefetch_disable_bits == 0) {
+ if (resctrl_arch_get_prefetch_disable_bits() == 0) {
rdt_last_cmd_puts("Pseudo-locking not supported\n");
return -EINVAL;
}
@@ -810,7 +813,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
* Return: true if @cbm overlaps with pseudo-locked region on @d, false
* otherwise.
*/
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
{
unsigned int cbm_len;
unsigned long cbm_b;
@@ -837,11 +840,11 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm
* if it is not possible to test due to memory allocation issue,
* false otherwise.
*/
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
{
+ struct rdt_ctrl_domain *d_i;
cpumask_var_t cpu_with_psl;
struct rdt_resource *r;
- struct rdt_domain *d_i;
bool ret = false;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -855,10 +858,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* associated with them.
*/
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(d_i, &r->domains, list) {
+ list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
if (d_i->plr)
cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->cpu_mask);
+ &d_i->hdr.cpu_mask);
}
}
@@ -866,7 +869,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* Next test if new pseudo-locked region would intersect with
* existing region.
*/
- if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
+ if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
ret = true;
free_cpumask_var(cpu_with_psl);
@@ -874,7 +877,8 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
}
/**
- * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
* @_plr: pseudo-lock region to measure
*
* There is no deterministic way to test if a memory region is cached. One
@@ -887,7 +891,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int measure_cycles_lat_fn(void *_plr)
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
u32 saved_low, saved_high;
@@ -1071,7 +1075,7 @@ out:
return 0;
}
-static int measure_l2_residency(void *_plr)
+int resctrl_arch_measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1084,9 +1088,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1109,7 +1113,7 @@ out:
return 0;
}
-static int measure_l3_residency(void *_plr)
+int resctrl_arch_measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1123,8 +1127,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1142,7 +1146,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
@@ -1198,7 +1202,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
}
plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->cpu_mask);
+ cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(cpu)) {
ret = -ENODEV;
goto out;
@@ -1207,20 +1211,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;
if (sel == 1)
- thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
- thread = kthread_create_on_node(measure_l2_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
- thread = kthread_create_on_node(measure_l3_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
else
goto out;
@@ -1228,8 +1226,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
ret = PTR_ERR(thread);
goto out;
}
- kthread_bind(thread, cpu);
- wake_up_process(thread);
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
@@ -1303,6 +1299,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
struct task_struct *thread;
unsigned int new_minor;
struct device *dev;
+ char *kn_name __free(kfree) = NULL;
int ret;
ret = pseudo_lock_region_alloc(plr);
@@ -1314,21 +1311,22 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
ret = -EINVAL;
goto out_region;
}
+ kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
+ if (!kn_name) {
+ ret = -ENOMEM;
+ goto out_cstates;
+ }
plr->thread_done = 0;
- thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
- cpu_to_node(plr->cpu),
- "pseudo_lock/%u", plr->cpu);
+ thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
+ plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
goto out_cstates;
}
- kthread_bind(thread, plr->cpu);
- wake_up_process(thread);
-
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
if (ret < 0) {
@@ -1362,8 +1360,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
mutex_unlock(&rdtgroup_mutex);
if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
- plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
- debugfs_resctrl);
+ plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
if (!IS_ERR_OR_NULL(plr->debugfs_dir))
debugfs_create_file("pseudo_lock_measure", 0200,
plr->debugfs_dir, rdtgrp,
@@ -1372,7 +1369,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
dev = device_create(&pseudo_lock_class, NULL,
MKDEV(pseudo_lock_major, new_minor),
- rdtgrp, "%s", rdtgrp->kn->name);
+ rdtgrp, "%s", kn_name);
mutex_lock(&rdtgroup_mutex);
@@ -1528,7 +1525,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
- if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
@@ -1569,7 +1566,6 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
static const struct file_operations pseudo_lock_dev_fops = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = NULL,
.write = NULL,
.open = pseudo_lock_dev_open,
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 011e17efb1a6..cc4a54145c83 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -12,7 +12,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
@@ -58,6 +57,12 @@ static struct kernfs_node *kn_mongrp;
/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
static struct seq_buf last_cmd_status;
static char last_cmd_status_buf[512];
@@ -66,6 +71,15 @@ static void rdtgroup_destroy_root(void);
struct dentry *debugfs_resctrl;
+/*
+ * Memory bandwidth monitoring event to use for the default CTRL_MON group
+ * and each new CTRL_MON group created by the user. Only relevant when
+ * the filesystem is mounted with the "mba_MBps" option so it does not
+ * matter that it remains uninitialized on systems that do not support
+ * the "mba_MBps" option.
+ */
+enum resctrl_event_id mba_mbps_default_event;
+
static bool resctrl_debug;
void rdt_last_cmd_clear(void)
@@ -92,17 +106,29 @@ void rdt_last_cmd_printf(const char *fmt, ...)
void rdt_staged_configs_clear(void)
{
+ struct rdt_ctrl_domain *dom;
struct rdt_resource *r;
- struct rdt_domain *dom;
lockdep_assert_held(&rdtgroup_mutex);
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(dom, &r->domains, list)
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
memset(dom->staged_config, 0, sizeof(dom->staged_config));
}
}
+static bool resctrl_is_mbm_enabled(void)
+{
+ return (resctrl_arch_is_mbm_total_enabled() ||
+ resctrl_arch_is_mbm_local_enabled());
+}
+
+static bool resctrl_is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -149,7 +175,8 @@ static int closid_alloc(void)
lockdep_assert_held(&rdtgroup_mutex);
- if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+ resctrl_arch_is_llc_occupancy_enabled()) {
cleanest_closid = resctrl_find_cleanest_closid();
if (cleanest_closid < 0)
return cleanest_closid;
@@ -317,7 +344,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdt_last_cmd_puts("Cache domain offline\n");
ret = -ENODEV;
} else {
- mask = &rdtgrp->plr->d->cpu_mask;
+ mask = &rdtgrp->plr->d->hdr.cpu_mask;
seq_printf(s, is_cpu_list(of) ?
"%*pbl\n" : "%*pb\n",
cpumask_pr_args(mask));
@@ -340,13 +367,13 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
-static void update_cpu_closid_rmid(void *info)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
- struct rdtgroup *r = info;
+ struct resctrl_cpu_defaults *r = info;
if (r) {
this_cpu_write(pqr_state.default_closid, r->closid);
- this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
}
/*
@@ -361,11 +388,20 @@ static void update_cpu_closid_rmid(void *info)
* Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
*
* Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL.
*/
static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
- on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
+ struct resctrl_cpu_defaults defaults, *p = NULL;
+
+ if (r) {
+ defaults.closid = r->closid;
+ defaults.rmid = r->mon.rmid;
+ p = &defaults;
+ }
+
+ on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
}
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
@@ -908,14 +944,14 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
continue;
seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
- rdtg->kn->name);
+ rdt_kn_name(rdtg->kn));
seq_puts(s, "mon:");
list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
mon.crdtgrp_list) {
if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
crg->mon.rmid))
continue;
- seq_printf(s, "%s", crg->kn->name);
+ seq_printf(s, "%s", rdt_kn_name(crg->kn));
break;
}
seq_putc(s, '\n');
@@ -948,10 +984,20 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
return 0;
}
+static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+{
+ /*
+ * The parent pointer is only valid within RCU section since it can be
+ * replaced.
+ */
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
static int rdt_num_closids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
seq_printf(seq, "%u\n", s->num_closid);
return 0;
@@ -960,17 +1006,17 @@ static int rdt_num_closids_show(struct kernfs_open_file *of,
static int rdt_default_ctrl_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
- seq_printf(seq, "%x\n", r->default_ctrl);
+ seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
return 0;
}
static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
@@ -980,7 +1026,7 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
static int rdt_shareable_bits_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%x\n", r->cache.shareable_bits);
@@ -1004,7 +1050,7 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
/*
* Use unsigned long even though only 32 bits are used to ensure
* test_bit() is used safely.
@@ -1012,7 +1058,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
unsigned long sw_shareable = 0, hw_shareable = 0;
unsigned long exclusive = 0, pseudo_locked = 0;
struct rdt_resource *r = s->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
bool sep = false;
@@ -1021,12 +1067,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(seq, ';');
sw_shareable = 0;
exclusive = 0;
- seq_printf(seq, "%d=", dom->id);
+ seq_printf(seq, "%d=", dom->hdr.id);
for (i = 0; i < closids_supported(); i++) {
if (!closid_allocated(i))
continue;
@@ -1086,7 +1132,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
static int rdt_min_bw_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.min_bw);
@@ -1096,7 +1142,7 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
static int rdt_num_rmids_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
seq_printf(seq, "%d\n", r->num_rmid);
@@ -1106,7 +1152,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
static int rdt_mon_features_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
struct mon_evt *mevt;
list_for_each_entry(mevt, &r->evt_list, list) {
@@ -1121,7 +1167,7 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
static int rdt_bw_gran_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.bw_gran);
@@ -1131,7 +1177,7 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of,
static int rdt_delay_linear_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->membw.delay_linear);
@@ -1149,13 +1195,22 @@ static int max_threshold_occ_show(struct kernfs_open_file *of,
static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
- if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
+ switch (r->membw.throttle_mode) {
+ case THREAD_THROTTLE_PER_THREAD:
seq_puts(seq, "per-thread\n");
- else
+ return 0;
+ case THREAD_THROTTLE_MAX:
seq_puts(seq, "max\n");
+ return 0;
+ case THREAD_THROTTLE_UNDEFINED:
+ seq_puts(seq, "undefined\n");
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
return 0;
}
@@ -1214,7 +1269,7 @@ static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct resctrl_schema *s = of->kn->parent->priv;
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
struct rdt_resource *r = s->res;
seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
@@ -1243,7 +1298,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
*
* Return: false if CBM does not overlap, true if it does.
*/
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid,
enum resctrl_conf_type type, bool exclusive)
{
@@ -1298,7 +1353,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
*
* Return: true if CBM overlap detected, false if there is no overlap
*/
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -1329,10 +1384,10 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
+ struct rdt_ctrl_domain *d;
struct resctrl_schema *s;
struct rdt_resource *r;
bool has_cache = false;
- struct rdt_domain *d;
u32 ctrl;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -1343,7 +1398,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
continue;
has_cache = true;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
ctrl = resctrl_arch_get_config(r, d, closid,
s->conf_type);
if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
@@ -1417,7 +1472,8 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
goto out;
}
rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (!strcmp(buf, "pseudo-locksetup")) {
+ } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
+ !strcmp(buf, "pseudo-locksetup")) {
ret = rdtgroup_locksetup_enter(rdtgrp);
if (ret)
goto out;
@@ -1448,20 +1504,19 @@ out:
* bitmap functions work correctly.
*/
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_domain *d, unsigned long cbm)
+ struct rdt_ctrl_domain *d, unsigned long cbm)
{
- struct cpu_cacheinfo *ci;
unsigned int size = 0;
- int num_b, i;
+ struct cacheinfo *ci;
+ int num_b;
+
+ if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
+ return size;
num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == r->cache_level) {
- size = ci->info_list[i].size / r->cache.cbm_len * num_b;
- break;
- }
- }
+ ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
+ if (ci)
+ size = ci->size / r->cache.cbm_len * num_b;
return size;
}
@@ -1477,9 +1532,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
{
struct resctrl_schema *schema;
enum resctrl_conf_type type;
+ struct rdt_ctrl_domain *d;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
unsigned int size;
int ret = 0;
u32 closid;
@@ -1503,7 +1558,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
rdtgrp->plr->d,
rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
}
goto out;
}
@@ -1515,7 +1570,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
type = schema->conf_type;
sep = false;
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(s, ';');
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
@@ -1533,7 +1588,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
else
size = rdtgroup_cbm_to_size(r, d, ctrl);
}
- seq_printf(s, "%d=%u", d->id, size);
+ seq_printf(s, "%d=%u", d->hdr.id, size);
sep = true;
}
seq_putc(s, '\n');
@@ -1545,11 +1600,6 @@ out:
return ret;
}
-struct mon_config_info {
- u32 evtid;
- u32 mon_config;
-};
-
#define INVALID_CONFIG_INDEX UINT_MAX
/**
@@ -1574,46 +1624,49 @@ static inline unsigned int mon_event_config_index_get(u32 evtid)
}
}
-static void mon_event_config_read(void *info)
+void resctrl_arch_mon_event_config_read(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
u64 msrval;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
/* Report only the valid event configuration bits */
- mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
{
- smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+ smp_call_function_any(&mon_info->d->hdr.cpu_mask,
+ resctrl_arch_mon_event_config_read, mon_info, 1);
}
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
{
- struct mon_config_info mon_info = {0};
- struct rdt_domain *dom;
+ struct resctrl_mon_config_info mon_info;
+ struct rdt_mon_domain *dom;
bool sep = false;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
- memset(&mon_info, 0, sizeof(struct mon_config_info));
+ memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
+ mon_info.r = r;
+ mon_info.d = dom;
mon_info.evtid = evtid;
- mondata_config_read(dom, &mon_info);
+ mondata_config_read(&mon_info);
- seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
sep = true;
}
seq_puts(s, "\n");
@@ -1627,7 +1680,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
@@ -1637,37 +1690,39 @@ static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
return 0;
}
-static void mon_event_config_write(void *info)
+void resctrl_arch_mon_event_config_write(void *_config_info)
{
- struct mon_config_info *mon_info = info;
+ struct resctrl_mon_config_info *config_info = _config_info;
unsigned int index;
- index = mon_event_config_index_get(mon_info->evtid);
+ index = mon_event_config_index_get(config_info->evtid);
if (index == INVALID_CONFIG_INDEX) {
- pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
}
- wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+ wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0);
}
static void mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_domain *d, u32 evtid, u32 val)
+ struct rdt_mon_domain *d, u32 evtid, u32 val)
{
- struct mon_config_info mon_info = {0};
+ struct resctrl_mon_config_info mon_info = {0};
/*
* Read the current config value first. If both are the same then
* no need to write it again.
*/
+ mon_info.r = r;
+ mon_info.d = d;
mon_info.evtid = evtid;
- mondata_config_read(d, &mon_info);
+ mondata_config_read(&mon_info);
if (mon_info.mon_config == val)
return;
@@ -1679,7 +1734,7 @@ static void mbm_config_write_domain(struct rdt_resource *r,
* are scoped at the domain level. Writing any of these MSRs
* on one CPU is observed by all the CPUs in the domain.
*/
- smp_call_function_any(&d->cpu_mask, mon_event_config_write,
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
&mon_info, 1);
/*
@@ -1696,10 +1751,9 @@ static void mbm_config_write_domain(struct rdt_resource *r,
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
@@ -1723,14 +1777,14 @@ next:
}
/* Value from user cannot be more than the supported set of events */
- if ((val & hw_res->mbm_cfg_mask) != val) {
+ if ((val & r->mbm_cfg_mask) != val) {
rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
- hw_res->mbm_cfg_mask);
+ r->mbm_cfg_mask);
return -EINVAL;
}
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
mbm_config_write_domain(r, d, evtid, val);
goto next;
}
@@ -1743,7 +1797,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -1769,7 +1823,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- struct rdt_resource *r = of->kn->parent->priv;
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
int ret;
/* Valid input requires a trailing newline */
@@ -1944,6 +1998,13 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_CTRL_BASE,
},
{
+ .name = "mba_MBps_event",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mba_mbps_event_write,
+ .seq_show = rdtgroup_mba_mbps_event_show,
+ },
+ {
.name = "mode",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -2022,24 +2083,35 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}
-void __init thread_throttle_mode_init(void)
+static void thread_throttle_mode_init(void)
{
- struct rftype *rft;
+ enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ struct rdt_resource *r_mba, *r_smba;
+
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ if (r_mba->alloc_capable &&
+ r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_mba->membw.throttle_mode;
+
+ r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
+ if (r_smba->alloc_capable &&
+ r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_smba->membw.throttle_mode;
- rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
- if (!rft)
+ if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
return;
- rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
+ resctrl_file_fflags_init("thread_throttle_mode",
+ RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
}
-void __init mbm_config_rftype_init(const char *config)
+void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;
rft = rdtgroup_get_rftype_by_name(config);
if (rft)
- rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
+ rft->fflags = fflags;
}
/**
@@ -2161,6 +2233,20 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
return ret;
}
+static unsigned long fflags_from_resource(struct rdt_resource *r)
+{
+ switch (r->rid) {
+ case RDT_RESOURCE_L3:
+ case RDT_RESOURCE_L2:
+ return RFTYPE_RES_CACHE;
+ case RDT_RESOURCE_MBA:
+ case RDT_RESOURCE_SMBA:
+ return RFTYPE_RES_MB;
+ }
+
+ return WARN_ON_ONCE(1);
+}
+
static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
{
struct resctrl_schema *s;
@@ -2181,14 +2267,14 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
/* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- fflags = r->fflags | RFTYPE_CTRL_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
if (ret)
goto out_destroy;
}
for_each_mon_capable_rdt_resource(r) {
- fflags = r->fflags | RFTYPE_MON_INFO;
+ fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
if (ret)
@@ -2252,15 +2338,15 @@ static void l2_qos_cfg_update(void *arg)
static inline bool is_mba_linear(void)
{
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
+ return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
}
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
- struct rdt_domain *d;
int cpu;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -2277,14 +2363,14 @@ static int set_cache_qos_cfg(int level, bool enable)
return -ENOMEM;
r_l = &rdt_resources_all[level].r_resctrl;
- list_for_each_entry(d, &r_l->domains, list) {
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
if (r_l->cache.arch_has_per_cpu_cfg)
/* Pick all the CPUs in the domain instance */
- for_each_cpu(cpu, &d->cpu_mask)
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
cpumask_set_cpu(cpu, cpu_mask);
else
/* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
}
/* Update QOS_CFG MSR on all the CPUs in cpu_mask */
@@ -2310,10 +2396,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
u32 num_closid = resctrl_arch_get_num_closid(r);
- int cpu = cpumask_any(&d->cpu_mask);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
int i;
d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
@@ -2328,7 +2414,7 @@ static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
}
static void mba_sc_domain_destroy(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
kfree(d->mbps_val);
d->mbps_val = NULL;
@@ -2336,14 +2422,18 @@ static void mba_sc_domain_destroy(struct rdt_resource *r,
/*
* MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale.
+ * MBM is supported and MBA is in linear scale,
+ * and the MBM monitor scope is the same as MBA
+ * control scope.
*/
static bool supports_mba_mbps(void)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
- return (is_mbm_local_enabled() &&
- r->alloc_capable && is_mba_linear());
+ return (resctrl_is_mbm_enabled() &&
+ r->alloc_capable && is_mba_linear() &&
+ r->ctrl_scope == rmbm->mon_scope);
}
/*
@@ -2352,9 +2442,10 @@ static bool supports_mba_mbps(void)
*/
static int set_mba_sc(bool mba_sc)
{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
+ unsigned long fflags;
int i;
if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
@@ -2362,11 +2453,16 @@ static int set_mba_sc(bool mba_sc)
r->membw.mba_sc = mba_sc;
- list_for_each_entry(d, &r->domains, list) {
+ rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
for (i = 0; i < num_closid; i++)
d->mbps_val[i] = MBA_MAX_MBPS;
}
+ fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
+ resctrl_file_fflags_init("mba_MBps_event", fflags);
+
return 0;
}
@@ -2427,12 +2523,13 @@ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
* resource. "info" and its subdirectories don't
* have rdtgroup structures, so return NULL here.
*/
- if (kn == kn_info || kn->parent == kn_info)
+ if (kn == kn_info ||
+ rcu_access_pointer(kn->__parent) == kn_info)
return NULL;
else
return kn->priv;
} else {
- return kn->parent->priv;
+ return rdt_kn_parent_priv(kn);
}
}
@@ -2583,6 +2680,20 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type
if (cl > max_name_width)
max_name_width = cl;
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ s->fmt_str = "%d=%x";
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ s->fmt_str = "%d=%u";
+ break;
+ }
+
+ if (WARN_ON_ONCE(!s->fmt_str)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
INIT_LIST_HEAD(&s->list);
list_add(&s->list, &resctrl_schema_all);
@@ -2626,7 +2737,7 @@ static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
unsigned long flags = RFTYPE_CTRL_BASE;
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
struct rdt_resource *r;
int ret;
@@ -2699,9 +2810,9 @@ static int rdt_get_tree(struct fs_context *fc)
if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
resctrl_mounted = true;
- if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->domains, list)
+ if (resctrl_is_mbm_enabled()) {
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ list_for_each_entry(dom, &r->mon_domains, hdr.list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
}
@@ -2751,6 +2862,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
+ const char *msg;
int opt;
opt = fs_parse(fc, rdt_fs_parameters, param, &result);
@@ -2765,8 +2877,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
+ msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
if (!supports_mba_mbps())
- return -EINVAL;
+ return invalfc(fc, msg);
ctx->enable_mba_mbps = true;
return 0;
case Opt_debug:
@@ -2808,44 +2921,36 @@ static int rdt_init_fs_context(struct fs_context *fc)
return 0;
}
-static int reset_all_ctrls(struct rdt_resource *r)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int i;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
msr_param.res = r;
msr_param.low = 0;
msr_param.high = hw_res->num_closid;
/*
* Disable resource control for this resource by setting all
- * CBMs in all domains to the maximum mask value. Pick one CPU
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
* from each domain to update the MSRs below.
*/
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
- hw_dom->ctrl_val[i] = r->default_ctrl;
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /* Update CBM on all the CPUs in cpu_mask */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
- free_cpumask_var(cpu_mask);
-
- return 0;
+ return;
}
/*
@@ -2964,9 +3069,10 @@ static void rdt_kill_sb(struct super_block *sb)
rdt_disable_ctx();
- /*Put everything back to default values. */
+ /* Put everything back to default values. */
for_each_alloc_capable_rdt_resource(r)
- reset_all_ctrls(r);
+ resctrl_arch_reset_all_ctrls(r);
+
rmdir_all_sub();
rdt_pseudo_lock_release();
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
@@ -3010,62 +3116,126 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
return ret;
}
+static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_find_and_get(pkn, name);
+ if (!kn)
+ return;
+ kernfs_put(kn);
+
+ if (kn->dir.subdirs <= 1)
+ kernfs_remove(kn);
+ else
+ kernfs_remove_by_name(kn, subname);
+}
+
/*
* Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups with given domain id.
+ * and monitor groups for the given domain.
+ * Remove files and directories containing "sum" of domain data
+ * when last domain being summed is removed.
*/
static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- unsigned int dom_id)
+ struct rdt_mon_domain *d)
{
struct rdtgroup *prgrp, *crgrp;
+ char subname[32];
+ bool snc_mode;
char name[32];
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ if (snc_mode)
+ sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
+
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- sprintf(name, "mon_%s_%02d", r->name, dom_id);
- kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
}
}
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
+static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp,
+ bool do_sum)
{
+ struct rmid_read rr = {0};
union mon_data_bits priv;
- struct kernfs_node *kn;
struct mon_evt *mevt;
- struct rmid_read rr;
- char name[32];
int ret;
- sprintf(name, "mon_%s_%02d", r->name, d->id);
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- if (WARN_ON(list_empty(&r->evt_list))) {
- ret = -EPERM;
- goto out_destroy;
- }
+ if (WARN_ON(list_empty(&r->evt_list)))
+ return -EPERM;
priv.u.rid = r->rid;
- priv.u.domid = d->id;
+ priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
+ priv.u.sum = do_sum;
list_for_each_entry(mevt, &r->evt_list, list) {
priv.u.evtid = mevt->evtid;
ret = mon_addfile(kn, mevt->name, priv.priv);
if (ret)
+ return ret;
+
+ if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
+ }
+
+ return 0;
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ struct kernfs_node *kn, *ckn;
+ char name[32];
+ bool snc_mode;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ kn = kernfs_find_and_get(parent_kn, name);
+ if (kn) {
+ /*
+ * rdtgroup_mutex will prevent this directory from being
+ * removed. No need to keep this hold.
+ */
+ kernfs_put(kn);
+ } else {
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
goto out_destroy;
+ ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
+ if (ret)
+ goto out_destroy;
+ }
+
+ if (snc_mode) {
+ sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
+ ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(ckn)) {
+ ret = -EINVAL;
+ goto out_destroy;
+ }
- if (is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
+ ret = rdtgroup_kn_set_ugid(ckn);
+ if (ret)
+ goto out_destroy;
+
+ ret = mon_add_all_files(ckn, d, r, prgrp, false);
+ if (ret)
+ goto out_destroy;
}
+
kernfs_activate(kn);
return 0;
@@ -3079,7 +3249,7 @@ out_destroy:
* and "monitor" groups with given domain id.
*/
static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_mon_domain *d)
{
struct kernfs_node *parent_kn;
struct rdtgroup *prgrp, *crgrp;
@@ -3101,13 +3271,13 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
struct rdt_resource *r,
struct rdtgroup *prgrp)
{
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
int ret;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
return ret;
@@ -3206,7 +3376,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
* Set the RDT domain up to start off with all usable allocations. That is,
* all shareable and unused bits. All-zero CBM is invalid.
*/
-static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
+static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
u32 closid)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -3266,7 +3436,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
tmp_cbm = cfg->new_ctrl;
if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
+ rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
return -ENOSPC;
}
cfg->have_new_ctrl = true;
@@ -3286,10 +3456,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
{
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int ret;
- list_for_each_entry(d, &s->res->domains, list) {
+ list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
ret = __init_one_rdt_domain(d, s, closid);
if (ret < 0)
return ret;
@@ -3302,16 +3472,16 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (is_mba_sc(r)) {
d->mbps_val[closid] = MBA_MAX_MBPS;
continue;
}
cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = r->default_ctrl;
+ cfg->new_ctrl = resctrl_get_default_ctrl(r);
cfg->have_new_ctrl = true;
}
}
@@ -3383,6 +3553,22 @@ static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
free_rmid(rgrp->closid, rgrp->mon.rmid);
}
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ * This makes sure "mon_groups" directory always has a ctrl_mon group
+ * as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+ return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
+ strcmp(name, "mon_groups"));
+}
+
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
@@ -3398,6 +3584,15 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_unlock;
}
+ /*
+ * Check that the parent directory for a monitor group is a "mon_groups"
+ * directory.
+ */
+ if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
if (rtype == RDTMON_GROUP &&
(prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
@@ -3562,6 +3757,8 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
}
+ if (is_mba_sc(NULL))
+ rdtgrp->mba_mbps_event = mba_mbps_default_event;
}
goto out_unlock;
@@ -3579,22 +3776,6 @@ out_unlock:
return ret;
}
-/*
- * We allow creating mon groups only with in a directory called "mon_groups"
- * which is present in every ctrl_mon group. Check if this is a valid
- * "mon_groups" directory.
- *
- * 1. The directory should be named "mon_groups".
- * 2. The mon group itself should "not" be named "mon_groups".
- * This makes sure "mon_groups" directory always has a ctrl_mon group
- * as parent.
- */
-static bool is_mon_groups(struct kernfs_node *kn, const char *name)
-{
- return (!strcmp(kn->name, "mon_groups") &&
- strcmp(name, "mon_groups"));
-}
-
static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
umode_t mode)
{
@@ -3610,11 +3791,8 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
- /*
- * If RDT monitoring is supported and the parent directory is a valid
- * "mon_groups" directory, add a monitoring subdirectory.
- */
- if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
+ /* Else, attempt to add a monitoring subdirectory. */
+ if (resctrl_arch_mon_capable())
return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
@@ -3623,14 +3801,21 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the parent group */
rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
- /* Update per cpu rmid of the moved CPUs first */
+ /*
+ * Update per cpu closid/rmid of the moved CPUs first.
+ * Note: the closid will not change, but the arch code still needs it.
+ */
+ closid = prdtgrp->closid;
+ rmid = prdtgrp->mon.rmid;
for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
+
/*
* Update the MSR on moved CPUs and CPUs which have moved
* task running on them.
@@ -3663,6 +3848,7 @@ static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
+ u32 closid, rmid;
int cpu;
/* Give any tasks back to the default group */
@@ -3673,10 +3859,10 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
/* Update per cpu closid and rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask) {
- per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
- per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
- }
+ closid = rdtgroup_default.closid;
+ rmid = rdtgroup_default.mon.rmid;
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
/*
* Update the MSR on moved CPUs and CPUs which have moved
@@ -3698,9 +3884,18 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
return 0;
}
+static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
+{
+ /*
+ * Valid within the RCU section it was obtained or while rdtgroup_mutex
+ * is held.
+ */
+ return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
+}
+
static int rdtgroup_rmdir(struct kernfs_node *kn)
{
- struct kernfs_node *parent_kn = kn->parent;
+ struct kernfs_node *parent_kn;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
int ret = 0;
@@ -3713,6 +3908,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
ret = -EPERM;
goto out;
}
+ parent_kn = rdt_kn_parent(kn);
/*
* If the rdtgroup is a ctrl_mon group and parent directory
@@ -3730,7 +3926,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
}
} else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, kn->name)) {
+ is_mon_groups(parent_kn, rdt_kn_name(kn))) {
ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
} else {
ret = -EPERM;
@@ -3781,6 +3977,7 @@ static void mongrp_reparent(struct rdtgroup *rdtgrp,
static int rdtgroup_rename(struct kernfs_node *kn,
struct kernfs_node *new_parent, const char *new_name)
{
+ struct kernfs_node *kn_parent;
struct rdtgroup *new_prdtgrp;
struct rdtgroup *rdtgrp;
cpumask_var_t tmpmask;
@@ -3815,8 +4012,9 @@ static int rdtgroup_rename(struct kernfs_node *kn,
goto out;
}
- if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
- !is_mon_groups(kn->parent, kn->name)) {
+ kn_parent = rdt_kn_parent(kn);
+ if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
+ !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
rdt_last_cmd_puts("Source must be a MON group\n");
ret = -EPERM;
goto out;
@@ -3877,7 +4075,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
seq_puts(seq, ",cdpl2");
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
+ if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
seq_puts(seq, ",mba_MBps");
if (resctrl_debug)
@@ -3928,33 +4126,37 @@ static void __init rdtgroup_setup_default(void)
mutex_unlock(&rdtgroup_mutex);
}
-static void domain_destroy_mon_state(struct rdt_domain *d)
+static void domain_destroy_mon_state(struct rdt_mon_domain *d)
{
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
}
-void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
mutex_lock(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
- if (!r->mon_capable)
- goto out_unlock;
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ mutex_lock(&rdtgroup_mutex);
/*
* If resctrl is mounted, remove all the
* per domain monitor data directories.
*/
if (resctrl_mounted && resctrl_arch_mon_capable())
- rmdir_mondata_subdir_allrdtgrp(r, d->id);
+ rmdir_mondata_subdir_allrdtgrp(r, d);
- if (is_mbm_enabled())
+ if (resctrl_is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -3969,21 +4171,33 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
domain_destroy_mon_state(d);
-out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+/**
+ * domain_setup_mon_state() - Initialise domain monitoring structures.
+ * @r: The resource for the newly online domain.
+ * @d: The newly online domain.
+ *
+ * Allocate monitor resources that belong to this domain.
+ * Called when the first CPU of a domain comes online, regardless of whether
+ * the filesystem is mounted.
+ * During boot this may be called before global allocations have been made by
+ * resctrl_mon_resource_init().
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
- if (is_llc_occupancy_enabled()) {
+ if (resctrl_arch_is_llc_occupancy_enabled()) {
d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
- if (is_mbm_total_enabled()) {
+ if (resctrl_arch_is_mbm_total_enabled()) {
tsize = sizeof(*d->mbm_total);
d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_total) {
@@ -3991,7 +4205,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
return -ENOMEM;
}
}
- if (is_mbm_local_enabled()) {
+ if (resctrl_arch_is_mbm_local_enabled()) {
tsize = sizeof(*d->mbm_local);
d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_local) {
@@ -4004,7 +4218,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
-int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
int err = 0;
@@ -4013,23 +4227,30 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
/* RDT_RESOURCE_MBA is never mon_capable */
err = mba_sc_domain_allocate(r, d);
- goto out_unlock;
}
- if (!r->mon_capable)
- goto out_unlock;
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ int err;
+
+ mutex_lock(&rdtgroup_mutex);
err = domain_setup_mon_state(r, d);
if (err)
goto out_unlock;
- if (is_mbm_enabled()) {
+ if (resctrl_is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
}
- if (is_llc_occupancy_enabled())
+ if (resctrl_arch_is_llc_occupancy_enabled())
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
/*
@@ -4065,11 +4286,27 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
}
}
+static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
void resctrl_offline_cpu(unsigned int cpu)
{
- struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
struct rdtgroup *rdtgrp;
- struct rdt_domain *d;
mutex_lock(&rdtgroup_mutex);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
@@ -4082,14 +4319,14 @@ void resctrl_offline_cpu(unsigned int cpu)
if (!l3->mon_capable)
goto out_unlock;
- d = get_domain_from_cpu(cpu, l3);
+ d = get_mon_domain_from_cpu(cpu, l3);
if (d) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
mbm_setup_overflow_handler(d, 0, cpu);
}
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(d)) {
+ if (resctrl_arch_is_llc_occupancy_enabled() &&
+ cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
cancel_delayed_work(&d->cqm_limbo);
cqm_setup_limbo_handler(d, 0, cpu);
}
@@ -4100,14 +4337,14 @@ out_unlock:
}
/*
- * rdtgroup_init - rdtgroup initialization
+ * resctrl_init - resctrl filesystem initialization
*
* Setup resctrl file system including set up root, create mount point,
- * register rdtgroup filesystem, and initialize files under root directory.
+ * register resctrl filesystem, and initialize files under root directory.
*
* Return: 0 on success or -errno
*/
-int __init rdtgroup_init(void)
+int __init resctrl_init(void)
{
int ret = 0;
@@ -4116,10 +4353,18 @@ int __init rdtgroup_init(void)
rdtgroup_setup_default();
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ thread_throttle_mode_init();
+
+ ret = resctrl_mon_resource_init();
if (ret)
return ret;
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret) {
+ resctrl_mon_resource_exit();
+ return ret;
+ }
+
ret = register_filesystem(&rdt_fs_type);
if (ret)
goto cleanup_mountpoint;
@@ -4151,13 +4396,16 @@ int __init rdtgroup_init(void)
cleanup_mountpoint:
sysfs_remove_mount_point(fs_kobj, "resctrl");
+ resctrl_mon_resource_exit();
return ret;
}
-void __exit rdtgroup_exit(void)
+void __exit resctrl_exit(void)
{
debugfs_remove_recursive(debugfs_resctrl);
unregister_filesystem(&rdt_fs_type);
sysfs_remove_mount_point(fs_kobj, "resctrl");
+
+ resctrl_mon_resource_exit();
}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h
index 428ebbd4270b..2a506316b303 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RESCTRL_H
#include <linux/tracepoint.h>
@@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _TRACE_RESCTRL_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index af5aa2c754c2..16f3ca30626a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -24,33 +24,36 @@ struct cpuid_bit {
* levels are different and there is a separate entry for each.
*/
static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
- { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
- { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
- { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
- { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
- { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
- { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
- { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
- { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
- { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
- { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
- { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
- { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
- { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
- { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
- { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
- { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
- { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
- { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74..7f8d1e11dbee 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT
@@ -150,13 +150,15 @@ int __init sgx_drv_init(void)
u64 xfrm_mask;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
+ }
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
if (!(eax & 1)) {
- pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
}
@@ -173,8 +175,10 @@ int __init sgx_drv_init(void)
}
ret = misc_register(&sgx_dev_enclave);
- if (ret)
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
return ret;
+ }
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index b65ab214bdf5..776a20172867 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -64,6 +64,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..8ce352fc72ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,7 @@
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -474,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
@@ -628,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
if (!section->virt_addr)
return false;
- section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
if (!section->pages) {
memunmap(section->virt_addr);
return false;
@@ -731,7 +733,7 @@ out:
return 0;
}
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
@@ -846,6 +848,13 @@ static bool __init sgx_page_cache_init(void)
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
@@ -892,19 +901,15 @@ static struct miscdevice sgx_dev_provision = {
int sgx_set_attribute(unsigned long *allowed_attributes,
unsigned int attribute_fd)
{
- struct fd f = fdget(attribute_fd);
+ CLASS(fd, f)(attribute_fd);
- if (!f.file)
+ if (fd_empty(f))
return -EINVAL;
- if (f.file->f_op != &sgx_provision_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &sgx_provision_fops)
return -EINVAL;
- }
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
-
- fdput(f);
return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index d17c9b71eb4a..01456236a6dd 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -128,6 +128,9 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
static __init bool check_for_real_bsp(u32 apic_id)
{
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
/*
* There is no real good way to detect whether this a kdump()
* kernel, but except on the Voyager SMP monstrosity which is not
@@ -144,17 +147,61 @@ static __init bool check_for_real_bsp(u32 apic_id)
if (topo_info.real_bsp_apic_id != BAD_APICID)
return false;
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
if (apic_id == topo_info.boot_cpu_apic_id) {
- topo_info.real_bsp_apic_id = apic_id;
- return false;
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration is agreeing. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
}
- pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n",
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
+
pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
topo_info.real_bsp_apic_id = apic_id;
return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
}
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
@@ -381,8 +428,8 @@ void __init topology_apply_cmdline_limits_early(void)
{
unsigned int possible = nr_cpu_ids;
- /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */
- if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled)
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
possible = 1;
/* 'possible_cpus=N' */
@@ -396,7 +443,7 @@ void __init topology_apply_cmdline_limits_early(void)
static __init bool restrict_to_up(void)
{
- if (!smp_found_config || ioapic_is_disabled)
+ if (!smp_found_config)
return true;
/*
* XEN PV is special as it does not advertise the local APIC
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index a7aa6eff4ae5..03b3c9c3a45e 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -58,7 +58,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
tscan->amd_node_id = node_id;
}
-static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
{
struct {
// eax
@@ -84,9 +84,9 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
/*
* If leaf 0xb is available, then the domain shifts are set
- * already and nothing to do here.
+ * already and nothing to do here. Only valid for family >= 0x17.
*/
- if (!has_0xb) {
+ if (!has_topoext && tscan->c->x86 >= 0x17) {
/*
* Leaf 0x80000008 set the CORE domain shift already.
* Update the SMT domain, but do not propagate it.
@@ -119,7 +119,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
return true;
}
-static bool parse_fam10h_node_id(struct topo_scan *tscan)
+static void parse_fam10h_node_id(struct topo_scan *tscan)
{
union {
struct {
@@ -131,20 +131,20 @@ static bool parse_fam10h_node_id(struct topo_scan *tscan)
} nid;
if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
- return false;
+ return;
rdmsrl(MSR_FAM10H_NODE_ID, nid.msr);
store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
tscan->c->topo.llc_id = nid.node_id;
- return true;
}
static void legacy_set_llc(struct topo_scan *tscan)
{
unsigned int apicid = tscan->c->topo.initial_apicid;
- /* parse_8000_0008() set everything up except llc_id */
- tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
}
static void topoext_fixup(struct topo_scan *tscan)
@@ -169,28 +169,31 @@ static void topoext_fixup(struct topo_scan *tscan)
static void parse_topology_amd(struct topo_scan *tscan)
{
- bool has_0xb = false;
+ bool has_topoext = false;
/*
* If the extended topology leaf 0x8000_001e is available
- * try to get SMT and CORE shift from leaf 0xb first, then
- * try to get the CORE shift from leaf 0x8000_0008.
+ * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb first, then try to
+ * get the CORE shift from leaf 0x8000_0008.
*/
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
- has_0xb = cpu_parse_topology_ext(tscan);
+ has_topoext = cpu_parse_topology_ext(tscan);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
- if (!has_0xb && !parse_8000_0008(tscan))
+ if (!has_topoext && !parse_8000_0008(tscan))
return;
/* Prefer leaf 0x8000001e if available */
- if (parse_8000_001e(tscan, has_0xb))
+ if (parse_8000_001e(tscan, has_topoext))
return;
/* Try the NODEID MSR */
- if (parse_fam10h_node_id(tscan))
- return;
-
- legacy_set_llc(tscan);
+ parse_fam10h_node_id(tscan);
}
void cpu_parse_topology_amd(struct topo_scan *tscan)
@@ -198,6 +201,7 @@ void cpu_parse_topology_amd(struct topo_scan *tscan)
tscan->amd_nodes_per_pkg = 1;
topoext_fixup(tscan);
parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
if (tscan->amd_nodes_per_pkg > 1)
set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
index 9a6069e7133c..b5a5e1411469 100644
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -3,6 +3,7 @@
#include <xen/xen.h>
+#include <asm/intel-family.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/smp.h>
@@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
}
}
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
{
struct {
@@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early)
.cu_id = 0xff,
.llc_id = BAD_APICID,
.l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
};
struct cpuinfo_x86 *c = tscan->c;
struct {
@@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early)
case X86_VENDOR_INTEL:
if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
break;
case X86_VENDOR_HYGON:
if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
@@ -151,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
}
/* Package relative core ID */
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
index e477228cd5b2..467b0326bf1a 100644
--- a/arch/x86/kernel/cpu/topology_ext.c
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -13,7 +13,10 @@ enum topo_types {
CORE_TYPE = 2,
MAX_TYPE_0B = 3,
MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
DIE_TYPE = 5,
DIEGRP_TYPE = 6,
MAX_TYPE_1F = 7,
@@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
[DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
};
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
unsigned int *last_dom)
{
@@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
switch (leaf) {
case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
default: return false;
}
@@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan)
if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
return true;
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
/* Intel/AMD: Fall back to leaf 0xB if available */
return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 11f83d07925e..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/clocksource.h>
#include <linux/cpu.h>
+#include <linux/efi.h>
#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/div64.h>
@@ -41,80 +42,97 @@
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
-#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-
-#define VMWARE_CMD_GETVERSION 10
-#define VMWARE_CMD_GETHZ 45
-#define VMWARE_CMD_GETVCPU_INFO 68
-#define VMWARE_CMD_LEGACY_X2APIC 3
-#define VMWARE_CMD_VCPU_RESERVED 31
-#define VMWARE_CMD_STEALCLOCK 91
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
#define STEALCLOCK_NOT_AVAILABLE (-1)
#define STEALCLOCK_DISABLED 0
#define STEALCLOCK_ENABLED 1
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx), %%eax" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
- switch (vmware_hypercall_mode) { \
- case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
- VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
- VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- default: \
- VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
- break; \
- } \
- } while (0)
-
struct vmware_steal_time {
union {
- uint64_t clock; /* stolen time counter in units of vtsc */
+ u64 clock; /* stolen time counter in units of vtsc */
struct {
/* only for little-endian */
- uint32_t clock_low;
- uint32_t clock_high;
+ u32 clock_low;
+ u32 clock_high;
};
};
- uint64_t reserved[7];
+ u64 reserved[7];
};
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
+
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
@@ -166,21 +184,12 @@ static void __init vmware_cyc2ns_setup(void)
pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
}
-static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
{
- uint32_t result, info;
-
- asm volatile (VMWARE_HYPERCALL :
- "=a"(result),
- "=c"(info) :
- "a"(VMWARE_HYPERVISOR_MAGIC),
- "b"(0),
- "c"(VMWARE_CMD_STEALCLOCK),
- "d"(0),
- "S"(arg1),
- "D"(arg2) :
- "memory");
- return result;
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
}
static bool stealclock_enable(phys_addr_t pa)
@@ -215,15 +224,15 @@ static bool vmware_is_stealclock_available(void)
* Return:
* The steal clock reading in ns.
*/
-static uint64_t vmware_steal_clock(int cpu)
+static u64 vmware_steal_clock(int cpu)
{
struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
- uint64_t clock;
+ u64 clock;
if (IS_ENABLED(CONFIG_64BIT))
clock = READ_ONCE(steal->clock);
else {
- uint32_t initial_high, low, high;
+ u32 initial_high, low, high;
do {
initial_high = READ_ONCE(steal->clock_high);
@@ -235,7 +244,7 @@ static uint64_t vmware_steal_clock(int cpu)
high = READ_ONCE(steal->clock_high);
} while (initial_high != high);
- clock = ((uint64_t)high << 32) | low;
+ clock = ((u64)high << 32) | low;
}
return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
@@ -389,13 +398,13 @@ static void __init vmware_set_capabilities(void)
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
- uint64_t lpj, tsc_khz;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
- VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
if (ebx != UINT_MAX) {
- lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
do_div(tsc_khz, 1000);
WARN_ON(tsc_khz >> 32);
pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
@@ -421,6 +430,9 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
vmware_paravirt_ops_setup();
#ifdef CONFIG_X86_IO_APIC
@@ -446,7 +458,7 @@ static u8 __init vmware_select_hypercall(void)
* If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
* intentionally defaults to 0.
*/
-static uint32_t __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
@@ -474,11 +486,64 @@ static uint32_t __init vmware_platform(void)
/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
static bool __init vmware_legacy_x2apic_available(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
- (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
+}
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+/*
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
+ */
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,