summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-26 15:09:18 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-26 15:09:18 -0700
commitaa35a4835e4f4c113c29bc7ea64cfecb951d51b8 (patch)
tree8b5b5bfbae79175497687e0488873d38af1b5e21 /arch/x86/kernel/cpu
parente5ce2f196fb9ab35fe18dcfd2bc17883db7bbe33 (diff)
parent4251566ebc1cf95ae26a1e5a24cdac1ac25e942f (diff)
downloadlinux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.tar.gz
linux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.tar.bz2
linux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.zip
Merge tag 'ras_core_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov: - Add initial support for RAS hardware found on AMD server GPUs (MI200). Those GPUs and CPUs are connected together through the coherent fabric and the GPU memory controllers report errors through x86's MCA so EDAC needs to support them. The amd64_edac driver supports now HBM (High Bandwidth Memory) and thus such heterogeneous memory controller systems - Other small cleanups and improvements * tag 'ras_core_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: EDAC/amd64: Cache and use GPU node map EDAC/amd64: Add support for AMD heterogeneous Family 19h Model 30h-3Fh EDAC/amd64: Document heterogeneous system enumeration x86/MCE/AMD, EDAC/mce_amd: Decode UMC_V2 ECC errors x86/amd_nb: Re-sort and re-indent PCI defines x86/amd_nb: Add MI200 PCI IDs ras/debugfs: Fix error checking for debugfs_create_dir() x86/MCE: Check a hw error's address to determine proper recovery action
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c6
-rw-r--r--arch/x86/kernel/cpu/mce/core.c2
2 files changed, 5 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 0b971f974096..5e74610b39e7 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -715,11 +715,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
bool amd_mce_is_memory_error(struct mce *m)
{
+ enum smca_bank_types bank_type;
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
if (mce_flags.smca)
- return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
+ return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
@@ -1050,7 +1052,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
- if (b && bank_type == SMCA_UMC) {
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
return NULL;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2eec60f50057..22dfcb2adcd7 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1533,7 +1533,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (kill_current_task)
+ if (!mce_usable_address(&m))
queue_task_work(&m, msg, kill_me_now);
else
queue_task_work(&m, msg, kill_me_maybe);