diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 15:09:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 15:09:18 -0700 |
commit | aa35a4835e4f4c113c29bc7ea64cfecb951d51b8 (patch) | |
tree | 8b5b5bfbae79175497687e0488873d38af1b5e21 /arch/x86/kernel/cpu | |
parent | e5ce2f196fb9ab35fe18dcfd2bc17883db7bbe33 (diff) | |
parent | 4251566ebc1cf95ae26a1e5a24cdac1ac25e942f (diff) | |
download | linux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.tar.gz linux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.tar.bz2 linux-aa35a4835e4f4c113c29bc7ea64cfecb951d51b8.zip |
Merge tag 'ras_core_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov:
- Add initial support for RAS hardware found on AMD server GPUs (MI200).
Those GPUs and CPUs are connected together through the coherent
fabric and the GPU memory controllers report errors through x86's MCA
so EDAC needs to support them. The amd64_edac driver supports now HBM
(High Bandwidth Memory) and thus such heterogeneous memory controller
systems
- Other small cleanups and improvements
* tag 'ras_core_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
EDAC/amd64: Cache and use GPU node map
EDAC/amd64: Add support for AMD heterogeneous Family 19h Model 30h-3Fh
EDAC/amd64: Document heterogeneous system enumeration
x86/MCE/AMD, EDAC/mce_amd: Decode UMC_V2 ECC errors
x86/amd_nb: Re-sort and re-indent PCI defines
x86/amd_nb: Add MI200 PCI IDs
ras/debugfs: Fix error checking for debugfs_create_dir()
x86/MCE: Check a hw error's address to determine proper recovery action
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r-- | arch/x86/kernel/cpu/mce/amd.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 2 |
2 files changed, 5 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 0b971f974096..5e74610b39e7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -715,11 +715,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) bool amd_mce_is_memory_error(struct mce *m) { + enum smca_bank_types bank_type; /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; + bank_type = smca_get_bank_type(m->extcpu, m->bank); if (mce_flags.smca) - return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; + return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1050,7 +1052,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol if (bank_type >= N_SMCA_BANK_TYPES) return NULL; - if (b && bank_type == SMCA_UMC) { + if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; return NULL; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2eec60f50057..22dfcb2adcd7 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1533,7 +1533,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (kill_current_task) + if (!mce_usable_address(&m)) queue_task_work(&m, msg, kill_me_now); else queue_task_work(&m, msg, kill_me_maybe); |