summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorZhiquan Li <zhiquan1.li@intel.com>2023-10-26 08:39:03 +0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2024-02-23 08:42:02 +0100
commit5224b9db24f6bbb46a611fd33f899c740b95f3ee (patch)
treee05536b16c7beda565ec663827234f1f10ad5930 /arch
parentbeee482cc4c9a6b1dcffb2e190b4fd8782258678 (diff)
downloadlinux-stable-5224b9db24f6bbb46a611fd33f899c740b95f3ee.tar.gz
linux-stable-5224b9db24f6bbb46a611fd33f899c740b95f3ee.tar.bz2
linux-stable-5224b9db24f6bbb46a611fd33f899c740b95f3ee.zip
x86/mce: Mark fatal MCE's page as poison to avoid panic in the kdump kernel
[ Upstream commit 9f3b130048bfa2e44a8cfb1b616f826d9d5d8188 ] Memory errors don't happen very often, especially fatal ones. However, in large-scale scenarios such as data centers, that probability increases with the amount of machines present. When a fatal machine check happens, mce_panic() is called based on the severity grading of that error. The page containing the error is not marked as poison. However, when kexec is enabled, tools like makedumpfile understand when pages are marked as poison and do not touch them so as not to cause a fatal machine check exception again while dumping the previous kernel's memory. Therefore, mark the page containing the error as poisoned so that the kexec'ed kernel can avoid accessing the page. [ bp: Rewrite commit message and comment. ] Co-developed-by: Youquan Song <youquan.song@intel.com> Signed-off-by: Youquan Song <youquan.song@intel.com> Signed-off-by: Zhiquan Li <zhiquan1.li@intel.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com> Link: https://lore.kernel.org/r/20231014051754.3759099-1-zhiquan1.li@intel.com Signed-off-by: Sasha Levin <sashal@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/kernel/cpu/mce/core.c16
1 files changed, 16 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0b7c81389c50..18a6ed2afca0 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,6 +44,7 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -274,6 +275,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ struct page *p;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -329,6 +331,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->status & MCI_STATUS_ADDRV)) {
+ p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);