From 6d8e294bf5f0e85c34e8b14b064e2965f53f38b0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 12:53:05 +0200
Subject: RAS/CEC: Fix pfn insertion

When inserting random PFNs for debugging the CEC through
(debugfs)/ras/cec/pfn, depending on the return value of pfn_set(),
multiple values get inserted per a single write.

That is because simple_attr_write() interprets a retval of 0 as
success and claims the whole input. However, pfn_set() returns the
cec_add_elem() value, which, if > 0 and smaller than the whole input
length, makes glibc continue issuing the write syscall until there's
input left:

  pfn_set
  simple_attr_write
  debugfs_attr_write
  full_proxy_write
  vfs_write
  ksys_write
  do_syscall_64
  entry_SYSCALL_64_after_hwframe

leading to those repeated calls.

Return 0 to fix that.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 673f8a128397..f5795adc5a6e 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -369,7 +369,9 @@ static int pfn_set(void *data, u64 val)
 {
 	*(u64 *)data = val;
 
-	return cec_add_elem(val);
+	cec_add_elem(val);
+
+	return 0;
 }
 
 DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
-- 
cgit v1.2.3


From de0e0624d86ff9fc512dedb297f8978698abf21a Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 14:06:37 +0200
Subject: RAS/CEC: Check count_threshold unconditionally

The count_threshold should be checked unconditionally, after insertion
too, so that a count_threshold value of 1 can cause an immediate
offlining. I.e., offline the page on the *first* error encountered.

Add comments to make it clear what cec_add_elem() does, while at it.

Reported-by: WANG Chao <chao.wang@ucloud.cn>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac@vger.kernel.org
Link: https://lkml.kernel.org/r/20190418034115.75954-3-chao.wang@ucloud.cn
---
 drivers/ras/cec.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f5795adc5a6e..73a975c26f9f 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -294,6 +294,7 @@ int cec_add_elem(u64 pfn)
 
 	ca->ces_entered++;
 
+	/* Array full, free the LRU slot. */
 	if (ca->n == MAX_ELEMS)
 		WARN_ON(!del_lru_elem_unlocked(ca));
 
@@ -306,24 +307,17 @@ int cec_add_elem(u64 pfn)
 			(void *)&ca->array[to],
 			(ca->n - to) * sizeof(u64));
 
-		ca->array[to] = (pfn << PAGE_SHIFT) |
-				(DECAY_MASK << COUNT_BITS) | 1;
-
+		ca->array[to] = pfn << PAGE_SHIFT;
 		ca->n++;
-
-		ret = 0;
-
-		goto decay;
 	}
 
-	count = COUNT(ca->array[to]);
-
-	if (count < count_threshold) {
-		ca->array[to] |= (DECAY_MASK << COUNT_BITS);
-		ca->array[to]++;
+	/* Add/refresh element generation and increment count */
+	ca->array[to] |= DECAY_MASK << COUNT_BITS;
+	ca->array[to]++;
 
-		ret = 0;
-	} else {
+	/* Check action threshold and soft-offline, if reached. */
+	count = COUNT(ca->array[to]);
+	if (count >= count_threshold) {
 		u64 pfn = ca->array[to] >> PAGE_SHIFT;
 
 		if (!pfn_valid(pfn)) {
@@ -338,15 +332,14 @@ int cec_add_elem(u64 pfn)
 		del_elem(ca, to);
 
 		/*
-		 * Return a >0 value to denote that we've reached the offlining
-		 * threshold.
+		 * Return a >0 value to callers, to denote that we've reached
+		 * the offlining threshold.
 		 */
 		ret = 1;
 
 		goto unlock;
 	}
 
-decay:
 	ca->decay_count++;
 
 	if (ca->decay_count >= CLEAN_ELEMS)
-- 
cgit v1.2.3


From 5cc6b16ea1313d05956b55e83a1f753c604282a8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 21:33:08 +0200
Subject: RAS/CEC: Do not set decay value on error

When the value requested doesn't match the allowed (min,max) range,
the @data buffer should not be modified with the invalid value because
reading "decay_interval" shows it otherwise as if the previous write
succeeded.

Move the data write after the check.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 73a975c26f9f..31868bd99e8d 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -371,17 +371,17 @@ DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
 
 static int decay_interval_set(void *data, u64 val)
 {
-	*(u64 *)data = val;
-
 	if (val < CEC_DECAY_MIN_INTERVAL)
 		return -EINVAL;
 
 	if (val > CEC_DECAY_MAX_INTERVAL)
 		return -EINVAL;
 
+	*(u64 *)data   = val;
 	decay_interval = val;
 
 	cec_mod_work(decay_interval);
+
 	return 0;
 }
 DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
-- 
cgit v1.2.3


From d0e375e8f26edd2e577e3afa9d952f91037cbd87 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 21:39:24 +0200
Subject: RAS/CEC: Fix potential memory leak

Free the array page if a failure is encountered while creating the
debugfs nodes.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 31868bd99e8d..f57e869dfea2 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -504,8 +504,10 @@ void __init cec_init(void)
 		return;
 	}
 
-	if (create_debugfs_nodes())
+	if (create_debugfs_nodes()) {
+		free_page((unsigned long)ce_arr.array);
 		return;
+	}
 
 	INIT_DELAYED_WORK(&cec_work, cec_work_fn);
 	schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);
-- 
cgit v1.2.3


From 9632a3299bb1897f01c6a485ff035b20e61d7ae1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 21 Apr 2019 21:41:45 +0200
Subject: RAS/CEC: Sanity-check array on every insertion

Check the elements order in the array after every insertion.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index f57e869dfea2..da5797c38051 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -276,11 +276,39 @@ static u64 __maybe_unused del_lru_elem(void)
 	return pfn;
 }
 
+static bool sanity_check(struct ce_array *ca)
+{
+	bool ret = false;
+	u64 prev = 0;
+	int i;
+
+	for (i = 0; i < ca->n; i++) {
+		u64 this = PFN(ca->array[i]);
+
+		if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n", prev, this))
+			ret = true;
+
+		prev = this;
+	}
+
+	if (!ret)
+		return ret;
+
+	pr_info("Sanity check dump:\n{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		u64 this = PFN(ca->array[i]);
+
+		pr_info(" %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
+	}
+	pr_info("}\n");
+
+	return ret;
+}
 
 int cec_add_elem(u64 pfn)
 {
 	struct ce_array *ca = &ce_arr;
-	unsigned int to;
+	unsigned int to = 0;
 	int count, ret = 0;
 
 	/*
@@ -345,6 +373,8 @@ int cec_add_elem(u64 pfn)
 	if (ca->decay_count >= CLEAN_ELEMS)
 		do_spring_cleaning(ca);
 
+	WARN_ON_ONCE(sanity_check(ca));
+
 unlock:
 	mutex_unlock(&ce_mutex);
 
@@ -402,7 +432,6 @@ DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%ll
 static int array_dump(struct seq_file *m, void *v)
 {
 	struct ce_array *ca = &ce_arr;
-	u64 prev = 0;
 	int i;
 
 	mutex_lock(&ce_mutex);
@@ -412,10 +441,6 @@ static int array_dump(struct seq_file *m, void *v)
 		u64 this = PFN(ca->array[i]);
 
 		seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
-
-		WARN_ON(prev > this);
-
-		prev = this;
 	}
 
 	seq_printf(m, "}\n");
-- 
cgit v1.2.3


From b8b5ca6600dec2a4f1e50ca9d3cf9e1d032870cd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 21:30:11 +0200
Subject: RAS/CEC: Rename count_threshold to action_threshold

... which is the better, more-fitting name anyway.

Tony:
 - make action_threshold u64 due to debugfs accessors expecting u64.
 - rename the remaining: s/count_threshold/action_threshold/g

Co-developed-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index da5797c38051..364f7e1a6bad 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -37,9 +37,9 @@
  * thus emulate an an LRU-like behavior when deleting elements to free up space
  * in the page.
  *
- * When an element reaches it's max count of count_threshold, we try to poison
- * it by assuming that errors triggered count_threshold times in a single page
- * are excessive and that page shouldn't be used anymore. count_threshold is
+ * When an element reaches it's max count of action_threshold, we try to poison
+ * it by assuming that errors triggered action_threshold times in a single page
+ * are excessive and that page shouldn't be used anymore. action_threshold is
  * initialized to COUNT_MASK which is the maximum.
  *
  * That error event entry causes cec_add_elem() to return !0 value and thus
@@ -122,7 +122,7 @@ static DEFINE_MUTEX(ce_mutex);
 static u64 dfs_pfn;
 
 /* Amount of errors after which we offline */
-static unsigned int count_threshold = COUNT_MASK;
+static u64 action_threshold = COUNT_MASK;
 
 /* Each element "decays" each decay_interval which is 24hrs by default. */
 #define CEC_DECAY_DEFAULT_INTERVAL	24 * 60 * 60	/* 24 hrs */
@@ -345,7 +345,7 @@ int cec_add_elem(u64 pfn)
 
 	/* Check action threshold and soft-offline, if reached. */
 	count = COUNT(ca->array[to]);
-	if (count >= count_threshold) {
+	if (count >= action_threshold) {
 		u64 pfn = ca->array[to] >> PAGE_SHIFT;
 
 		if (!pfn_valid(pfn)) {
@@ -416,18 +416,18 @@ static int decay_interval_set(void *data, u64 val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
 
-static int count_threshold_set(void *data, u64 val)
+static int action_threshold_set(void *data, u64 val)
 {
 	*(u64 *)data = val;
 
 	if (val > COUNT_MASK)
 		val = COUNT_MASK;
 
-	count_threshold = val;
+	action_threshold = val;
 
 	return 0;
 }
-DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
 
 static int array_dump(struct seq_file *m, void *v)
 {
@@ -453,7 +453,7 @@ static int array_dump(struct seq_file *m, void *v)
 	seq_printf(m, "Decay interval: %lld seconds\n", decay_interval);
 	seq_printf(m, "Decays: %lld\n", ca->decays_done);
 
-	seq_printf(m, "Action threshold: %d\n", count_threshold);
+	seq_printf(m, "Action threshold: %lld\n", action_threshold);
 
 	mutex_unlock(&ce_mutex);
 
@@ -502,10 +502,10 @@ static int __init create_debugfs_nodes(void)
 		goto err;
 	}
 
-	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
-				    &count_threshold, &count_threshold_ops);
+	count = debugfs_create_file("action_threshold", S_IRUSR | S_IWUSR, d,
+				    &action_threshold, &action_threshold_ops);
 	if (!count) {
-		pr_warn("Error creating count_threshold debugfs node!\n");
+		pr_warn("Error creating action_threshold debugfs node!\n");
 		goto err;
 	}
 
-- 
cgit v1.2.3


From f57518cd56e2919afbcef3839122a75e291c7f85 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 23:01:03 +0200
Subject: RAS/CEC: Dump the different array element sections

When dumping the array elements, print them in the following format:

  [ PFN | generation in binary | count ]

to be perfectly clear what all those sections are.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
---
 drivers/ras/cec.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 364f7e1a6bad..dc08c705b493 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -429,6 +429,8 @@ static int action_threshold_set(void *data, u64 val)
 }
 DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n");
 
+static const char * const bins[] = { "00", "01", "10", "11" };
+
 static int array_dump(struct seq_file *m, void *v)
 {
 	struct ce_array *ca = &ce_arr;
@@ -440,7 +442,8 @@ static int array_dump(struct seq_file *m, void *v)
 	for (i = 0; i < ca->n; i++) {
 		u64 this = PFN(ca->array[i]);
 
-		seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
+		seq_printf(m, " %3d: [%016llx|%s|%03llx]\n",
+			   i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i]));
 	}
 
 	seq_printf(m, "}\n");
-- 
cgit v1.2.3


From 60fd42d26cc7ec8847598da50ebf27e3c9647d7b Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Mon, 6 May 2019 13:13:22 +0200
Subject: RAS/CEC: Add CONFIG_RAS_CEC_DEBUG and move CEC debug features there

The pfn and array files in (debugfs)/ras/cec are intended for debugging
the CEC code itself. They are not needed on production systems, so the
default setting for this CONFIG option is "n".

 [ bp: Have it with less ifdeffery by using IS_ENABLED(). ]

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/ras/Kconfig | 10 ++++++++++
 drivers/ras/cec.c    | 26 ++++++++++++++------------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index a9c3db125222..9ad6842de4b4 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -11,3 +11,13 @@ config RAS_CEC
 
 	  Bear in mind that this is absolutely useless if your platform doesn't
 	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
+config RAS_CEC_DEBUG
+	bool "CEC debugging machinery"
+	default n
+	depends on RAS_CEC
+	help
+	  Add extra files to (debugfs)/ras/cec to test the correctable error
+	  collector feature. "pfn" is a writable file that allows user to
+	  simulate an error in a particular page frame. "array" is a read-only
+	  file that dumps out the current state of all pages logged so far.
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index dc08c705b493..0907dc6f4afe 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -486,18 +486,6 @@ static int __init create_debugfs_nodes(void)
 		return -1;
 	}
 
-	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
-	if (!pfn) {
-		pr_warn("Error creating pfn debugfs node!\n");
-		goto err;
-	}
-
-	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
-	if (!array) {
-		pr_warn("Error creating array debugfs node!\n");
-		goto err;
-	}
-
 	decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
 				    &decay_interval, &decay_interval_ops);
 	if (!decay) {
@@ -512,6 +500,20 @@ static int __init create_debugfs_nodes(void)
 		goto err;
 	}
 
+	if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG))
+		return 0;
+
+	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
+	if (!pfn) {
+		pr_warn("Error creating pfn debugfs node!\n");
+		goto err;
+	}
+
+	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
+	if (!array) {
+		pr_warn("Error creating array debugfs node!\n");
+		goto err;
+	}
 
 	return 0;
 
-- 
cgit v1.2.3


From 09afc797f3629f722df6a90ca6eb944013133c7a Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 20 Apr 2019 22:06:43 +0200
Subject: RAS/CEC: Add copyright

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/ras/cec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 0907dc6f4afe..5d545806d930 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -1,4 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017-2019 Borislav Petkov, SUSE Labs.
+ */
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/kernel.h>
-- 
cgit v1.2.3


From 95fdce6b24f3526c2bd1aad15978d238b79da6bd Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 7 Jun 2019 20:18:03 +0000
Subject: x86/MCE: Make struct mce_banks[] static

The struct mce_banks[] array is only used in mce/core.c so move its
definition there and make it static. Also, change the "init" field to
bool type.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "x86@kernel.org" <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190607201752.221446-2-Yazen.Ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/core.c     | 11 ++++++++++-
 arch/x86/kernel/cpu/mce/internal.h | 10 ----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 282916f3b8d8..55bdbedde0b8 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -65,7 +65,16 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
-struct mce_bank *mce_banks __read_mostly;
+#define ATTR_LEN               16
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank {
+	u64			ctl;			/* subevents to enable */
+	bool			init;			/* initialise bank? */
+	struct device_attribute	attr;			/* device attribute */
+	char			attrname[ATTR_LEN];	/* attribute name */
+};
+
+static struct mce_bank *mce_banks __read_mostly;
 struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index a34b55baa7aa..35b3e5c02c1c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -22,17 +22,8 @@ enum severity_level {
 
 extern struct blocking_notifier_head x86_mce_decoder_chain;
 
-#define ATTR_LEN		16
 #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
 
-/* One object for each MCE bank, shared by all CPUs */
-struct mce_bank {
-	u64			ctl;			/* subevents to enable */
-	unsigned char init;				/* initialise bank? */
-	struct device_attribute attr;			/* device attribute */
-	char			attrname[ATTR_LEN];	/* attribute name */
-};
-
 struct mce_evt_llist {
 	struct llist_node llnode;
 	struct mce mce;
@@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void);
 extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
-extern struct mce_bank *mce_banks;
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-- 
cgit v1.2.3


From b4914508f1fe0eca1cd011b6026ff762a1aa62d5 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 7 Jun 2019 20:18:04 +0000
Subject: x86/MCE: Make mce_banks a per-CPU array

Current AMD systems have unique MCA banks per logical CPU even though
the type of the banks may all align to the same bank number. Each CPU
will have control of a set of MCA banks in the hardware and these are
not shared with other CPUs.

For example, bank 0 may be the Load-Store Unit on every logical CPU, but
each bank 0 is a unique structure in the hardware. In other words, there
isn't a *single* Load-Store Unit at MCA bank 0 that all logical CPUs
share.

This idea extends even to non-core MCA banks. For example, CPU0 and CPU4
may see a Unified Memory Controller at bank 15, but each CPU is actually
seeing a unique hardware structure that is not shared with other CPUs.

Because the MCA banks are all unique hardware structures, it would be
good to control them in a more granular way. For example, if there is a
known issue with the Floating Point Unit on CPU5 and a user wishes to
disable an error type on the Floating Point Unit, then it would be good
to do this only for CPU5 rather than all CPUs.

Also, future AMD systems may have heterogeneous MCA banks. Meaning
the bank numbers may not necessarily represent the same types between
CPUs. For example, bank 20 visible to CPU0 may be a Unified Memory
Controller and bank 20 visible to CPU4 may be a Coherent Slave. So
granular control will be even more necessary should the user wish to
control specific MCA banks.

Split the device attributes from struct mce_bank leaving only the MCA
bank control fields.

Make struct mce_banks[] per_cpu in order to have more granular control
over individual MCA banks in the hardware.

Allocate the device attributes statically based on the maximum number of
MCA banks supported. The sysfs interface will use as many as needed per
CPU. Currently, this is set to mca_cfg.banks, but will be changed to a
per_cpu bank count in a future patch.

Allocate the MCA control bits statically. This is in order to avoid
locking warnings when memory is allocated during secondary CPUs' init
sequences.

Also, remove the now unnecessary return values from
__mcheck_cpu_mce_banks_init() and __mcheck_cpu_cap_init().

Redo the sysfs store/show functions to handle the per_cpu mce_banks[].

 [ bp: s/mce_banks_percpu/mce_banks_array/g ]

[ Locking issue reported by ]
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "x86@kernel.org" <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190607201752.221446-3-Yazen.Ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/core.c | 76 ++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 55bdbedde0b8..49fac95d036b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -65,16 +65,21 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
-#define ATTR_LEN               16
-/* One object for each MCE bank, shared by all CPUs */
 struct mce_bank {
 	u64			ctl;			/* subevents to enable */
 	bool			init;			/* initialise bank? */
+};
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+#define ATTR_LEN               16
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank_dev {
 	struct device_attribute	attr;			/* device attribute */
 	char			attrname[ATTR_LEN];	/* attribute name */
+	u8			bank;			/* bank number */
 };
+static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
 
-static struct mce_bank *mce_banks __read_mostly;
 struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
@@ -684,6 +689,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  */
 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	bool error_seen = false;
 	struct mce m;
 	int i;
@@ -1131,6 +1137,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
 			    unsigned long *toclear, unsigned long *valid_banks,
 			    int no_way_out, int *worst)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	struct mca_config *cfg = &mca_cfg;
 	int severity, i;
 
@@ -1472,27 +1479,23 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int __mcheck_cpu_mce_banks_init(void)
+static void __mcheck_cpu_mce_banks_init(void)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
-	mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
-	if (!mce_banks)
-		return -ENOMEM;
-
 	for (i = 0; i < MAX_NR_BANKS; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		b->ctl = -1ULL;
 		b->init = 1;
 	}
-	return 0;
 }
 
 /*
  * Initialize Machine Checks for a CPU.
  */
-static int __mcheck_cpu_cap_init(void)
+static void __mcheck_cpu_cap_init(void)
 {
 	u64 cap;
 	u8 b;
@@ -1505,11 +1508,7 @@ static int __mcheck_cpu_cap_init(void)
 
 	mca_cfg.banks = max(mca_cfg.banks, b);
 
-	if (!mce_banks) {
-		int err = __mcheck_cpu_mce_banks_init();
-		if (err)
-			return err;
-	}
+	__mcheck_cpu_mce_banks_init();
 
 	/* Use accurate RIP reporting if available. */
 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
@@ -1517,8 +1516,6 @@ static int __mcheck_cpu_cap_init(void)
 
 	if (cap & MCG_SER_P)
 		mca_cfg.ser = 1;
-
-	return 0;
 }
 
 static void __mcheck_cpu_init_generic(void)
@@ -1545,6 +1542,7 @@ static void __mcheck_cpu_init_generic(void)
 
 static void __mcheck_cpu_init_clear_banks(void)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
 	for (i = 0; i < mca_cfg.banks; i++) {
@@ -1588,6 +1586,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
 /* Add per CPU specific workarounds here */
 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	struct mca_config *cfg = &mca_cfg;
 
 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
@@ -1824,7 +1823,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	if (!mce_available(c))
 		return;
 
-	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
+	__mcheck_cpu_cap_init();
+
+	if (__mcheck_cpu_apply_quirks(c) < 0) {
 		mca_cfg.disabled = 1;
 		return;
 	}
@@ -1958,6 +1959,7 @@ int __init mcheck_init(void)
  */
 static void mce_disable_error_reporting(void)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
 	for (i = 0; i < mca_cfg.banks; i++) {
@@ -2060,26 +2062,41 @@ static struct bus_type mce_subsys = {
 
 DEFINE_PER_CPU(struct device *, mce_device);
 
-static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
+static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
 {
-	return container_of(attr, struct mce_bank, attr);
+	return container_of(attr, struct mce_bank_dev, attr);
 }
 
 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 			 char *buf)
 {
-	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
+	u8 bank = attr_to_bank(attr)->bank;
+	struct mce_bank *b;
+
+	if (bank >= mca_cfg.banks)
+		return -EINVAL;
+
+	b = &per_cpu(mce_banks_array, s->id)[bank];
+
+	return sprintf(buf, "%llx\n", b->ctl);
 }
 
 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
+	u8 bank = attr_to_bank(attr)->bank;
+	struct mce_bank *b;
 	u64 new;
 
 	if (kstrtou64(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	attr_to_bank(attr)->ctl = new;
+	if (bank >= mca_cfg.banks)
+		return -EINVAL;
+
+	b = &per_cpu(mce_banks_array, s->id)[bank];
+
+	b->ctl = new;
 	mce_restart();
 
 	return size;
@@ -2194,7 +2211,7 @@ static void mce_device_release(struct device *dev)
 	kfree(dev);
 }
 
-/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+/* Per CPU device init. All of the CPUs still share the same bank device: */
 static int mce_device_create(unsigned int cpu)
 {
 	struct device *dev;
@@ -2227,7 +2244,7 @@ static int mce_device_create(unsigned int cpu)
 			goto error;
 	}
 	for (j = 0; j < mca_cfg.banks; j++) {
-		err = device_create_file(dev, &mce_banks[j].attr);
+		err = device_create_file(dev, &mce_bank_devs[j].attr);
 		if (err)
 			goto error2;
 	}
@@ -2237,7 +2254,7 @@ static int mce_device_create(unsigned int cpu)
 	return 0;
 error2:
 	while (--j >= 0)
-		device_remove_file(dev, &mce_banks[j].attr);
+		device_remove_file(dev, &mce_bank_devs[j].attr);
 error:
 	while (--i >= 0)
 		device_remove_file(dev, mce_device_attrs[i]);
@@ -2259,7 +2276,7 @@ static void mce_device_remove(unsigned int cpu)
 		device_remove_file(dev, mce_device_attrs[i]);
 
 	for (i = 0; i < mca_cfg.banks; i++)
-		device_remove_file(dev, &mce_banks[i].attr);
+		device_remove_file(dev, &mce_bank_devs[i].attr);
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
@@ -2280,6 +2297,7 @@ static void mce_disable_cpu(void)
 
 static void mce_reenable_cpu(void)
 {
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
@@ -2337,10 +2355,12 @@ static __init void mce_init_banks(void)
 {
 	int i;
 
-	for (i = 0; i < mca_cfg.banks; i++) {
-		struct mce_bank *b = &mce_banks[i];
+	for (i = 0; i < MAX_NR_BANKS; i++) {
+		struct mce_bank_dev *b = &mce_bank_devs[i];
 		struct device_attribute *a = &b->attr;
 
+		b->bank = i;
+
 		sysfs_attr_init(&a->attr);
 		a->attr.name	= b->attrname;
 		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
-- 
cgit v1.2.3


From 95d057f54664f3c6e8f650faf5690b82b30a9e52 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 7 Jun 2019 20:18:04 +0000
Subject: x86/MCE/AMD: Don't cache block addresses on SMCA systems

On legacy systems, the addresses of the MCA_MISC* registers need to be
recursively discovered based on a Block Pointer field in the registers.

On Scalable MCA systems, the register space is fixed, and particular
addresses can be derived by regular offsets for bank and register type.
This fixed address space includes the MCA_MISC* registers.

MCA_MISC0 is always available for each MCA bank. MCA_MISC1 through
MCA_MISC4 are considered available if MCA_MISC0[BlkPtr]=1.

Cache the value of MCA_MISC0[BlkPtr] for each bank and per CPU. This
needs to be done only during init. The values should be saved per CPU
to accommodate heterogeneous SMCA systems.

Redo smca_get_block_address() to directly return the block addresses.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "x86@kernel.org" <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190607201752.221446-4-Yazen.Ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/amd.c | 73 ++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d904aafe6409..d4d6e4b7f9dc 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -101,11 +101,6 @@ static struct smca_bank_name smca_names[] = {
 	[SMCA_PCIE]	= { "pcie",		"PCI Express Unit" },
 };
 
-static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
-{
-	[0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
-};
-
 static const char *smca_get_name(enum smca_bank_types t)
 {
 	if (t >= N_SMCA_BANK_TYPES)
@@ -199,6 +194,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned int, bank_map);	/* see which banks are on */
 
+/* Map of banks that have more than MCA_MISC0 available. */
+static DEFINE_PER_CPU(u32, smca_misc_banks_map);
+
 static void amd_threshold_interrupt(void);
 static void amd_deferred_error_interrupt(void);
 
@@ -208,6 +206,28 @@ static void default_deferred_error_interrupt(void)
 }
 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
+static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
+{
+	u32 low, high;
+
+	/*
+	 * For SMCA enabled processors, BLKPTR field of the first MISC register
+	 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
+	 */
+	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+		return;
+
+	if (!(low & MCI_CONFIG_MCAX))
+		return;
+
+	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high))
+		return;
+
+	if (low & MASK_BLKPTR_LO)
+		per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
+
+}
+
 static void smca_configure(unsigned int bank, unsigned int cpu)
 {
 	unsigned int i, hwid_mcatype;
@@ -245,6 +265,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
 		wrmsr(smca_config, low, high);
 	}
 
+	smca_set_misc_banks_map(bank, cpu);
+
 	/* Return early if this bank was already initialized. */
 	if (smca_banks[bank].hwid)
 		return;
@@ -455,42 +477,21 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
 	wrmsr(MSR_CU_DEF_ERR, low, high);
 }
 
-static u32 smca_get_block_address(unsigned int bank, unsigned int block)
+static u32 smca_get_block_address(unsigned int bank, unsigned int block,
+				  unsigned int cpu)
 {
-	u32 low, high;
-	u32 addr = 0;
-
-	if (smca_get_bank_type(bank) == SMCA_RESERVED)
-		return addr;
-
 	if (!block)
 		return MSR_AMD64_SMCA_MCx_MISC(bank);
 
-	/* Check our cache first: */
-	if (smca_bank_addrs[bank][block] != -1)
-		return smca_bank_addrs[bank][block];
-
-	/*
-	 * For SMCA enabled processors, BLKPTR field of the first MISC register
-	 * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
-	 */
-	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
-		goto out;
-
-	if (!(low & MCI_CONFIG_MCAX))
-		goto out;
-
-	if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
-	    (low & MASK_BLKPTR_LO))
-		addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+	if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
+		return 0;
 
-out:
-	smca_bank_addrs[bank][block] = addr;
-	return addr;
+	return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
 }
 
 static u32 get_block_address(u32 current_addr, u32 low, u32 high,
-			     unsigned int bank, unsigned int block)
+			     unsigned int bank, unsigned int block,
+			     unsigned int cpu)
 {
 	u32 addr = 0, offset = 0;
 
@@ -498,7 +499,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
 		return addr;
 
 	if (mce_flags.smca)
-		return smca_get_block_address(bank, block);
+		return smca_get_block_address(bank, block, cpu);
 
 	/* Fall back to method we used for older processors: */
 	switch (block) {
@@ -637,7 +638,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 		disable_err_thresholding(c, bank);
 
 		for (block = 0; block < NR_BLOCKS; ++block) {
-			address = get_block_address(address, low, high, bank, block);
+			address = get_block_address(address, low, high, bank, block, cpu);
 			if (!address)
 				break;
 
@@ -1254,7 +1255,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 	if (err)
 		goto out_free;
 recurse:
-	address = get_block_address(address, low, high, bank, ++block);
+	address = get_block_address(address, low, high, bank, ++block, cpu);
 	if (!address)
 		return 0;
 
-- 
cgit v1.2.3


From c7d314f386e987be8b51eeb7dd947756ae23f6b6 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 7 Jun 2019 20:18:05 +0000
Subject: x86/MCE: Make the number of MCA banks a per-CPU variable

The number of MCA banks is provided per logical CPU. Historically, this
number has been the same across all CPUs, but this is not an
architectural guarantee. Future AMD systems may have MCA bank counts
that vary between logical CPUs in a system.

This issue was partially addressed in

  006c077041dc ("x86/mce: Handle varying MCA bank counts")

by allocating structures using the maximum number of MCA banks and by
saving the maximum MCA bank count in a system as the global count. This
means that some extra structures are allocated. Also, this means that
CPUs will spend more time in the #MC and other handlers checking extra
MCA banks.

Thus, define the number of MCA banks as a per-CPU variable.

 [ bp: Make mce_num_banks an unsigned int. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "x86@kernel.org" <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190607201752.221446-5-Yazen.Ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/amd.c      | 19 ++++++++--------
 arch/x86/kernel/cpu/mce/core.c     | 45 +++++++++++++++++++++-----------------
 arch/x86/kernel/cpu/mce/internal.h |  2 +-
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d4d6e4b7f9dc..fb5c935af2c5 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -495,7 +495,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
 {
 	u32 addr = 0, offset = 0;
 
-	if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+	if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
 		return addr;
 
 	if (mce_flags.smca)
@@ -627,11 +627,12 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
-	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block, cpu = smp_processor_id();
+	u32 low = 0, high = 0, address = 0;
 	int offset = -1;
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+
+	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
 		if (mce_flags.smca)
 			smca_configure(bank, cpu);
 
@@ -976,7 +977,7 @@ static void amd_deferred_error_interrupt(void)
 {
 	unsigned int bank;
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank)
+	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
 		log_error_deferred(bank);
 }
 
@@ -1017,7 +1018,7 @@ static void amd_threshold_interrupt(void)
 	struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
 	unsigned int bank, cpu = smp_processor_id();
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+	for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
 
@@ -1204,7 +1205,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 	u32 low, high;
 	int err;
 
-	if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+	if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
 		return 0;
 
 	if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
@@ -1438,7 +1439,7 @@ int mce_threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+	for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
 		threshold_remove_bank(cpu, bank);
@@ -1459,14 +1460,14 @@ int mce_threshold_create_device(unsigned int cpu)
 	if (bp)
 		return 0;
 
-	bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
+	bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *),
 		     GFP_KERNEL);
 	if (!bp)
 		return -ENOMEM;
 
 	per_cpu(threshold_banks, cpu) = bp;
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+	for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
 		err = threshold_create_bank(cpu, bank);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 49fac95d036b..10f9f140985e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -65,6 +65,8 @@ static DEFINE_MUTEX(mce_sysfs_mutex);
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
 struct mce_bank {
 	u64			ctl;			/* subevents to enable */
 	bool			init;			/* initialise bank? */
@@ -701,7 +703,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	if (flags & MCP_TIMESTAMP)
 		m.tsc = rdtsc();
 
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
 
@@ -803,7 +805,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 	char *tmp;
 	int i;
 
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		m->status = mce_rdmsrl(msr_ops.status(i));
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
@@ -1083,7 +1085,7 @@ static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
 
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		if (test_bit(i, toclear))
 			mce_wrmsrl(msr_ops.status(i), 0);
 	}
@@ -1141,7 +1143,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
 	struct mca_config *cfg = &mca_cfg;
 	int severity, i;
 
-	for (i = 0; i < cfg->banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		__clear_bit(i, toclear);
 		if (!test_bit(i, valid_banks))
 			continue;
@@ -1482,9 +1484,10 @@ EXPORT_SYMBOL_GPL(mce_notify_irq);
 static void __mcheck_cpu_mce_banks_init(void)
 {
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	u8 n_banks = this_cpu_read(mce_num_banks);
 	int i;
 
-	for (i = 0; i < MAX_NR_BANKS; i++) {
+	for (i = 0; i < n_banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		b->ctl = -1ULL;
@@ -1503,10 +1506,14 @@ static void __mcheck_cpu_cap_init(void)
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 
 	b = cap & MCG_BANKCNT_MASK;
-	if (WARN_ON_ONCE(b > MAX_NR_BANKS))
+
+	if (b > MAX_NR_BANKS) {
+		pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
+			smp_processor_id(), MAX_NR_BANKS, b);
 		b = MAX_NR_BANKS;
+	}
 
-	mca_cfg.banks = max(mca_cfg.banks, b);
+	this_cpu_write(mce_num_banks, b);
 
 	__mcheck_cpu_mce_banks_init();
 
@@ -1545,7 +1552,7 @@ static void __mcheck_cpu_init_clear_banks(void)
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (!b->init)
@@ -1596,7 +1603,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (c->x86 == 15 && cfg->banks > 4) {
+		if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
 			/*
 			 * disable GART TBL walk error reporting, which
 			 * trips off incorrectly with the IOMMU & 3ware
@@ -1615,7 +1622,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 * Various K7s with broken bank 0 around. Always disable
 		 * by default.
 		 */
-		if (c->x86 == 6 && cfg->banks > 0)
+		if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
 			mce_banks[0].ctl = 0;
 
 		/*
@@ -1637,7 +1644,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 * valid event later, merely don't write CTL0.
 		 */
 
-		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
+		if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
 			mce_banks[0].init = 0;
 
 		/*
@@ -1873,7 +1880,7 @@ static void __mce_disable_bank(void *arg)
 
 void mce_disable_bank(int bank)
 {
-	if (bank >= mca_cfg.banks) {
+	if (bank >= this_cpu_read(mce_num_banks)) {
 		pr_warn(FW_BUG
 			"Ignoring request to disable invalid MCA bank %d.\n",
 			bank);
@@ -1962,7 +1969,7 @@ static void mce_disable_error_reporting(void)
 	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 	int i;
 
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (b->init)
@@ -2073,7 +2080,7 @@ static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 	u8 bank = attr_to_bank(attr)->bank;
 	struct mce_bank *b;
 
-	if (bank >= mca_cfg.banks)
+	if (bank >= per_cpu(mce_num_banks, s->id))
 		return -EINVAL;
 
 	b = &per_cpu(mce_banks_array, s->id)[bank];
@@ -2091,7 +2098,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 	if (kstrtou64(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	if (bank >= mca_cfg.banks)
+	if (bank >= per_cpu(mce_num_banks, s->id))
 		return -EINVAL;
 
 	b = &per_cpu(mce_banks_array, s->id)[bank];
@@ -2243,7 +2250,7 @@ static int mce_device_create(unsigned int cpu)
 		if (err)
 			goto error;
 	}
-	for (j = 0; j < mca_cfg.banks; j++) {
+	for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
 		err = device_create_file(dev, &mce_bank_devs[j].attr);
 		if (err)
 			goto error2;
@@ -2275,7 +2282,7 @@ static void mce_device_remove(unsigned int cpu)
 	for (i = 0; mce_device_attrs[i]; i++)
 		device_remove_file(dev, mce_device_attrs[i]);
 
-	for (i = 0; i < mca_cfg.banks; i++)
+	for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
 		device_remove_file(dev, &mce_bank_devs[i].attr);
 
 	device_unregister(dev);
@@ -2305,7 +2312,7 @@ static void mce_reenable_cpu(void)
 
 	if (!cpuhp_tasks_frozen)
 		cmci_reenable();
-	for (i = 0; i < mca_cfg.banks; i++) {
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (b->init)
@@ -2493,8 +2500,6 @@ EXPORT_SYMBOL_GPL(mcsafe_key);
 
 static int __init mcheck_late_init(void)
 {
-	pr_info("Using %d MCE banks\n", mca_cfg.banks);
-
 	if (mca_cfg.recovery)
 		static_branch_inc(&mcsafe_key);
 
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 35b3e5c02c1c..43031db429d2 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -118,7 +118,6 @@ struct mca_config {
 	      bios_cmci_threshold	: 1,
 	      __reserved		: 59;
 
-	u8 banks;
 	s8 bootlog;
 	int tolerant;
 	int monarch_timeout;
@@ -127,6 +126,7 @@ struct mca_config {
 };
 
 extern struct mca_config mca_cfg;
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
 
 struct mce_vendor_flags {
 	/*
-- 
cgit v1.2.3


From 068b053dca0e2ab40b3d953b102a178654eec282 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Fri, 7 Jun 2019 20:18:06 +0000
Subject: x86/MCE: Determine MCA banks' init state properly

The OS is expected to write all bits to MCA_CTL for each bank,
thus enabling error reporting in all banks. However, some banks
may be unused in which case the registers for such banks are
Read-as-Zero/Writes-Ignored. Also, the OS may avoid setting some control
bits because of quirks, etc.

A bank can be considered uninitialized if the MCA_CTL register returns
zero. This is because either the OS did not write anything or because
the hardware is enforcing RAZ/WI for the bank.

Set a bank's init value based on if the control bits are set or not in
hardware. Return an error code in the sysfs interface for uninitialized
banks.

Do a final bank init check in a separate function which is not part of
any user-controlled code flows. This is so a user may enable/disable a
bank during runtime without having to restart their system.

 [ bp: Massage a bit. Discover bank init state at boot. ]

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "x86@kernel.org" <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190607201752.221446-6-Yazen.Ghannam@amd.com
---
 arch/x86/kernel/cpu/mce/core.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 10f9f140985e..c2c93e9195ed 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1490,6 +1490,11 @@ static void __mcheck_cpu_mce_banks_init(void)
 	for (i = 0; i < n_banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
+		/*
+		 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
+		 * the required vendor quirks before
+		 * __mcheck_cpu_init_clear_banks() does the final bank setup.
+		 */
 		b->ctl = -1ULL;
 		b->init = 1;
 	}
@@ -1562,6 +1567,33 @@ static void __mcheck_cpu_init_clear_banks(void)
 	}
 }
 
+/*
+ * Do a final check to see if there are any unused/RAZ banks.
+ *
+ * This must be done after the banks have been initialized and any quirks have
+ * been applied.
+ *
+ * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
+ * Otherwise, a user who disables a bank will not be able to re-enable it
+ * without a system reboot.
+ */
+static void __mcheck_cpu_check_banks(void)
+{
+	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+	u64 msrval;
+	int i;
+
+	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+		struct mce_bank *b = &mce_banks[i];
+
+		if (!b->init)
+			continue;
+
+		rdmsrl(msr_ops.ctl(i), msrval);
+		b->init = !!msrval;
+	}
+}
+
 /*
  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
@@ -1849,6 +1881,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_clear_banks();
+	__mcheck_cpu_check_banks();
 	__mcheck_cpu_setup_timer();
 }
 
@@ -2085,6 +2118,9 @@ static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 
 	b = &per_cpu(mce_banks_array, s->id)[bank];
 
+	if (!b->init)
+		return -ENODEV;
+
 	return sprintf(buf, "%llx\n", b->ctl);
 }
 
@@ -2103,6 +2139,9 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 
 	b = &per_cpu(mce_banks_array, s->id)[bank];
 
+	if (!b->init)
+		return -ENODEV;
+
 	b->ctl = new;
 	mce_restart();
 
-- 
cgit v1.2.3


From 6e4f929ea8b2097b0052f6674de839a3c9d477e9 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 12 Jun 2019 17:15:31 +0200
Subject: x86/mce: Do not check return value of debugfs_create functions

When calling debugfs functions, there is no need to ever check the
return value. The function can work or not, but the code logic should
never do something different based on this.

The only way this can fail is if:

 * debugfs superblock can not be pinned - something really went wrong with the
 vfs layer.
 * file is created with same name - the caller's fault.
 * new_inode() fails - happens if memory is exhausted.

so failing to clean up debugfs properly is the least of the system's
sproblems in uch a situation.

 [ bp: Extend commit message, remove unused err var in inject_init(). ]

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190612151531.GA16278@kroah.com
---
 arch/x86/kernel/cpu/mce/core.c     | 16 +++++-----------
 arch/x86/kernel/cpu/mce/inject.c   | 37 +++++--------------------------------
 arch/x86/kernel/cpu/mce/severity.c | 14 +++-----------
 3 files changed, 13 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c2c93e9195ed..066562a1ea20 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2516,22 +2516,16 @@ static int fake_panic_set(void *data, u64 val)
 DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
 			 "%llu\n");
 
-static int __init mcheck_debugfs_init(void)
+static void __init mcheck_debugfs_init(void)
 {
-	struct dentry *dmce, *ffake_panic;
+	struct dentry *dmce;
 
 	dmce = mce_get_debugfs_dir();
-	if (!dmce)
-		return -ENOMEM;
-	ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce,
-						 NULL, &fake_panic_fops);
-	if (!ffake_panic)
-		return -ENOMEM;
-
-	return 0;
+	debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
+				   &fake_panic_fops);
 }
 #else
-static int __init mcheck_debugfs_init(void) { return -EINVAL; }
+static void __init mcheck_debugfs_init(void) { }
 #endif
 
 DEFINE_STATIC_KEY_FALSE(mcsafe_key);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 5d108f70f315..1f30117b24ba 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -645,7 +645,6 @@ static const struct file_operations readme_fops = {
 
 static struct dfs_node {
 	char *name;
-	struct dentry *d;
 	const struct file_operations *fops;
 	umode_t perm;
 } dfs_fls[] = {
@@ -659,49 +658,23 @@ static struct dfs_node {
 	{ .name = "README",	.fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
 };
 
-static int __init debugfs_init(void)
+static void __init debugfs_init(void)
 {
 	unsigned int i;
 
 	dfs_inj = debugfs_create_dir("mce-inject", NULL);
-	if (!dfs_inj)
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
-		dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
-						    dfs_fls[i].perm,
-						    dfs_inj,
-						    &i_mce,
-						    dfs_fls[i].fops);
-
-		if (!dfs_fls[i].d)
-			goto err_dfs_add;
-	}
-
-	return 0;
-
-err_dfs_add:
-	while (i-- > 0)
-		debugfs_remove(dfs_fls[i].d);
 
-	debugfs_remove(dfs_inj);
-	dfs_inj = NULL;
-
-	return -ENODEV;
+	for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
+		debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
+				    &i_mce, dfs_fls[i].fops);
 }
 
 static int __init inject_init(void)
 {
-	int err;
-
 	if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
-	err = debugfs_init();
-	if (err) {
-		free_cpumask_var(mce_inject_cpumask);
-		return err;
-	}
+	debugfs_init();
 
 	register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
 	mce_register_injector_chain(&inject_nb);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 65201e180fe0..27fd6816e270 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -404,21 +404,13 @@ static const struct file_operations severities_coverage_fops = {
 
 static int __init severities_debugfs_init(void)
 {
-	struct dentry *dmce, *fsev;
+	struct dentry *dmce;
 
 	dmce = mce_get_debugfs_dir();
-	if (!dmce)
-		goto err_out;
-
-	fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
-				   &severities_coverage_fops);
-	if (!fsev)
-		goto err_out;
 
+	debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+			    &severities_coverage_fops);
 	return 0;
-
-err_out:
-	return -ENOMEM;
 }
 late_initcall(severities_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
-- 
cgit v1.2.3