summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2024-03-18 11:48:07 +0800
committerAlex Deucher <alexander.deucher@amd.com>2024-04-26 17:22:41 -0400
commitf27defca68824e8e97218b8816249f258d3d5d32 (patch)
treef3f9fcea0a4d86208aba92baa75ba7b3cf64478c /drivers/gpu/drm/amd
parentb2aa6b108dd3bf081f0848f07ba74ad73ec635be (diff)
downloadlinux-stable-f27defca68824e8e97218b8816249f258d3d5d32.tar.gz
linux-stable-f27defca68824e8e97218b8816249f258d3d5d32.tar.bz2
linux-stable-f27defca68824e8e97218b8816249f258d3d5d32.zip
drm/amdgpu: umc v12_0 logs ecc errors
1. umc v12_0 logs ecc errors. 2. Reserve newly detected ecc error pages. 3. Add tag for bad pages, so that they can be retired later. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c67
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/umc_v12_0.c41
3 files changed, 113 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 7006a57277ef..0f2d765c4e2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -21,10 +21,13 @@
*
*/
+#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
+#define MAX_UMC_HASH_STRING_SIZE 256
+
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
@@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
status, ipid, addr);
return 0;
}
+
+static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
+{
+ uint64_t *addr_a = (uint64_t *)a;
+ uint64_t *addr_b = (uint64_t *)b;
+
+ if (*addr_a > *addr_b)
+ return 1;
+ else if (*addr_a < *addr_b)
+ return -1;
+ else
+ return 0;
+}
+
+/* Use string hash to avoid logging the same bad pages repeatedly */
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+ uint64_t *pfns, int len, uint64_t *val)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
+ int offset = 0, i = 0;
+ uint64_t hash_val;
+
+ if (!pfns || !len)
+ return -EINVAL;
+
+ sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
+
+ for (i = 0; i < len; i++)
+ offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
+
+ hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
+
+ *val = hash_val;
+
+ return 0;
+}
+
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+ struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_ecc_log_info *ecc_log;
+ int ret;
+
+ ecc_log = &con->umc_ecc_log;
+
+ mutex_lock(&ecc_log->lock);
+ ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
+ if (!ret) {
+ struct ras_err_pages *err_pages = &ecc_err->err_pages;
+ int i;
+
+ /* Reserve memory */
+ for (i = 0; i < err_pages->count; i++)
+ amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
+
+ radix_tree_tag_set(ecc_tree,
+ ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+ }
+ mutex_unlock(&ecc_log->lock);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 4f3834fa10a8..c83d24097c5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -52,6 +52,8 @@
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
+/* Page retirement tag */
+#define UMC_ECC_NEW_DETECTED_TAG 0x1
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
uint32_t umc_inst, uint32_t ch_inst, void *data);
@@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr);
-
+int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
+ uint64_t *pfns, int len, uint64_t *val);
+int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
+ struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 085dcfe16b5e..6c2b61ef5b57 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
uint16_t hwid, mcatype;
struct ta_ras_query_address_input addr_in;
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
- uint64_t err_addr;
+ uint64_t err_addr, hash_val = 0;
+ struct ras_ecc_err *ecc_err;
int count;
+ int ret;
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return 0;
}
+ ret = amdgpu_umc_build_pages_hash(adev,
+ page_pfn, count, &hash_val);
+ if (ret) {
+ dev_err(adev->dev, "Fail to build error pages hash\n");
+ return ret;
+ }
+
+ ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
+ if (!ecc_err)
+ return -ENOMEM;
+
+ ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
+ if (!ecc_err->err_pages.pfn) {
+ kfree(ecc_err);
+ return -ENOMEM;
+ }
+
+ memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
+ ecc_err->err_pages.count = count;
+
+ ecc_err->hash_index = hash_val;
+ ecc_err->status = status;
+ ecc_err->ipid = ipid;
+ ecc_err->addr = addr;
+
+ ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
+ if (ret) {
+ if (ret == -EEXIST)
+ con->umc_ecc_log.de_updated = true;
+ else
+ dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
+
+ kfree(ecc_err->err_pages.pfn);
+ kfree(ecc_err);
+ return ret;
+ }
+
con->umc_ecc_log.de_updated = true;
return 0;