diff options
author | Yang Wang <kevinyang.wang@amd.com> | 2023-11-02 14:17:04 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-11-09 17:01:51 -0500 |
commit | 07c1db70364671eea4e84befe43ac91941153a43 (patch) | |
tree | ed2fda67d3ae451796881b36d15656db86c4edca /drivers | |
parent | 0b1695710ab8be263a5c19f17240c6a44b4b0a3e (diff) | |
download | linux-stable-07c1db70364671eea4e84befe43ac91941153a43.tar.gz linux-stable-07c1db70364671eea4e84befe43ac91941153a43.tar.bz2 linux-stable-07c1db70364671eea4e84befe43ac91941153a43.zip |
drm/amdgpu: refine smu v13.0.6 mca dump driver
refine smu mca driver to support query ras error from pmfw path.
- correct gfx smu bank hwid (from mp5 to smu bank)
- retire unused callback function in amdgpu_mca_smu_funcs{}
- add new mca_bank_set{} structure to collect mca bank
- move enum mca_reg_idx into amdgpu_mca.h header
- add mca status register field decode macro
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 164 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 61 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 276 |
3 files changed, 319 insertions, 182 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 5a828c175e3a..3302f4a29387 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -143,6 +143,46 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) return 0; } +void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set) +{ + if (!mca_set) + return; + + memset(mca_set, 0, sizeof(*mca_set)); + INIT_LIST_HEAD(&mca_set->list); +} + +int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry) +{ + struct mca_bank_node *node; + + if (!entry) + return -EINVAL; + + node = kvzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + memcpy(&node->entry, entry, sizeof(*entry)); + + INIT_LIST_HEAD(&node->node); + list_add_tail(&node->node, &mca_set->list); + + mca_set->nr_entries++; + + return 0; +} + +void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set) +{ + struct mca_bank_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, &mca_set->list, node) { + list_del(&node->node); + kvfree(node); + } +} + void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) { struct amdgpu_mca *mca = &adev->mca; @@ -160,6 +200,58 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) return -EOPNOTSUPP; } +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(entry->regs); i++) + dev_dbg(adev->dev, "mca entry[%02d].regs[%02d]=0x%016llx\n", idx, i, entry->regs[i]); +} + +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) +{ + struct amdgpu_smuio_mcm_config_info mcm_info; + struct mca_bank_set mca_set; + struct mca_bank_node *node; + struct mca_bank_entry *entry; + uint32_t count; + int ret, i = 0; + + amdgpu_mca_bank_set_init(&mca_set); + + ret = amdgpu_mca_smu_get_mca_set(adev, blk, type, &mca_set); + if (ret) + goto out_mca_release; + + list_for_each_entry(node, &mca_set.list, node) { + entry = &node->entry; + + amdgpu_mca_smu_mca_bank_dump(adev, i++, entry); + + count = 0; + ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); + if (ret) + goto out_mca_release; + + if (!count) + continue; + + mcm_info.socket_id = entry->info.socket_id; + mcm_info.die_id = entry->info.aid; + + if (type == AMDGPU_MCA_ERROR_TYPE_UE) + amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count); + else + amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count); + } + +out_mca_release: + amdgpu_mca_bank_set_release(&mca_set); + + return ret; +} + + int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) { const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; @@ -173,17 +265,77 @@ int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_m return -EOPNOTSUPP; } -int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, uint32_t *count) +int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, uint32_t *total) { const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - if (!count) + struct mca_bank_set mca_set; + struct mca_bank_node *node; + struct mca_bank_entry *entry; + uint32_t count; + int ret; + + if (!total) return -EINVAL; - if (mca_funcs && mca_funcs->mca_get_error_count) - return mca_funcs->mca_get_error_count(adev, blk, type, count); + if (!mca_funcs) + return -EOPNOTSUPP; - return -EOPNOTSUPP; + if (!mca_funcs->mca_get_ras_mca_set || !mca_funcs->mca_get_valid_mca_count) + return -EOPNOTSUPP; + + amdgpu_mca_bank_set_init(&mca_set); + + ret = mca_funcs->mca_get_ras_mca_set(adev, blk, type, &mca_set); + if (ret) + goto err_mca_set_release; + + *total = 0; + list_for_each_entry(node, &mca_set.list, node) { + entry = &node->entry; + + count = 0; + ret = mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, &count); + if (ret) + goto err_mca_set_release; + + *total += count; + } + +err_mca_set_release: + amdgpu_mca_bank_set_release(&mca_set); + + return ret; +} + +int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + if (!count || !entry) + return -EINVAL; + + if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count) + return -EOPNOTSUPP; + + + return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count); +} + +int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + + if (!mca_set) + return -EINVAL; + + if (!mca_funcs || !mca_funcs->mca_get_ras_mca_set) + return -EOPNOTSUPP; + + WARN_ON(!list_empty(&mca_set->list)); + + return mca_funcs->mca_get_ras_mca_set(adev, blk, type, mca_set); } int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index 28ad463cf5c9..45f90edf2283 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -25,6 +25,27 @@ #define MCA_MAX_REGS_COUNT (16) +#define MCA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) +#define MCA_REG__STATUS__VAL(x) MCA_REG_FIELD(x, 63, 63) +#define MCA_REG__STATUS__OVERFLOW(x) MCA_REG_FIELD(x, 62, 62) +#define MCA_REG__STATUS__UC(x) MCA_REG_FIELD(x, 61, 61) +#define MCA_REG__STATUS__EN(x) MCA_REG_FIELD(x, 60, 60) +#define MCA_REG__STATUS__MISCV(x) MCA_REG_FIELD(x, 59, 59) +#define MCA_REG__STATUS__ADDRV(x) MCA_REG_FIELD(x, 58, 58) +#define MCA_REG__STATUS__PCC(x) MCA_REG_FIELD(x, 57, 57) +#define MCA_REG__STATUS__ERRCOREIDVAL(x) MCA_REG_FIELD(x, 56, 56) +#define MCA_REG__STATUS__TCC(x) MCA_REG_FIELD(x, 55, 55) +#define MCA_REG__STATUS__SYNDV(x) MCA_REG_FIELD(x, 53, 53) +#define MCA_REG__STATUS__CECC(x) MCA_REG_FIELD(x, 46, 46) +#define MCA_REG__STATUS__UECC(x) MCA_REG_FIELD(x, 45, 45) +#define MCA_REG__STATUS__DEFERRED(x) MCA_REG_FIELD(x, 44, 44) +#define MCA_REG__STATUS__POISON(x) MCA_REG_FIELD(x, 43, 43) +#define MCA_REG__STATUS__SCRUB(x) MCA_REG_FIELD(x, 40, 40) +#define MCA_REG__STATUS__ERRCOREID(x) MCA_REG_FIELD(x, 37, 32) +#define MCA_REG__STATUS__ADDRLSB(x) MCA_REG_FIELD(x, 29, 24) +#define MCA_REG__STATUS__ERRORCODEEXT(x) MCA_REG_FIELD(x, 21, 16) +#define MCA_REG__STATUS__ERRORCODE(x) MCA_REG_FIELD(x, 15, 0) + enum amdgpu_mca_ip { AMDGPU_MCA_IP_UNKNOW = -1, AMDGPU_MCA_IP_PSP = 0, @@ -57,6 +78,17 @@ struct amdgpu_mca { const struct amdgpu_mca_smu_funcs *mca_funcs; }; +enum mca_reg_idx { + MCA_REG_IDX_CONTROL = 0, + MCA_REG_IDX_STATUS = 1, + MCA_REG_IDX_ADDR = 2, + MCA_REG_IDX_MISC0 = 3, + MCA_REG_IDX_CONFIG = 4, + MCA_REG_IDX_IPID = 5, + MCA_REG_IDX_SYND = 6, + MCA_REG_IDX_COUNT = 16, +}; + struct mca_bank_info { int socket_id; int aid; @@ -72,18 +104,28 @@ struct mca_bank_entry { uint64_t regs[MCA_MAX_REGS_COUNT]; }; +struct mca_bank_node { + struct mca_bank_entry entry; + struct list_head node; +}; + +struct mca_bank_set { + int nr_entries; + struct list_head list; +}; + struct amdgpu_mca_smu_funcs { int max_ue_count; int max_ce_count; int (*mca_set_debug_mode)(struct amdgpu_device *adev, bool enable); - int (*mca_get_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, uint32_t *count); + int (*mca_get_ras_mca_set)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct mca_bank_set *mca_set); + int (*mca_parse_mca_error_count)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct mca_bank_entry *entry, uint32_t *count); int (*mca_get_valid_mca_count)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count); int (*mca_get_mca_entry)(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry); - int (*mca_get_ras_mca_idx_array)(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size); }; void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, @@ -107,11 +149,22 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs); int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable); int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count); +int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, uint32_t *total); int amdgpu_mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, uint32_t *count); +int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count); +int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set); int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int idx, struct mca_bank_entry *entry); void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); +void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set); +int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry); +void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set); +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data); + #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index a6b57a5d0c77..f0cddbc78496 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -100,17 +100,6 @@ MODULE_FIRMWARE("amdgpu/smu_13_0_6.bin"); #define MCA_BANK_IPID(_ip, _hwid, _type) \ [AMDGPU_MCA_IP_##_ip] = { .hwid = _hwid, .mcatype = _type, } -enum mca_reg_idx { - MCA_REG_IDX_CONTROL = 0, - MCA_REG_IDX_STATUS = 1, - MCA_REG_IDX_ADDR = 2, - MCA_REG_IDX_MISC0 = 3, - MCA_REG_IDX_CONFIG = 4, - MCA_REG_IDX_IPID = 5, - MCA_REG_IDX_SYND = 6, - MCA_REG_IDX_COUNT = 16, -}; - struct mca_bank_ipid { enum amdgpu_mca_ip ip; uint16_t hwid; @@ -123,7 +112,9 @@ struct mca_ras_info { int *err_code_array; int err_code_count; int (*get_err_count)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, - enum amdgpu_mca_error_type type, int idx, uint32_t *count); + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count); + bool (*bank_is_valid)(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry); }; #define P2S_TABLE_ID_A 0x50325341 @@ -2449,48 +2440,34 @@ static int mca_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_t return 0; } -static int mca_decode_mca_ipid(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, int idx, int *ip) +static int mca_decode_ipid_to_hwip(uint64_t val) { const struct mca_bank_ipid *ipid; - uint64_t val; uint16_t hwid, mcatype; - int i, ret; - - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_IPID, &val); - if (ret) - return ret; + int i; hwid = REG_GET_FIELD(val, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(val, MCMP1_IPIDT0, McaType); - if (hwid) { - for (i = 0; i < ARRAY_SIZE(smu_v13_0_6_mca_ipid_table); i++) { - ipid = &smu_v13_0_6_mca_ipid_table[i]; + for (i = 0; i < ARRAY_SIZE(smu_v13_0_6_mca_ipid_table); i++) { + ipid = &smu_v13_0_6_mca_ipid_table[i]; - if (!ipid->hwid) - continue; + if (!ipid->hwid) + continue; - if (ipid->hwid == hwid && ipid->mcatype == mcatype) { - *ip = i; - return 0; - } - } + if (ipid->hwid == hwid && ipid->mcatype == mcatype) + return i; } - *ip = AMDGPU_MCA_IP_UNKNOW; - - return 0; + return AMDGPU_MCA_IP_UNKNOW; } static int mca_umc_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, - enum amdgpu_mca_error_type type, int idx, uint32_t *count) + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) { uint64_t status0; - int ret; - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0); - if (ret) - return ret; + status0 = entry->regs[MCA_REG_IDX_STATUS]; if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { *count = 0; @@ -2521,70 +2498,41 @@ static bool mca_smu_check_error_code(struct amdgpu_device *adev, const struct mc return false; } -static int mca_mp5_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, - enum amdgpu_mca_error_type type, int idx, uint32_t *count) +static int mca_gfx_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) { - uint64_t status0 = 0, misc0 = 0; - uint32_t errcode; - int ret; - - if (mca_ras->ip != AMDGPU_MCA_IP_MP5) - return -EINVAL; - - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0); - if (ret) - return ret; + uint64_t status0, misc0; + status0 = entry->regs[MCA_REG_IDX_STATUS]; if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { *count = 0; return 0; } - errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode); - if (!mca_smu_check_error_code(adev, mca_ras, errcode)) - return 0; - if (type == AMDGPU_MCA_ERROR_TYPE_UE && REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 && REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) { - if (count) - *count = 1; + *count = 1; return 0; - } - - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0); - if (ret) - return ret; - - if (count) + } else { + misc0 = entry->regs[MCA_REG_IDX_MISC0]; *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt); + } return 0; } static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, - enum amdgpu_mca_error_type type, int idx, uint32_t *count) + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) { - uint64_t status0 = 0, misc0 = 0; - uint32_t errcode; - int ret; - - if (mca_ras->ip != AMDGPU_MCA_IP_SMU) - return -EINVAL; - - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_STATUS, &status0); - if (ret) - return ret; + uint64_t status0, misc0; + status0 = entry->regs[MCA_REG_IDX_STATUS]; if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) { *count = 0; return 0; } - errcode = REG_GET_FIELD(status0, MCMP1_STATUST0, ErrorCode); - if (!mca_smu_check_error_code(adev, mca_ras, errcode)) - return 0; - if (type == AMDGPU_MCA_ERROR_TYPE_UE && REG_GET_FIELD(status0, MCMP1_STATUST0, UC) == 1 && REG_GET_FIELD(status0, MCMP1_STATUST0, PCC) == 1) { @@ -2593,16 +2541,43 @@ static int mca_smu_mca_get_err_count(const struct mca_ras_info *mca_ras, struct return 0; } - ret = mca_bank_read_reg(adev, type, idx, MCA_REG_IDX_MISC0, &misc0); - if (ret) - return ret; - - if (count) - *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt); + misc0 = entry->regs[MCA_REG_IDX_MISC0]; + *count = REG_GET_FIELD(misc0, MCMP1_MISC0T0, ErrCnt); return 0; } +static bool mca_gfx_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry) +{ + uint32_t instlo; + + instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + switch (instlo) { + case 0x36430400: /* SMNAID XCD 0 */ + case 0x38430400: /* SMNAID XCD 1 */ + case 0x40430400: /* SMNXCD XCD 0, NOTE: FIXME: fix this error later */ + return true; + default: + return false; + } + + return false; +}; + +static bool mca_smu_bank_is_valid(const struct mca_ras_info *mca_ras, struct amdgpu_device *adev, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry) +{ + uint32_t errcode, instlo; + + instlo = REG_GET_FIELD(entry->regs[MCA_REG_IDX_IPID], MCMP1_IPIDT0, InstanceIdLo); + if (instlo != 0x03b30400) + return false; + + errcode = REG_GET_FIELD(entry->regs[MCA_REG_IDX_STATUS], MCMP1_STATUST0, ErrorCode); + return mca_smu_check_error_code(adev, mca_ras, errcode); +} + static int sdma_err_codes[] = { CODE_SDMA0, CODE_SDMA1, CODE_SDMA2, CODE_SDMA3 }; static int mmhub_err_codes[] = { CODE_DAGB0, CODE_DAGB0 + 1, CODE_DAGB0 + 2, CODE_DAGB0 + 3, CODE_DAGB0 + 4, /* DAGB0-4 */ @@ -2617,20 +2592,23 @@ static const struct mca_ras_info mca_ras_table[] = { .get_err_count = mca_umc_mca_get_err_count, }, { .blkid = AMDGPU_RAS_BLOCK__GFX, - .ip = AMDGPU_MCA_IP_MP5, - .get_err_count = mca_mp5_mca_get_err_count, + .ip = AMDGPU_MCA_IP_SMU, + .get_err_count = mca_gfx_mca_get_err_count, + .bank_is_valid = mca_gfx_smu_bank_is_valid, }, { .blkid = AMDGPU_RAS_BLOCK__SDMA, .ip = AMDGPU_MCA_IP_SMU, .err_code_array = sdma_err_codes, .err_code_count = ARRAY_SIZE(sdma_err_codes), .get_err_count = mca_smu_mca_get_err_count, + .bank_is_valid = mca_smu_bank_is_valid, }, { .blkid = AMDGPU_RAS_BLOCK__MMHUB, .ip = AMDGPU_MCA_IP_SMU, .err_code_array = mmhub_err_codes, .err_code_count = ARRAY_SIZE(mmhub_err_codes), .get_err_count = mca_smu_mca_get_err_count, + .bank_is_valid = mca_smu_bank_is_valid, }, }; @@ -2665,130 +2643,84 @@ static int mca_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_e } static bool mca_bank_is_valid(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, - enum amdgpu_mca_error_type type, int idx) + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry) { - int ret, ip = AMDGPU_MCA_IP_UNKNOW; - - ret = mca_decode_mca_ipid(adev, type, idx, &ip); - if (ret) + if (mca_decode_ipid_to_hwip(entry->regs[MCA_REG_IDX_IPID]) != mca_ras->ip) return false; - if (ip == AMDGPU_MCA_IP_UNKNOW) - return false; + if (mca_ras->bank_is_valid) + return mca_ras->bank_is_valid(mca_ras, adev, type, entry); - return ip == mca_ras->ip; -} - -static int mca_get_valid_mca_idx(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, - enum amdgpu_mca_error_type type, - uint32_t mca_cnt, int *idx_array, int idx_array_size) -{ - int i, idx_cnt = 0; - - for (i = 0; i < mca_cnt; i++) { - if (!mca_bank_is_valid(adev, mca_ras, type, i)) - continue; - - if (idx_array) { - if (idx_cnt < idx_array_size) - idx_array[idx_cnt] = i; - else - return -EINVAL; - } - - idx_cnt++; - } - - return idx_cnt; + return true; } -static int __mca_smu_get_error_count(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, enum amdgpu_mca_error_type type, uint32_t *count) +static int __mca_smu_get_ras_mca_set(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, + enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set) { - uint32_t result, mca_cnt, total = 0; - int idx_array[16]; - int i, ret, idx_cnt = 0; + struct mca_bank_entry entry; + uint32_t mca_cnt; + int i, ret; ret = mca_get_valid_mca_count(adev, type, &mca_cnt); if (ret) return ret; /* if valid mca bank count is 0, the driver can return 0 directly */ - if (!mca_cnt) { - *count = 0; + if (!mca_cnt) return 0; - } - if (!mca_ras->get_err_count) - return -EINVAL; + for (i = 0; i < mca_cnt; i++) { + memset(&entry, 0, sizeof(entry)); + ret = mca_get_mca_entry(adev, type, i, &entry); + if (ret) + return ret; - idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, ARRAY_SIZE(idx_array)); - if (idx_cnt < 0) - return -EINVAL; + if (mca_ras && !mca_bank_is_valid(adev, mca_ras, type, &entry)) + continue; - for (i = 0; i < idx_cnt; i++) { - result = 0; - ret = mca_ras->get_err_count(mca_ras, adev, type, idx_array[i], &result); + ret = amdgpu_mca_bank_set_add_entry(mca_set, &entry); if (ret) return ret; - - total += result; } - *count = total; - return 0; } -static int mca_smu_get_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, uint32_t *count) +static int mca_smu_get_ras_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set) { - const struct mca_ras_info *mca_ras; + const struct mca_ras_info *mca_ras = NULL; - if (!count) + if (!mca_set) return -EINVAL; - mca_ras = mca_get_mca_ras_info(adev, blk); - if (!mca_ras) - return -EOPNOTSUPP; - - return __mca_smu_get_error_count(adev, mca_ras, type, count); -} - -static int __mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, const struct mca_ras_info *mca_ras, - enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size) -{ - uint32_t mca_cnt = 0; - int ret, idx_cnt = 0; - - ret = mca_get_valid_mca_count(adev, type, &mca_cnt); - if (ret) - return ret; - - /* if valid mca bank count is 0, the driver can return 0 directly */ - if (!mca_cnt) { - *idx_array_size = 0; - return 0; + if (blk != AMDGPU_RAS_BLOCK_COUNT) { + mca_ras = mca_get_mca_ras_info(adev, blk); + if (!mca_ras) + return -EOPNOTSUPP; } - idx_cnt = mca_get_valid_mca_idx(adev, mca_ras, type, mca_cnt, idx_array, *idx_array_size); - if (idx_cnt < 0) - return -EINVAL; - - *idx_array_size = idx_cnt; - - return 0; + return __mca_smu_get_ras_mca_set(adev, mca_ras, type, mca_set); } -static int mca_smu_get_ras_mca_idx_array(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, int *idx_array, int *idx_array_size) +static int mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct mca_bank_entry *entry, uint32_t *count) { const struct mca_ras_info *mca_ras; + if (!entry || !count) + return -EINVAL; + mca_ras = mca_get_mca_ras_info(adev, blk); if (!mca_ras) return -EOPNOTSUPP; - return __mca_smu_get_ras_mca_idx_array(adev, mca_ras, type, idx_array, idx_array_size); + if (!mca_bank_is_valid(adev, mca_ras, type, entry)) { + *count = 0; + return 0; + } + + return mca_ras->get_err_count(mca_ras, adev, type, entry, count); } static int mca_smu_get_mca_entry(struct amdgpu_device *adev, @@ -2807,10 +2739,10 @@ static const struct amdgpu_mca_smu_funcs smu_v13_0_6_mca_smu_funcs = { .max_ue_count = 12, .max_ce_count = 12, .mca_set_debug_mode = mca_smu_set_debug_mode, - .mca_get_error_count = mca_smu_get_error_count, + .mca_get_ras_mca_set = mca_smu_get_ras_mca_set, + .mca_parse_mca_error_count = mca_smu_parse_mca_error_count, .mca_get_mca_entry = mca_smu_get_mca_entry, .mca_get_valid_mca_count = mca_smu_get_valid_mca_count, - .mca_get_ras_mca_idx_array = mca_smu_get_ras_mca_idx_array, }; static int smu_v13_0_6_select_xgmi_plpd_policy(struct smu_context *smu, |