summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorHawking Zhang <Hawking.Zhang@amd.com>2023-03-20 17:51:30 +0800
committerAlex Deucher <alexander.deucher@amd.com>2023-06-09 09:58:32 -0400
commit9b337b7d628a5e97b4dd72bb1d75f1716567b416 (patch)
treea9d6954bb508148642b127e18937d66673c4da3e /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent35d54e21e002198c13647b6cd8c77586f683cf39 (diff)
downloadlinux-9b337b7d628a5e97b4dd72bb1d75f1716567b416.tar.gz
linux-9b337b7d628a5e97b4dd72bb1d75f1716567b416.tar.bz2
linux-9b337b7d628a5e97b4dd72bb1d75f1716567b416.zip
drm/amdgpu: Adjust the sequence to query ras error info
It turns out STATUS_VALID_FLAG needs to be checked ahead of any other fields. ADDRESS_VALID_FLAG and ERR_INFO_VALID_FLAG only manages ADDRESS and ERR_INFO field respectively. driver should continue poll ERR CNT field even ERR_INFO_VALD_FLAG is not set. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c13
1 files changed, 7 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5ae89602a116..64f80e8cbd63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3164,7 +3164,8 @@ bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
!REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
- return false;
+ /* keep the check here in case we need to refer to the result later */
+ dev_dbg(adev->dev, "Invalid err_info field\n");
/* read err count */
*err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
@@ -3187,17 +3188,17 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
uint32_t i, j;
for (i = 0; i < reg_list_size; i++) {
+ /* query memory_id from err_status_lo */
+ if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
+ instance, &memory_id))
+ continue;
+
/* query err_cnt from err_status_hi */
if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
instance, &err_cnt) ||
!err_cnt)
continue;
- /* query memory_id from err_status_lo */
- if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
- instance, &memory_id))
- continue;
-
*err_count += err_cnt;
/* log the errors */