summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd
diff options
context:
space:
mode:
authorLijo Lazar <lijo.lazar@amd.com>2024-03-25 12:33:02 +0530
committerAlex Deucher <alexander.deucher@amd.com>2024-04-09 22:13:36 -0400
commitb41f742d6fa6bc9799ff670a9b081f4ac6b911a0 (patch)
tree7756b6c8b83b154e49c3d61a0f063bde428b79ed /drivers/gpu/drm/amd
parent052965fba1979b89e2530c6e0c7ea24770285803 (diff)
downloadlinux-stable-b41f742d6fa6bc9799ff670a9b081f4ac6b911a0.tar.gz
linux-stable-b41f742d6fa6bc9799ff670a9b081f4ac6b911a0.tar.bz2
linux-stable-b41f742d6fa6bc9799ff670a9b081f4ac6b911a0.zip
drm/amdgpu: Set fatal errror detected flag earlier
In case of fatal errors, set FED status when interrupt is received. Set the flag on other devices in the hive before RAS recovery work. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c41
1 files changed, 28 insertions, 13 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b8c7d0bf8fb1..352ce16a0963 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2399,6 +2399,19 @@ out:
return ret;
}
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+ struct amdgpu_hive_info *hive, bool status)
+{
+ struct amdgpu_device *tmp_adev;
+
+ if (hive) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ amdgpu_ras_set_fed(tmp_adev, status);
+ } else {
+ amdgpu_ras_set_fed(adev, status);
+ }
+}
+
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
@@ -2408,8 +2421,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
- if (hive)
+ if (hive) {
atomic_set(&hive->ras_recovery, 1);
+
+ /* If any device which is part of the hive received RAS fatal
+ * error interrupt, set fatal error status on all. This
+ * condition will need a recovery, and flag will be cleared
+ * as part of recovery.
+ */
+ list_for_each_entry(remote_adev, &hive->device_list,
+ gmc.xgmi.head)
+ if (amdgpu_ras_get_fed_status(remote_adev)) {
+ amdgpu_ras_set_fed_all(adev, hive, true);
+ break;
+ }
+ }
if (!ras->disable_ras_err_cnt_harvest) {
/* Build list of devices to query RAS related errors */
@@ -2454,18 +2480,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /* For any RAS error that needs a full reset to
- * recover, set the fatal error status
- */
- if (hive) {
- list_for_each_entry(remote_adev,
- &hive->device_list,
- gmc.xgmi.head)
- amdgpu_ras_set_fed(remote_adev,
- true);
- } else {
- amdgpu_ras_set_fed(adev, true);
- }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -3550,6 +3564,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+ amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}