drm/amdgpu: Update amdgpu_job_timedout to check if the ring is guilty

This patch updates the `amdgpu_job_timedout` function to check whether the ring is actually guilty of causing the timeout before attempting a per-ring reset. If the ring is not guilty, the job's fence is not marked with -ETIME, and after a successful ring reset the GPU reset counter increment and the forced fence completion are skipped.

v2: move the is_guilty check down into the queue reset area (Alex)
v3: need to call is_guilty before reset (Alex)
v4: squash in is_guilty logic fixes (Alex)

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Jesse.zhang@amd.com <Jesse.zhang@amd.com> 2025-02-21 10:26:52 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2025-02-25 11:43:59 -0500
commit: c94943b0863ef3b8e88769f0805f715c8247b2bf (patch)
tree: 3cf6964fbe81bc682a82757b961251853670b2fe /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent: d190e4d0f7b52bdb2b5bd8c3dcfbcd7877a0dc53 (diff)
download: linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.gz
linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.bz2
linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.zip
1 files changed, 20 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..abfbbc6babe7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -130,29 +130,45 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		amdgpu_vm_put_task_info(ti);
 	}
 
-	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
-
 	/* attempt a per ring reset */
 	if (amdgpu_gpu_recovery &&
 	    ring->funcs->reset) {
+		bool is_guilty;
+
 		dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
 		/* stop the scheduler, but don't mess with the
 		 * bad job yet because if ring reset fails
 		 * we'll fall back to full GPU reset.
 		 */
 		drm_sched_wqueue_stop(&ring->sched);
+
+		/* for engine resets, we need to reset the engine,
+		 * but individual queues may be unaffected.
+		 * check here to make sure the accounting is correct.
+		 */
+		if (ring->funcs->is_guilty)
+			is_guilty = ring->funcs->is_guilty(ring);
+		else
+			is_guilty = true;
+
+		if (is_guilty)
+			dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+
 		r = amdgpu_ring_reset(ring, job->vmid);
 		if (!r) {
 			if (amdgpu_ring_sched_ready(ring))
 				drm_sched_stop(&ring->sched, s_job);
-			atomic_inc(&ring->adev->gpu_reset_counter);
-			amdgpu_fence_driver_force_completion(ring);
+			if (is_guilty) {
+				atomic_inc(&ring->adev->gpu_reset_counter);
+				amdgpu_fence_driver_force_completion(ring);
+			}
 			if (amdgpu_ring_sched_ready(ring))
 				drm_sched_start(&ring->sched, 0);
 			goto exit;
 		}
 		dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
 	}
+	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
 
 	if (amdgpu_device_should_recover_gpu(ring->adev)) {
 		struct amdgpu_reset_context reset_context;
author	Jesse.zhang@amd.com <Jesse.zhang@amd.com>	2025-02-21 10:26:52 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2025-02-25 11:43:59 -0500
commit	c94943b0863ef3b8e88769f0805f715c8247b2bf (patch)
tree	3cf6964fbe81bc682a82757b961251853670b2fe /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent	d190e4d0f7b52bdb2b5bd8c3dcfbcd7877a0dc53 (diff)
download	linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.gz linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.bz2 linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.zip