summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
diff options
context:
space:
mode:
authorJesse.zhang@amd.com <Jesse.zhang@amd.com>2025-02-21 10:26:52 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-02-25 11:43:59 -0500
commitc94943b0863ef3b8e88769f0805f715c8247b2bf (patch)
tree3cf6964fbe81bc682a82757b961251853670b2fe /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parentd190e4d0f7b52bdb2b5bd8c3dcfbcd7877a0dc53 (diff)
downloadlinux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.gz
linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.tar.bz2
linux-c94943b0863ef3b8e88769f0805f715c8247b2bf.zip
drm/amdgpu: Update amdgpu_job_timedout to check if the ring is guilty
This patch updates the `amdgpu_job_timedout` function to check if the ring is actually guilty of causing the timeout. If not, it skips error handling and fence completion. v2: move the is_guilty check down into the queue reset area (Alex) v3: need to call is_guilty before reset (Alex) v4: squash in is_guilty logic fixes (Alex) Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c24
1 files changed, 20 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..abfbbc6babe7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -130,29 +130,45 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
amdgpu_vm_put_task_info(ti);
}
- dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
-
/* attempt a per ring reset */
if (amdgpu_gpu_recovery &&
ring->funcs->reset) {
+ bool is_guilty;
+
dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
/* stop the scheduler, but don't mess with the
* bad job yet because if ring reset fails
* we'll fall back to full GPU reset.
*/
drm_sched_wqueue_stop(&ring->sched);
+
+ /* for engine resets, we need to reset the engine,
+ * but individual queues may be unaffected.
+ * check here to make sure the accounting is correct.
+ */
+ if (ring->funcs->is_guilty)
+ is_guilty = ring->funcs->is_guilty(ring);
+ else
+ is_guilty = true;
+
+ if (is_guilty)
+ dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+
r = amdgpu_ring_reset(ring, job->vmid);
if (!r) {
if (amdgpu_ring_sched_ready(ring))
drm_sched_stop(&ring->sched, s_job);
- atomic_inc(&ring->adev->gpu_reset_counter);
- amdgpu_fence_driver_force_completion(ring);
+ if (is_guilty) {
+ atomic_inc(&ring->adev->gpu_reset_counter);
+ amdgpu_fence_driver_force_completion(ring);
+ }
if (amdgpu_ring_sched_ready(ring))
drm_sched_start(&ring->sched, 0);
goto exit;
}
dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
}
+ dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
struct amdgpu_reset_context reset_context;