summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJesse Zhang <zhexi.zhang@amd.com>2019-07-30 19:15:42 +0800
committerAlex Deucher <alexander.deucher@amd.com>2019-10-03 09:11:01 -0500
commitdf99ac0fcc507cb680c5b75b679cda1195a99a42 (patch)
tree98950f910c1f3b99a59f7620fb4017d904d5e4e9
parent90a08351f72d609dfa22871226d75d3758b2df50 (diff)
downloadlinux-stable-df99ac0fcc507cb680c5b75b679cda1195a99a42.tar.gz
linux-stable-df99ac0fcc507cb680c5b75b679cda1195a99a42.tar.bz2
linux-stable-df99ac0fcc507cb680c5b75b679cda1195a99a42.zip
drm/amd/amdgpu:Fix compute ring unable to detect hang.
When compute fence did not signal, compute ring cannot detect hardware hang because its timeout value is set to be infinite by default. In SR-IOV and passthrough mode, if user does not declare custome timeout value for compute ring, then use gfx ring timeout value as default. So that when there is a ture hardware hang, compute ring can detect it. Signed-off-by: Jesse Zhang <zhexi.zhang@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c13
3 files changed, 13 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 9f916a331459..1364a2be68e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1025,12 +1025,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
amdgpu_device_check_block_size(adev);
- ret = amdgpu_device_get_job_timeout_settings(adev);
- if (ret) {
- dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
- return ret;
- }
-
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
return ret;
@@ -2745,6 +2739,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
+ r = amdgpu_device_get_job_timeout_settings(adev);
+ if (r) {
+ dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
+ return r;
+ }
+
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index e2f166a32c92..864f7858d630 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1341,10 +1341,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
/*
* By default timeout for non compute jobs is 10000.
* And there is no timeout enforced on compute jobs.
+ * In SR-IOV or passthrough mode, timeout for compute
+ * jobs are 10000 by default.
*/
adev->gfx_timeout = msecs_to_jiffies(10000);
adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
- adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
+ if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
+ adev->compute_timeout = adev->gfx_timeout;
+ else
+ adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) {
while ((timeout_setting = strsep(&input, ",")) &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 23085b352cf2..377fe20bce23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -462,18 +462,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
timeout = adev->gfx_timeout;
break;
case AMDGPU_RING_TYPE_COMPUTE:
- /*
- * For non-sriov case, no timeout enforce
- * on compute ring by default. Unless user
- * specifies a timeout for compute ring.
- *
- * For sriov case, always use the timeout
- * as gfx ring
- */
- if (!amdgpu_sriov_vf(ring->adev))
- timeout = adev->compute_timeout;
- else
- timeout = adev->gfx_timeout;
+ timeout = adev->compute_timeout;
break;
case AMDGPU_RING_TYPE_SDMA:
timeout = adev->sdma_timeout;