diff options
author | Lijo Lazar <lijo.lazar@amd.com> | 2024-10-24 11:01:57 +0530 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-12-10 10:26:46 -0500 |
commit | e1ee2111ca48169a9fdc5075f7863f5d4d591e2f (patch) | |
tree | 487517237aa6a8c5587ef88e717accf3a72878b2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 0eecff79e49f8ce5475e1b4d968f26263587be66 (diff) | |
download | linux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.tar.gz linux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.tar.bz2 linux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.zip |
drm/amdgpu: Prefer RAS recovery for scheduler hang
Before scheduling a recovery due to scheduler/job hang, check if a RAS
error is detected. If so, choose RAS recovery to handle the situation. A
scheduler/job hang could be the side effect of a RAS error. In such
cases, it is required to go through the RAS error recovery process. A
RAS error recovery process in certains cases also could avoid a full
device device reset.
An error state is maintained in RAS context to detect the block
affected. Fatal Error state uses unused block id. Set the block id when
error is detected. If the interrupt handler detected a poison error,
it's not required to look for a fatal error. Skip fatal error checking
in such cases.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 |
1 files changed, 53 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f22242ab2407..4df9a8dfe9eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2156,6 +2156,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) /* Fatal error events are handled on host side */ if (amdgpu_sriov_vf(adev)) return; + /** + * If the current interrupt is caused by a non-fatal RAS error, skip + * check for fatal error. For fatal errors, FED status of all devices + * in XGMI hive gets set when the first device gets fatal error + * interrupt. The error gets propagated to other devices as well, so + * make sure to ack the interrupt regardless of FED status. + */ + if (!amdgpu_ras_get_fed_status(adev) && + amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY)) + return; if (adev->nbio.ras && adev->nbio.ras->handle_ras_controller_intr_no_bifring) @@ -2185,6 +2195,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * if (ret) return; + amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block); /* both query_poison_status and handle_poison_consumption are optional, * but at least one of them should be implemented if we need poison * consumption handler @@ -4172,7 +4183,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev) if (!ras) return false; - return atomic_read(&ras->fed); + return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); } void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) @@ -4180,8 +4191,48 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) struct amdgpu_ras *ras; ras = amdgpu_ras_get_context(adev); + if (ras) { + if (status) + set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); + else + clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state); + } +} + +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); if (ras) - atomic_set(&ras->fed, !!status); + ras->ras_err_state = 0; +} + +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (ras) + set_bit(block, &ras->ras_err_state); +} + +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (ras) { + if (block == AMDGPU_RAS_BLOCK__ANY) + return (ras->ras_err_state != 0); + else + return test_bit(block, &ras->ras_err_state) || + test_bit(AMDGPU_RAS_BLOCK__LAST, + &ras->ras_err_state); + } + + return false; } static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) |