summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
diff options
context:
space:
mode:
authorLijo Lazar <lijo.lazar@amd.com>2024-10-24 11:01:57 +0530
committerAlex Deucher <alexander.deucher@amd.com>2024-12-10 10:26:46 -0500
commite1ee2111ca48169a9fdc5075f7863f5d4d591e2f (patch)
tree487517237aa6a8c5587ef88e717accf3a72878b2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
parent0eecff79e49f8ce5475e1b4d968f26263587be66 (diff)
downloadlinux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.tar.gz
linux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.tar.bz2
linux-e1ee2111ca48169a9fdc5075f7863f5d4d591e2f.zip
drm/amdgpu: Prefer RAS recovery for scheduler hang
Before scheduling a recovery due to scheduler/job hang, check if a RAS error is detected. If so, choose RAS recovery to handle the situation. A scheduler/job hang could be the side effect of a RAS error. In such cases, it is required to go through the RAS error recovery process. A RAS error recovery process in certains cases also could avoid a full device device reset. An error state is maintained in RAS context to detect the block affected. Fatal Error state uses unused block id. Set the block id when error is detected. If the interrupt handler detected a poison error, it's not required to look for a fatal error. Skip fatal error checking in such cases. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h11
1 files changed, 8 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6db772ecfee4..b13debcf48ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -99,7 +99,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__IH,
AMDGPU_RAS_BLOCK__MPIO,
- AMDGPU_RAS_BLOCK__LAST
+ AMDGPU_RAS_BLOCK__LAST,
+ AMDGPU_RAS_BLOCK__ANY = -1
};
enum amdgpu_ras_mca_block {
@@ -558,8 +559,8 @@ struct amdgpu_ras {
struct ras_ecc_log_info umc_ecc_log;
struct delayed_work page_retirement_dwork;
- /* Fatal error detected flag */
- atomic_t fed;
+ /* ras errors detected */
+ unsigned long ras_err_state;
/* RAS event manager */
struct ras_event_manager __event_mgr;
@@ -952,6 +953,10 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
+void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
+void amdgpu_ras_clear_err_state(struct amdgpu_device *adev);
+bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block);
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,