diff options
author | Tao Zhou <tao.zhou1@amd.com> | 2024-10-18 17:52:59 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-12-10 10:26:46 -0500 |
commit | 772df3df809a536c3e1b14db5a1dd74ae7baa102 (patch) | |
tree | 22e9a4874dd2f8f333d9dcc82d892a13b556a7fa /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 95024c714b83d267036564be998328762c47fbda (diff) | |
download | linux-772df3df809a536c3e1b14db5a1dd74ae7baa102.tar.gz linux-772df3df809a536c3e1b14db5a1dd74ae7baa102.tar.bz2 linux-772df3df809a536c3e1b14db5a1dd74ae7baa102.zip |
drm/amdgpu: add flag to indicate the type of RAS eeprom record
One UMC MCA address could map to multiply physical address (PA):
AMDGPU_RAS_EEPROM_REC_PA: one record store one PA
AMDGPU_RAS_EEPROM_REC_MCA: one record store one MCA address, PA
is not cared about
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 |
1 files changed, 26 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4c9fa24dd972..e7eccc30c692 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2821,10 +2821,20 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) return -ENOMEM; ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); - if (ret) + if (ret) { dev_err(adev->dev, "Failed to load EEPROM table records!"); - else + } else { + if (control->ras_num_recs > 1 && + adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { + if ((bps[0].address == bps[1].address) && + (bps[0].mem_channel == bps[1].mem_channel)) + control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; + else + control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + } + ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); + } kfree(bps); return ret; @@ -3205,13 +3215,14 @@ static int amdgpu_ras_page_retirement_thread(void *param) int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_eeprom_control *control; int ret; if (!con || amdgpu_sriov_vf(adev)) return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); - + control = &con->eeprom_control; + ret = amdgpu_ras_eeprom_init(control); if (ret) return ret; @@ -3219,17 +3230,25 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) if (amdgpu_ras_is_rma(adev)) return -EHWPOISON; - if (con->eeprom_control.ras_num_recs) { + if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) + control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; + + /* default status is MCA storage */ + if (control->ras_num_recs <= 1 && + adev->umc.ras && adev->umc.ras->convert_ras_err_addr) + control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + + if (control->ras_num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) return ret; amdgpu_dpm_send_hbm_bad_pages_num( - adev, con->eeprom_control.ras_num_recs); + adev, control->ras_num_recs); if (con->update_channel_flag == true) { amdgpu_dpm_send_hbm_bad_channel_flag( - adev, con->eeprom_control.bad_channel_bitmap); + adev, control->bad_channel_bitmap); con->update_channel_flag = false; } } |