summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2024-10-18 14:49:00 +0800
committerAlex Deucher <alexander.deucher@amd.com>2024-12-10 10:26:46 -0500
commit0eecff79e49f8ce5475e1b4d968f26263587be66 (patch)
tree0f0376629742031a98e2a33708c262037b56c2f6 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent772df3df809a536c3e1b14db5a1dd74ae7baa102 (diff)
downloadlinux-0eecff79e49f8ce5475e1b4d968f26263587be66.tar.gz
linux-0eecff79e49f8ce5475e1b4d968f26263587be66.tar.bz2
linux-0eecff79e49f8ce5475e1b4d968f26263587be66.zip
drm/amdgpu: do RAS MCA2PA conversion in device init phase
NPS mode is introduced, the value of memory physical address (PA) related to a MCA address varies per nps mode. We need to rely on MCA address and convert it into PA accroding to nps mode. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c94
1 files changed, 82 insertions, 12 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e7eccc30c692..f22242ab2407 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2717,40 +2717,110 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
return 0;
}
+static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
+ struct eeprom_table_record *bps,
+ struct ras_err_data *err_data)
+{
+ struct ta_ras_query_address_input addr_in;
+ uint32_t socket = 0;
+ int ret = 0;
+
+ if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
+ socket = adev->smuio.funcs->get_socket_id(adev);
+
+ /* reinit err_data */
+ err_data->err_addr_cnt = 0;
+ err_data->err_addr_len = adev->umc.retire_unit;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ addr_in.ma.err_addr = bps->address;
+ addr_in.ma.socket_id = socket;
+ addr_in.ma.ch_inst = bps->mem_channel;
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+
+ if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+ ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
+ &addr_in, NULL, false);
+
+ return ret;
+}
+
/* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
+ struct ras_err_data err_data;
+ struct eeprom_table_record *err_rec;
int ret = 0;
- uint32_t i;
+ uint32_t i, j, loop_cnt = 1;
+ bool is_mca_add = true;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
+ if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
+ is_mca_add = false;
+ } else {
+ if ((pages > 1) &&
+ (bps[0].address == bps[1].address) &&
+ (bps[0].mem_channel == bps[1].mem_channel))
+ is_mca_add = false;
+ }
+
mutex_lock(&con->recovery_lock);
data = con->eh_data;
if (!data)
goto out;
- for (i = 0; i < pages; i++) {
- if (amdgpu_ras_check_bad_page_unlock(con,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
- continue;
-
- if (!data->space_left &&
- amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ if (is_mca_add) {
+ err_data.err_addr =
+ kcalloc(adev->umc.retire_unit,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if (!err_data.err_addr) {
+ dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
ret = -ENOMEM;
goto out;
}
- amdgpu_ras_reserve_page(adev, bps[i].retired_page);
+ loop_cnt = adev->umc.retire_unit;
+ }
+
+ for (i = 0; i < pages; i++) {
+ if (is_mca_add) {
+ if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
+ goto free;
+
+ err_rec = err_data.err_addr;
+ } else {
+ err_rec = &bps[i];
+ }
- memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
- data->count++;
- data->space_left--;
+ for (j = 0; j < loop_cnt; j++) {
+ if (amdgpu_ras_check_bad_page_unlock(con,
+ err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+ continue;
+
+ if (!data->space_left &&
+ amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
+
+ memcpy(&data->bps[data->count], &(err_rec[j]),
+ sizeof(struct eeprom_table_record));
+ data->count++;
+ data->space_left--;
+ }
}
+
+free:
+ if (is_mca_add)
+ kfree(err_data.err_addr);
out:
mutex_unlock(&con->recovery_lock);