summaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2021-09-23 14:11:22 +0800
committerAlex Deucher <alexander.deucher@amd.com>2021-10-04 15:22:57 -0400
commitc7490949239646c61db869014fcc74ed2cb91d53 (patch)
treec6a82a5cfd48c00b509af6f3fad17f25594aa9c6 /drivers/gpu
parente5d59cfa330523e47cba62a496864acc3948fc27 (diff)
downloadlinux-c7490949239646c61db869014fcc74ed2cb91d53.tar.gz
linux-c7490949239646c61db869014fcc74ed2cb91d53.tar.bz2
linux-c7490949239646c61db869014fcc74ed2cb91d53.zip
amd/amdkfd: add ras page retirement handling for sq/sdma (v3)
In ras poison mode, page retirement will be handled by the irq handler of the module which consumes corrupted data. v2: rename ras_process_cb to ras_poison_consumption_handler. move the handler's implementation from ASIC specific file to common file. v3: call gpu reset for xGMI connected mode. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c4
3 files changed, 17 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1d41c2c00623..7505f1b9d3f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -31,6 +31,8 @@
#include <linux/dma-buf.h>
#include "amdgpu_xgmi.h"
#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
/* Total memory size in system memory and all GPU VRAM. Used to
* estimate worst case amount of memory to reserve for page tables
@@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
return adev->have_atomics_support;
}
+
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+
+ /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL);
+ else
+ amdgpu_amdkfd_gpu_reset(kgd);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3bc52b2c604f..7db37e2016df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
uint64_t *mmap_offset);
int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
struct tile_config *config);
+void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd);
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 12d91e53556c..543e7ea75593 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_gpu_reset(dev->kgd);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
return;
}
break;
@@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_gpu_reset(dev->kgd);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||