summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c16
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c40
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h4
5 files changed, 79 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 905c5ab486a1..d01d7969d47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
amdgpu_fbdev_set_suspend(tmp_adev, 0);
- /* must succeed. */
- amdgpu_ras_resume(tmp_adev);
+ /*
+ * The GPU enters bad state once faulty pages
+ * by ECC has reached the threshold, and ras
+ * recovery is scheduled next. So add one check
+ * here to break recovery if it indeed exceeds
+ * bad page threshold, and remind user to
+ * retire this GPU or setting one bigger
+ * bad_page_threshold value to fix this once
+ * probing driver again.
+ */
+ if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+ /* must succeed. */
+ amdgpu_ras_resume(tmp_adev);
+ } else {
+ r = -EINVAL;
+ goto out;
+ }
/* Update PSP FW topology after reset */
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
}
-
out:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d081de232c66..ab65dfd71927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2205,3 +2205,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
return false;
}
+
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ bool exc_err_limit = false;
+
+ if (con && (amdgpu_bad_page_threshold != 0))
+ amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
+ &exc_err_limit);
+
+ /*
+ * We are only interested in variable exc_err_limit,
+ * as it says if GPU is in bad state or not.
+ */
+ return exc_err_limit;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cf9f60202334..70a6fca73617 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
bool is_ce);
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 461dfd22bc1c..7848a426c95b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
return curr_address;
}
+int amdgpu_ras_eeprom_check_err_threshold(
+ struct amdgpu_ras_eeprom_control *control,
+ bool *exceed_err_limit)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ unsigned char buff[EEPROM_ADDRESS_SIZE +
+ EEPROM_TABLE_HEADER_SIZE] = { 0 };
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct i2c_msg msg = {
+ .addr = control->i2c_address,
+ .flags = I2C_M_RD,
+ .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
+ .buf = buff,
+ };
+ int ret;
+
+ *exceed_err_limit = false;
+
+ /* read EEPROM table header */
+ mutex_lock(&control->tbl_mutex);
+ ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
+ if (ret < 1) {
+ dev_err(adev->dev, "Failed to read EEPROM table header.\n");
+ goto err;
+ }
+
+ __decode_table_header_from_buff(hdr, &buff[2]);
+
+ if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+ dev_warn(adev->dev, "This GPU is in BAD status.");
+ dev_warn(adev->dev, "Please retire it or setting one bigger "
+ "threshold value when reloading driver.\n");
+ *exceed_err_limit = true;
+ }
+
+err:
+ mutex_unlock(&control->tbl_mutex);
+ return 0;
+}
+
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records,
bool write,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 9839b4ea5e18..c7a5e5c7c61e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -80,6 +80,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
+int amdgpu_ras_eeprom_check_err_threshold(
+ struct amdgpu_ras_eeprom_control *control,
+ bool *exceed_err_limit);
+
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
struct eeprom_table_record *records,
bool write,