summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c1136
1 files changed, 890 insertions, 246 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8ebab6f22e5a..1a1395c5fff1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -120,7 +120,11 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
+
+#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
+
+#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
@@ -878,7 +882,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
if (ret)
return ret;
- /* gfx block ras dsiable cmd must send to ras-ta */
+ /* gfx block ras disable cmd must send to ras-ta */
if (head->block == AMDGPU_RAS_BLOCK__GFX)
con->features |= BIT(head->block);
@@ -1045,6 +1049,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
+ struct ras_query_context *qctx,
const char *blk_name,
bool is_ue,
bool is_de)
@@ -1052,27 +1057,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
struct ras_err_info *err_info;
+ u64 event_id = qctx->evid.event_id;
if (is_ue) {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ue_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new uncorrectable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new uncorrectable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ue_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld uncorrectable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld uncorrectable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
}
} else {
@@ -1081,44 +1087,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->de_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new deferred hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new deferred hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->de_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld deferred hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id,
- err_info->de_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld deferred hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->de_count, blk_name);
}
} else {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ce_count) {
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld new correctable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld new correctable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ce_count,
+ blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
- dev_info(adev->dev, "socket: %d, die: %d, "
- "%lld correctable hardware errors detected in total in %s block\n",
- mcm_info->socket_id, mcm_info->die_id,
- err_info->ce_count, blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id,
+ err_info->ce_count, blk_name);
}
}
}
@@ -1131,77 +1137,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
struct ras_query_if *query_if,
- struct ras_err_data *err_data)
+ struct ras_err_data *err_data,
+ struct ras_query_context *qctx)
{
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
const char *blk_name = get_ras_block_str(&query_if->head);
+ u64 event_id = qctx->evid.event_id;
if (err_data->ce_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld correctable hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld correctable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ce_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld correctable hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.ce_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.ce_count,
+ blk_name);
}
}
if (err_data->ue_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, true, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.ue_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld uncorrectable hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.ue_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.ue_count,
+ blk_name);
}
}
if (err_data->de_count) {
if (err_data_has_source_info(err_data)) {
- amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, true);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%ld deferred hardware errors "
- "detected in %s block\n",
- adev->smuio.funcs->get_socket_id(adev),
- adev->smuio.funcs->get_die_id(adev),
- ras_mgr->err_data.de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
+ "%ld deferred hardware errors "
+ "detected in %s block\n",
+ adev->smuio.funcs->get_socket_id(adev),
+ adev->smuio.funcs->get_die_id(adev),
+ ras_mgr->err_data.de_count,
+ blk_name);
} else {
- dev_info(adev->dev, "%ld deferred hardware errors "
- "detected in %s block\n",
- ras_mgr->err_data.de_count,
- blk_name);
+ RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
+ "detected in %s block\n",
+ ras_mgr->err_data.de_count,
+ blk_name);
}
}
}
@@ -1215,11 +1223,11 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
amdgpu_ras_error_statistic_de_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->de_count);
+ &err_info->mcm_info, err_info->de_count);
amdgpu_ras_error_statistic_ce_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->ce_count);
+ &err_info->mcm_info, err_info->ce_count);
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->ue_count);
+ &err_info->mcm_info, err_info->ue_count);
}
} else {
/* for legacy asic path which doesn't has error source info */
@@ -1244,6 +1252,10 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
{
struct ras_manager *obj;
+ /* in resume phase, no need to create aca fs node */
+ if (adev->in_suspend || amdgpu_in_reset(adev))
+ return 0;
+
obj = get_ras_manager(adev, blk);
if (!obj)
return -EINVAL;
@@ -1265,7 +1277,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
}
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
- enum aca_error_type type, struct ras_err_data *err_data)
+ enum aca_error_type type, struct ras_err_data *err_data,
+ struct ras_query_context *qctx)
{
struct ras_manager *obj;
@@ -1273,7 +1286,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu
if (!obj)
return -EINVAL;
- return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
+ return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
}
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
@@ -1284,16 +1297,20 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
.head = obj->head,
};
+ if (!amdgpu_ras_get_error_query_ready(obj->adev))
+ return sysfs_emit(buf, "Query currently inaccessible\n");
+
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
- "ce", info.ce_count);
+ return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+ "ce", info.ce_count, "de", info.de_count);
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
+ struct ras_query_context *qctx,
unsigned int error_query_mode)
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
@@ -1329,17 +1346,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
} else {
if (amdgpu_aca_is_enabled(adev)) {
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
if (ret)
return ret;
- ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
if (ret)
return ret;
} else {
/* FIXME: add code to check return value later */
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
}
}
@@ -1347,10 +1368,13 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
/* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
+ struct ras_query_if *info,
+ enum ras_event_type type)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
+ struct ras_query_context qctx;
unsigned int error_query_mode;
int ret;
@@ -1364,9 +1388,20 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
return -EINVAL;
+ memset(&qctx, 0, sizeof(qctx));
+ qctx.evid.type = type;
+ qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type);
+
+ if (!down_read_trylock(&adev->reset_domain->sem)) {
+ ret = -EIO;
+ goto out_fini_err_data;
+ }
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
&err_data,
+ &qctx,
error_query_mode);
+ up_read(&adev->reset_domain->sem);
if (ret)
goto out_fini_err_data;
@@ -1376,7 +1411,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
info->ce_count = obj->err_data.ce_count;
info->de_count = obj->err_data.de_count;
- amdgpu_ras_error_generate_report(adev, info, &err_data);
+ amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
out_fini_err_data:
amdgpu_ras_error_data_fini(&err_data);
@@ -1384,15 +1419,17 @@ out_fini_err_data:
return ret;
}
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
+ return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
+}
+
int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
- struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
- struct amdgpu_hive_info *hive;
- int hive_ras_recovery = 0;
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
@@ -1404,15 +1441,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
!amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
- hive = amdgpu_get_xgmi_hive(adev);
- if (hive) {
- hive_ras_recovery = atomic_read(&hive->ras_recovery);
- amdgpu_put_xgmi_hive(hive);
- }
-
/* skip ras error reset in gpu reset */
- if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
- hive_ras_recovery) &&
+ if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&
((smu_funcs && smu_funcs->set_debug_mode) ||
(mca_funcs && mca_funcs->mca_set_debug_mode)))
return -EOPNOTSUPP;
@@ -1703,6 +1733,39 @@ static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,
return sysfs_emit(buf, "schema: 0x%x\n", con->schema);
}
+static struct {
+ enum ras_event_type type;
+ const char *name;
+} dump_event[] = {
+ {RAS_EVENT_TYPE_FATAL, "Fatal Error"},
+ {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"},
+ {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"},
+};
+
+static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct amdgpu_ras *con =
+ container_of(attr, struct amdgpu_ras, event_state_attr);
+ struct ras_event_manager *event_mgr = con->event_mgr;
+ struct ras_event_state *event_state;
+ int i, size = 0;
+
+ if (!event_mgr)
+ return -EINVAL;
+
+ size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno));
+ for (i = 0; i < ARRAY_SIZE(dump_event); i++) {
+ event_state = &event_mgr->event_state[dump_event[i].type];
+ size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n",
+ dump_event[i].name,
+ atomic64_read(&event_state->count),
+ event_state->last_seqno);
+ }
+
+ return (ssize_t)size;
+}
+
static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1720,6 +1783,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)
&con->features_attr.attr,
&con->version_attr.attr,
&con->schema_attr.attr,
+ &con->event_state_attr.attr,
NULL
};
struct attribute_group group = {
@@ -1738,6 +1802,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+ if (amdgpu_aca_is_enabled(adev))
+ return 0;
+
if (!obj || obj->attr_inuse)
return -EINVAL;
@@ -1772,6 +1839,9 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+ if (amdgpu_aca_is_enabled(adev))
+ return 0;
+
if (!obj || !obj->attr_inuse)
return -EINVAL;
@@ -1884,6 +1954,23 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
obj, &amdgpu_ras_debugfs_ops);
}
+static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
+{
+ bool ret;
+
+ switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ ret = true;
+ break;
+ default:
+ ret = false;
+ break;
+ }
+
+ return ret;
+}
+
void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1910,10 +1997,12 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
}
}
- if (amdgpu_aca_is_enabled(adev))
- amdgpu_aca_smu_debugfs_init(adev, dir);
- else
- amdgpu_mca_smu_debugfs_init(adev, dir);
+ if (amdgpu_ras_aca_is_supported(adev)) {
+ if (amdgpu_aca_is_enabled(adev))
+ amdgpu_aca_smu_debugfs_init(adev, dir);
+ else
+ amdgpu_mca_smu_debugfs_init(adev, dir);
+ }
}
/* debugfs end */
@@ -1927,6 +2016,8 @@ static DEVICE_ATTR(version, 0444,
amdgpu_ras_sysfs_version_show, NULL);
static DEVICE_ATTR(schema, 0444,
amdgpu_ras_sysfs_schema_show, NULL);
+static DEVICE_ATTR(event_state, 0444,
+ amdgpu_ras_sysfs_event_state_show, NULL);
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1937,6 +2028,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
&con->features_attr.attr,
&con->version_attr.attr,
&con->schema_attr.attr,
+ &con->event_state_attr.attr,
NULL
};
struct bin_attribute *bin_attrs[] = {
@@ -1959,6 +2051,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
con->schema_attr = dev_attr_schema;
sysfs_attr_init(attrs[2]);
+ /* add event_state entry */
+ con->event_state_attr = dev_attr_event_state;
+ sysfs_attr_init(attrs[3]);
+
if (amdgpu_bad_page_threshold != 0) {
/* add bad_page_features entry */
bin_attr_gpu_vram_bad_pages.private = NULL;
@@ -2022,8 +2118,16 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_device *adev = obj->adev;
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+ u64 event_id;
+ int ret;
- if (!block_obj)
+ if (!block_obj || !con)
+ return;
+
+ ret = amdgpu_ras_mark_ras_event(adev, type);
+ if (ret)
return;
/* both query_poison_status and handle_poison_consumption are optional,
@@ -2041,26 +2145,49 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
- amdgpu_umc_poison_handler(adev, obj->head.block, false);
+ amdgpu_umc_poison_handler(adev, obj->head.block, 0);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
- /* gpu reset is fallback for failed and default cases */
- if (poison_stat) {
- dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
- block_obj->ras_comm.name);
+ /* gpu reset is fallback for failed and default cases.
+ * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
+ */
+ if (poison_stat && !amdgpu_ras_is_rma(adev)) {
+ event_id = amdgpu_ras_acquire_event_id(adev, type);
+ RAS_EVENT_LOG(adev, event_id,
+ "GPU reset for %s RAS poison consumption is issued!\n",
+ block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
- } else {
- amdgpu_gfx_poison_consumption_handler(adev, entry);
}
+
+ if (!poison_stat)
+ amdgpu_gfx_poison_consumption_handler(adev, entry);
}
static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
struct amdgpu_iv_entry *entry)
{
- dev_info(obj->adev->dev,
- "Poison is created\n");
+ struct amdgpu_device *adev = obj->adev;
+ enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+ u64 event_id;
+ int ret;
+
+ ret = amdgpu_ras_mark_ras_event(adev, type);
+ if (ret)
+ return;
+
+ event_id = amdgpu_ras_acquire_event_id(adev, type);
+ RAS_EVENT_LOG(adev, event_id, "Poison is created\n");
+
+ if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
+
+ atomic_inc(&con->page_retirement_req_cnt);
+ atomic_inc(&con->poison_creation_count);
+
+ wake_up(&con->page_retirement_wq);
+ }
}
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2080,6 +2207,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
/* Let IP handle its data, maybe we need get the output
* from the callback to update the error type/count, etc
*/
+ amdgpu_ras_set_fed(obj->adev, true);
ret = data->cb(obj->adev, &err_data, entry);
/* ue will trigger an interrupt, and in that case
* we need do a reset to recovery the whole system.
@@ -2140,12 +2268,15 @@ static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
struct ras_dispatch_if *info)
{
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
- struct ras_ih_data *data = &obj->ih_data;
+ struct ras_manager *obj;
+ struct ras_ih_data *data;
+ obj = amdgpu_ras_find_obj(adev, &info->head);
if (!obj)
return -EINVAL;
+ data = &obj->ih_data;
+
if (data->inuse == 0)
return 0;
@@ -2242,7 +2373,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
/* ih end */
/* traversal all IPs except NBIO to query error counter */
-static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
+static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
@@ -2275,7 +2406,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
IP_VERSION(13, 0, 2)))
continue;
- amdgpu_ras_query_error_status(adev, &info);
+ amdgpu_ras_query_error_status_with_event(adev, &info, type);
if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=
IP_VERSION(11, 0, 2) &&
@@ -2371,7 +2502,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
- data->bps[i].retired_page);
+ data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
if (status == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
else if (status == -ENOENT)
@@ -2384,6 +2515,44 @@ out:
return ret;
}
+static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
+ struct amdgpu_hive_info *hive, bool status)
+{
+ struct amdgpu_device *tmp_adev;
+
+ if (hive) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ amdgpu_ras_set_fed(tmp_adev, status);
+ } else {
+ amdgpu_ras_set_fed(adev, status);
+ }
+}
+
+bool amdgpu_ras_in_recovery(struct amdgpu_device *adev)
+{
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int hive_ras_recovery = 0;
+
+ if (hive) {
+ hive_ras_recovery = atomic_read(&hive->ras_recovery);
+ amdgpu_put_xgmi_hive(hive);
+ }
+
+ if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+ return true;
+
+ return false;
+}
+
+static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev)
+{
+ if (amdgpu_ras_intr_triggered())
+ return RAS_EVENT_TYPE_FATAL;
+ else
+ return RAS_EVENT_TYPE_POISON_CONSUMPTION;
+}
+
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
@@ -2392,9 +2561,23 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ enum ras_event_type type;
- if (hive)
+ if (hive) {
atomic_set(&hive->ras_recovery, 1);
+
+ /* If any device which is part of the hive received RAS fatal
+ * error interrupt, set fatal error status on all. This
+ * condition will need a recovery, and flag will be cleared
+ * as part of recovery.
+ */
+ list_for_each_entry(remote_adev, &hive->device_list,
+ gmc.xgmi.head)
+ if (amdgpu_ras_get_fed_status(remote_adev)) {
+ amdgpu_ras_set_fed_all(adev, hive, true);
+ break;
+ }
+ }
if (!ras->disable_ras_err_cnt_harvest) {
/* Build list of devices to query RAS related errors */
@@ -2406,10 +2589,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
device_list_handle = &device_list;
}
+ type = amdgpu_ras_get_fatal_error_event(adev);
list_for_each_entry(remote_adev,
device_list_handle, gmc.xgmi.head) {
amdgpu_ras_query_err_status(remote_adev);
- amdgpu_ras_log_on_err_counter(remote_adev);
+ amdgpu_ras_log_on_err_counter(remote_adev, type);
}
}
@@ -2420,6 +2604,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_RAS;
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
@@ -2439,18 +2624,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- /* For any RAS error that needs a full reset to
- * recover, set the fatal error status
- */
- if (hive) {
- list_for_each_entry(remote_adev,
- &hive->device_list,
- gmc.xgmi.head)
- amdgpu_ras_set_fed(remote_adev,
- true);
- } else {
- amdgpu_ras_set_fed(adev, true);
- }
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -2516,9 +2689,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
- amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
- bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
- AMDGPU_GPU_PAGE_SIZE);
+ amdgpu_ras_reserve_page(adev, bps[i].retired_page);
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
data->count++;
@@ -2674,10 +2845,236 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
}
}
+int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint16_t pasid,
+ pasid_notify pasid_fn, void *data, uint32_t reset)
+{
+ int ret = 0;
+ struct ras_poison_msg poison_msg;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ memset(&poison_msg, 0, sizeof(poison_msg));
+ poison_msg.block = block;
+ poison_msg.pasid = pasid;
+ poison_msg.reset = reset;
+ poison_msg.pasid_fn = pasid_fn;
+ poison_msg.data = data;
+
+ ret = kfifo_put(&con->poison_fifo, poison_msg);
+ if (!ret) {
+ dev_err(adev->dev, "Poison message fifo is full!\n");
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
+ struct ras_poison_msg *poison_msg)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ return kfifo_get(&con->poison_fifo, poison_msg);
+}
+
+static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
+{
+ mutex_init(&ecc_log->lock);
+
+ INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
+}
+
+static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ struct ras_ecc_err *ecc_err;
+
+ mutex_lock(&ecc_log->lock);
+ radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
+ ecc_err = radix_tree_deref_slot(slot);
+ kfree(ecc_err->err_pages.pfn);
+ kfree(ecc_err);
+ radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
+ }
+ mutex_unlock(&ecc_log->lock);
+
+ mutex_destroy(&ecc_log->lock);
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
+}
+
+static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
+ uint32_t delayed_ms)
+{
+ int ret;
+
+ mutex_lock(&con->umc_ecc_log.lock);
+ ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+ UMC_ECC_NEW_DETECTED_TAG);
+ mutex_unlock(&con->umc_ecc_log.lock);
+
+ if (ret)
+ schedule_delayed_work(&con->page_retirement_dwork,
+ msecs_to_jiffies(delayed_ms));
+
+ return ret ? true : false;
+}
+
+static void amdgpu_ras_do_page_retirement(struct work_struct *work)
+{
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+ page_retirement_dwork.work);
+ struct amdgpu_device *adev = con->adev;
+ struct ras_err_data err_data;
+ unsigned long err_cnt;
+
+ /* If gpu reset is ongoing, delay retiring the bad pages */
+ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
+ amdgpu_ras_schedule_retirement_dwork(con,
+ AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
+ return;
+ }
+
+ amdgpu_ras_error_data_init(&err_data);
+
+ amdgpu_umc_handle_bad_pages(adev, &err_data);
+ err_cnt = err_data.err_addr_cnt;
+
+ amdgpu_ras_error_data_fini(&err_data);
+
+ if (err_cnt && amdgpu_ras_is_rma(adev))
+ amdgpu_ras_reset_gpu(adev);
+
+ amdgpu_ras_schedule_retirement_dwork(con,
+ AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
+}
+
+static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+ uint32_t poison_creation_count)
+{
+ int ret = 0;
+ struct ras_ecc_log_info *ecc_log;
+ struct ras_query_if info;
+ uint32_t timeout = 0;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ uint64_t de_queried_count;
+ uint32_t new_detect_count, total_detect_count;
+ uint32_t need_query_count = poison_creation_count;
+ bool query_data_timeout = false;
+ enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
+
+ memset(&info, 0, sizeof(info));
+ info.head.block = AMDGPU_RAS_BLOCK__UMC;
+
+ ecc_log = &ras->umc_ecc_log;
+ total_detect_count = 0;
+ do {
+ ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
+ if (ret)
+ return ret;
+
+ de_queried_count = ecc_log->de_queried_count;
+ if (de_queried_count > ecc_log->prev_de_queried_count) {
+ new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
+ ecc_log->prev_de_queried_count = de_queried_count;
+ timeout = 0;
+ } else {
+ new_detect_count = 0;
+ }
+
+ if (new_detect_count) {
+ total_detect_count += new_detect_count;
+ } else {
+ if (!timeout && need_query_count)
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+
+ if (timeout) {
+ if (!--timeout) {
+ query_data_timeout = true;
+ break;
+ }
+ msleep(1);
+ }
+ }
+ } while (total_detect_count < need_query_count);
+
+ if (query_data_timeout) {
+ dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
+ (need_query_count - total_detect_count));
+ return -ENOENT;
+ }
+
+ if (total_detect_count)
+ schedule_delayed_work(&ras->page_retirement_dwork, 0);
+
+ return 0;
+}
+
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_poison_msg msg;
+ int ret;
+
+ do {
+ ret = kfifo_get(&con->poison_fifo, &msg);
+ } while (ret);
+}
+
+static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
+ uint32_t msg_count, uint32_t *gpu_reset)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ uint32_t reset_flags = 0, reset = 0;
+ struct ras_poison_msg msg;
+ int ret, i;
+
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+ for (i = 0; i < msg_count; i++) {
+ ret = amdgpu_ras_get_poison_req(adev, &msg);
+ if (!ret)
+ continue;
+
+ if (msg.pasid_fn)
+ msg.pasid_fn(adev, msg.pasid, msg.data);
+
+ reset_flags |= msg.reset;
+ }
+
+ /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
+ if (reset_flags && !amdgpu_ras_is_rma(adev)) {
+ if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
+ reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ else
+ reset = reset_flags;
+
+ flush_delayed_work(&con->page_retirement_dwork);
+
+ con->gpu_reset_flags |= reset;
+ amdgpu_ras_reset_gpu(adev);
+
+ *gpu_reset = reset;
+
+ /* Wait for gpu recovery to complete */
+ flush_work(&con->recovery_work);
+ }
+
+ return 0;
+}
+
static int amdgpu_ras_page_retirement_thread(void *param)
{
struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ uint32_t poison_creation_count, msg_count;
+ uint32_t gpu_reset;
+ int ret;
while (!kthread_should_stop()) {
@@ -2688,13 +3085,62 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop())
break;
- dev_info(adev->dev, "Start processing page retirement. request:%d\n",
- atomic_read(&con->page_retirement_req_cnt));
+ gpu_reset = 0;
+
+ do {
+ poison_creation_count = atomic_read(&con->poison_creation_count);
+ ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count);
+ if (ret == -EIO)
+ break;
+
+ if (poison_creation_count) {
+ atomic_sub(poison_creation_count, &con->poison_creation_count);
+ atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
+ }
+ } while (atomic_read(&con->poison_creation_count));
+
+ if (ret != -EIO) {
+ msg_count = kfifo_len(&con->poison_fifo);
+ if (msg_count) {
+ ret = amdgpu_ras_poison_consumption_handler(adev,
+ msg_count, &gpu_reset);
+ if ((ret != -EIO) &&
+ (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET))
+ atomic_sub(msg_count, &con->page_retirement_req_cnt);
+ }
+ }
+
+ if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+ /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
+ /* Clear poison creation request */
+ atomic_set(&con->poison_creation_count, 0);
+
+ /* Clear poison fifo */
+ amdgpu_ras_clear_poison_fifo(adev);
+
+ /* Clear all poison requests */
+ atomic_set(&con->page_retirement_req_cnt, 0);
- atomic_dec(&con->page_retirement_req_cnt);
+ if (ret == -EIO) {
+ /* Wait for mode-1 reset to complete */
+ down_read(&adev->reset_domain->sem);
+ up_read(&adev->reset_domain->sem);
+ }
+
+ /* Wake up work to save bad pages to eeprom */
+ schedule_delayed_work(&con->page_retirement_dwork, 0);
+ } else if (gpu_reset) {
+ /* gpu just completed mode-2 reset or other reset */
+ /* Clear poison consumption messages cached in fifo */
+ msg_count = kfifo_len(&con->poison_fifo);
+ if (msg_count) {
+ amdgpu_ras_clear_poison_fifo(adev);
+ atomic_sub(msg_count, &con->page_retirement_req_cnt);
+ }
- amdgpu_umc_bad_page_polling_timeout(adev,
- false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
+ /* Wake up work to save bad pages to eeprom */
+ schedule_delayed_work(&con->page_retirement_dwork, 0);
+ }
}
return 0;
@@ -2705,7 +3151,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
u32 max_eeprom_records_count = 0;
- bool exc_err_limit = false;
int ret;
if (!con || amdgpu_sriov_vf(adev))
@@ -2742,12 +3187,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
*/
if (adev->gmc.xgmi.pending_reset)
return 0;
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+ ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
/*
- * This calling fails when exc_err_limit is true or
+ * This calling fails when is_rma is true or
* ret != 0.
*/
- if (exc_err_limit || ret)
+ if (amdgpu_ras_is_rma(adev) || ret)
goto free;
if (con->eeprom_control.ras_num_recs) {
@@ -2763,9 +3208,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
}
+ mutex_init(&con->page_rsv_lock);
+ INIT_KFIFO(con->poison_fifo);
mutex_init(&con->page_retirement_lock);
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
+ atomic_set(&con->poison_creation_count, 0);
con->page_retirement_thread =
kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
if (IS_ERR(con->page_retirement_thread)) {
@@ -2773,6 +3221,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
}
+ INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
+ amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
@@ -2791,7 +3241,7 @@ out:
* Except error threshold exceeding case, other failure cases in this
* function would not fail amdgpu driver init.
*/
- if (!exc_err_limit)
+ if (!amdgpu_ras_is_rma(adev))
ret = 0;
else
ret = -EINVAL;
@@ -2803,18 +3253,33 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data = con->eh_data;
+ int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
+ bool ret;
/* recovery_init failed to init it, fini is useless */
if (!data)
return 0;
+ /* Save all cached bad pages to eeprom */
+ do {
+ flush_delayed_work(&con->page_retirement_dwork);
+ ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
+ } while (ret && max_flush_timeout--);
+
if (con->page_retirement_thread)
kthread_stop(con->page_retirement_thread);
atomic_set(&con->page_retirement_req_cnt, 0);
+ atomic_set(&con->poison_creation_count, 0);
+
+ mutex_destroy(&con->page_rsv_lock);
cancel_work_sync(&con->recovery_work);
+ cancel_delayed_work_sync(&con->page_retirement_dwork);
+
+ amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
+
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
@@ -2831,6 +3296,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
return true;
default:
return false;
@@ -2842,6 +3308,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 14):
return true;
default:
return false;
@@ -3001,6 +3468,11 @@ init_ras_enabled_flag:
/* aca is disabled by default */
adev->aca.is_enabled = false;
+
+ /* bad page feature is not applicable to specific app platform */
+ if (adev->gmc.is_app_apu &&
+ amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+ amdgpu_bad_page_threshold = 0;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
@@ -3036,6 +3508,60 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
AMDGPU_RAS_ERROR__PARITY;
}
+static void ras_event_mgr_init(struct ras_event_manager *mgr)
+{
+ struct ras_event_state *event_state;
+ int i;
+
+ memset(mgr, 0, sizeof(*mgr));
+ atomic64_set(&mgr->seqno, 0);
+
+ for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
+ event_state = &mgr->event_state[i];
+ event_state->last_seqno = RAS_EVENT_INVALID_ID;
+ atomic64_set(&event_state->count, 0);
+ }
+}
+
+static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ struct amdgpu_hive_info *hive;
+
+ if (!ras)
+ return;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
+
+ /* init event manager with node 0 on xgmi system */
+ if (!amdgpu_in_reset(adev)) {
+ if (!hive || adev->gmc.xgmi.node_id == 0)
+ ras_event_mgr_init(ras->event_mgr);
+ }
+
+ if (hive)
+ amdgpu_put_xgmi_hive(hive);
+}
+
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con || (adev->flags & AMD_IS_APU))
+ return;
+
+ switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+ case IP_VERSION(13, 0, 2):
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ break;
+ default:
+ break;
+ }
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3141,11 +3667,22 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
+ amdgpu_ras_init_reserved_vram_size(adev);
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
}
+ if (amdgpu_ras_aca_is_supported(adev)) {
+ if (amdgpu_aca_is_enabled(adev))
+ r = amdgpu_aca_init(adev);
+ else
+ r = amdgpu_mca_init(adev);
+ if (r)
+ goto release_con;
+ }
+
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
adev->ras_hw_enabled, adev->ras_enabled);
@@ -3352,23 +3889,30 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
struct amdgpu_ras_block_object *obj;
int r;
- /* Guest side doesn't need init ras feature */
- if (amdgpu_sriov_vf(adev))
- return 0;
+ amdgpu_ras_event_mgr_init(adev);
- if (amdgpu_aca_is_enabled(adev)) {
- if (amdgpu_in_reset(adev))
- r = amdgpu_aca_reset(adev);
- else
- r = amdgpu_aca_init(adev);
- if (r)
- return r;
+ if (amdgpu_ras_aca_is_supported(adev)) {
+ if (amdgpu_in_reset(adev)) {
+ if (amdgpu_aca_is_enabled(adev))
+ r = amdgpu_aca_reset(adev);
+ else
+ r = amdgpu_mca_reset(adev);
+ if (r)
+ return r;
+ }
- amdgpu_ras_set_aca_debug_mode(adev, false);
- } else {
- amdgpu_ras_set_mca_debug_mode(adev, false);
+ if (!amdgpu_sriov_vf(adev)) {
+ if (amdgpu_aca_is_enabled(adev))
+ amdgpu_ras_set_aca_debug_mode(adev, false);
+ else
+ amdgpu_ras_set_mca_debug_mode(adev, false);
+ }
}
+ /* Guest side doesn't need init ras feature */
+ if (amdgpu_sriov_vf(adev))
+ return 0;
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
obj = node->ras_obj;
if (!obj) {
@@ -3436,8 +3980,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
- if (amdgpu_aca_is_enabled(adev))
- amdgpu_aca_fini(adev);
+ if (amdgpu_ras_aca_is_supported(adev)) {
+ if (amdgpu_aca_is_enabled(adev))
+ amdgpu_aca_fini(adev);
+ else
+ amdgpu_mca_fini(adev);
+ }
WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");
@@ -3472,14 +4020,90 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
atomic_set(&ras->fed, !!status);
}
+static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras;
+
+ ras = amdgpu_ras_get_context(adev);
+ if (!ras)
+ return NULL;
+
+ return ras->event_mgr;
+}
+
+int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+ const void *caller)
+{
+ struct ras_event_manager *event_mgr;
+ struct ras_event_state *event_state;
+ int ret = 0;
+
+ if (type >= RAS_EVENT_TYPE_COUNT) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ event_mgr = __get_ras_event_mgr(adev);
+ if (!event_mgr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ event_state = &event_mgr->event_state[type];
+ event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno);
+ atomic64_inc(&event_state->count);
+
+out:
+ if (ret && caller)
+ dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n",
+ (int)type, caller, ret);
+
+ return ret;
+}
+
+u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
+{
+ struct ras_event_manager *event_mgr;
+ u64 id;
+
+ if (type >= RAS_EVENT_TYPE_COUNT)
+ return RAS_EVENT_INVALID_ID;
+
+ switch (type) {
+ case RAS_EVENT_TYPE_FATAL:
+ case RAS_EVENT_TYPE_POISON_CREATION:
+ case RAS_EVENT_TYPE_POISON_CONSUMPTION:
+ event_mgr = __get_ras_event_mgr(adev);
+ if (!event_mgr)
+ return RAS_EVENT_INVALID_ID;
+
+ id = event_mgr->event_state[type].last_seqno;
+ break;
+ case RAS_EVENT_TYPE_INVALID:
+ default:
+ id = RAS_EVENT_INVALID_ID;
+ break;
+ }
+
+ return id;
+}
+
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ enum ras_event_type type = RAS_EVENT_TYPE_FATAL;
+ u64 event_id;
+
+ if (amdgpu_ras_mark_ras_event(adev, type))
+ return;
+
+ event_id = amdgpu_ras_acquire_event_id(adev, type);
- dev_info(adev->dev, "uncorrectable hardware error"
- "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+ RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
+ "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
+ amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
@@ -3664,6 +4288,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ /* mode1 is the only selection for RMA status */
+ if (amdgpu_ras_is_rma(adev)) {
+ ras->gpu_reset_flags = 0;
+ ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ }
+
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
return 0;
@@ -3983,8 +4613,6 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
if (!err_node)
return NULL;
- INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
-
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
err_data->err_list_count++;
@@ -3994,32 +4622,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
return &err_node->err_info;
}
-void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
-{
- struct ras_err_addr *mca_err_addr;
-
- mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
- if (!mca_err_addr)
- return;
-
- INIT_LIST_HEAD(&mca_err_addr->node);
-
- mca_err_addr->err_status = err_addr->err_status;
- mca_err_addr->err_ipid = err_addr->err_ipid;
- mca_err_addr->err_addr = err_addr->err_addr;
-
- list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
-}
-
-void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
-{
- list_del(&mca_err_addr->node);
- kfree(mca_err_addr);
-}
-
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
@@ -4033,9 +4638,6 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
if (!err_info)
return -EINVAL;
- if (err_addr && err_addr->err_status)
- amdgpu_ras_add_mca_err_addr(err_info, err_addr);
-
err_info->ue_count += count;
err_data->ue_count += count;
@@ -4043,8 +4645,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
}
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
@@ -4065,8 +4667,8 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
}
int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
@@ -4080,9 +4682,6 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
if (!err_info)
return -EINVAL;
- if (err_addr && err_addr->err_status)
- amdgpu_ras_add_mca_err_addr(err_info, err_addr);
-
err_info->de_count += count;
err_data->de_count += count;
@@ -4092,64 +4691,84 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
#define mmMP0_SMN_C2PMSG_92 0x1609C
#define mmMP0_SMN_C2PMSG_126 0x160BE
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
- u32 instance, u32 boot_error)
+ u32 instance)
{
u32 socket_id, aid_id, hbm_id;
- u32 reg_data;
+ u32 fw_status;
+ u32 boot_error;
u64 reg_addr;
- socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
- aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
- hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
-
/* The pattern for smn addressing in other SOC could be different from
* the one for aqua_vanjaram. We should revisit the code if the pattern
* is changed. In such case, replace the aqua_vanjaram implementation
* with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
aqua_vanjaram_encode_ext_smn_addressing(instance);
+ fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n",
- socket_id, aid_id, reg_data);
+ reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+ aqua_vanjaram_encode_ext_smn_addressing(instance);
+ boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+
+ socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+ aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+ hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 0 : 1);
if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
+
+ if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error))
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
+ socket_id, aid_id, fw_status);
+
+ if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
+ socket_id, aid_id, fw_status);
}
-static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
- u32 instance, u32 *boot_error)
+static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
+ u32 instance)
{
- u32 reg_addr;
+ u64 reg_addr;
u32 reg_data;
int retry_loop;
@@ -4158,40 +4777,65 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) {
- *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
- return 0;
- }
- msleep(1);
- }
-
- /* The pattern for smn addressing in other SOC could be different from
- * the one for aqua_vanjaram. We should revisit the code if the pattern
- * is changed. In such case, replace the aqua_vanjaram implementation
- * with more common helper */
- reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
- aqua_vanjaram_encode_ext_smn_addressing(instance);
-
- for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
- reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
- *boot_error = reg_data;
- return 0;
- }
- msleep(1);
+ if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS)
+ return false;
+ else
+ msleep(1);
}
- *boot_error = reg_data;
- return -ETIME;
+ return true;
}
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
{
- u32 boot_error = 0;
u32 i;
for (i = 0; i < num_instances; i++) {
- if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
- amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
+ if (amdgpu_ras_boot_error_detected(adev, i))
+ amdgpu_ras_boot_time_error_reporting(adev, i);
}
}
+
+int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+ uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
+ int ret = 0;
+
+ mutex_lock(&con->page_rsv_lock);
+ ret = amdgpu_vram_mgr_query_page_status(mgr, start);
+ if (ret == -ENOENT)
+ ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
+ mutex_unlock(&con->page_rsv_lock);
+
+ return ret;
+}
+
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (RAS_EVENT_ID_IS_VALID(event_id))
+ dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf);
+ else
+ dev_printk(KERN_INFO, adev->dev, "%pV", &vaf);
+
+ va_end(args);
+}
+
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return false;
+
+ return con->is_rma;
+}