diff options
Diffstat (limited to 'drivers/ras')
-rw-r--r-- | drivers/ras/amd/atl/internal.h | 3 | ||||
-rw-r--r-- | drivers/ras/amd/atl/umc.c | 19 | ||||
-rw-r--r-- | drivers/ras/amd/fmpm.c | 9 |
3 files changed, 28 insertions, 3 deletions
diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
index f9be26d25348..d096b58cd0ae 100644
--- a/drivers/ras/amd/atl/internal.h
+++ b/drivers/ras/amd/atl/internal.h
@@ -362,4 +362,7 @@ static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx)
 	atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode);
 }
 
+#define MI300_UMC_MCA_COL GENMASK(5, 1)
+#define MI300_UMC_MCA_ROW13 BIT(23)
+
 #endif /* __AMD_ATL_INTERNAL_H__ */
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index dc8aa12f63c8..6e072b7667e9 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -229,7 +229,6 @@ int get_umc_info_mi300(void)
  * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
  * reconstructing the normalized address.
  */
-#define MI300_UMC_MCA_COL GENMASK(5, 1)
 #define MI300_UMC_MCA_BANK GENMASK(9, 6)
 #define MI300_UMC_MCA_ROW GENMASK(24, 10)
 #define MI300_UMC_MCA_PC BIT(25)
@@ -320,7 +319,7 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
  * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
  */
 #define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
-static void retire_row_mi300(struct atl_err *a_err)
+static void _retire_row_mi300(struct atl_err *a_err)
 {
 	unsigned long addr;
 	struct page *p;
@@ -351,6 +350,22 @@ static void retire_row_mi300(struct atl_err *a_err)
 	}
 }
 
+/*
+ * In addition to the column bits, the row[13] bit should also be included when
+ * calculating addresses affected by a physical row.
+ *
+ * Instead of running through another loop over a single bit, just run through
+ * the column bits twice and flip the row[13] bit in-between.
+ *
+ * See MI300_UMC_MCA_ROW for the row bits in MCA_ADDR_UMC value.
+ */
+static void retire_row_mi300(struct atl_err *a_err)
+{
+	_retire_row_mi300(a_err);
+	a_err->addr ^= MI300_UMC_MCA_ROW13;
+	_retire_row_mi300(a_err);
+}
+
 void amd_retire_dram_row(struct atl_err *a_err)
 {
 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
index 90de737fbc90..8877c6ff64c4 100644
--- a/drivers/ras/amd/fmpm.c
+++ b/drivers/ras/amd/fmpm.c
@@ -250,6 +250,13 @@ static bool rec_has_valid_entries(struct fru_rec *rec)
 	return true;
 }
 
+/*
+ * Row retirement is done on MI300 systems, and some bits are 'don't
+ * care' for comparing addresses with unique physical rows. This
+ * includes all column bits and the row[13] bit.
+ */
+#define MASK_ADDR(addr) ((addr) & ~(MI300_UMC_MCA_ROW13 | MI300_UMC_MCA_COL))
+
 static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
 {
 	/*
@@ -258,7 +265,7 @@ static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_
 	 *
 	 * Also, order the checks from most->least likely to fail to shortcut the code.
 	 */
-	if (old->addr != new->addr)
+	if (MASK_ADDR(old->addr) != MASK_ADDR(new->addr))
 		return false;
 
 	if (old->hw_id != new->hw_id)