summary | refs | log | tree | commit | diff | stats
path: root/drivers/accel
diff options
context:
space:
mode:
author: Ofir Bitton <obitton@habana.ai> 2023-07-25 21:11:56 +0300
committer: Oded Gabbay <ogabbay@kernel.org> 2023-10-09 12:37:21 +0300
commit: d261b0ab131e2511b70f7bc4a3737d3b90ca6e87 (patch)
tree: e860450dfb5b564ccab5f7bd32a0b5feba708390 /drivers/accel
parent: 10d260f655c1af1b5a5e7d0cea001e3d0461aeaa (diff)
download: linux-d261b0ab131e2511b70f7bc4a3737d3b90ca6e87.tar.gz
linux-d261b0ab131e2511b70f7bc4a3737d3b90ca6e87.tar.bz2
linux-d261b0ab131e2511b70f7bc4a3737d3b90ca6e87.zip
accel/habanalabs/gaudi2: include block id in ECC error reporting
During ECC event handling, the memory wrapper id was mistakenly printed as the block id. Fix the print and, in addition, fetch the actual block id from firmware.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel')
-rw-r--r-- drivers/accel/habanalabs/gaudi2/gaudi2.c 23
-rw-r--r-- drivers/accel/habanalabs/include/common/cpucp_if.h 3
2 files changed, 20 insertions, 6 deletions
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index d60389b6700f..dca19be42d5f 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7834,16 +7834,29 @@ static void gaudi2_print_event(struct hl_device *hdev, u16 event_type,
static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
struct hl_eq_ecc_data *ecc_data)
{
- u64 ecc_address = 0, ecc_syndrom = 0;
+ u64 ecc_address = 0, ecc_syndrome = 0;
u8 memory_wrapper_idx = 0;
+ bool has_block_id = false;
+ u16 block_id;
+
+ if (!hl_is_fw_sw_ver_below(hdev, 1, 12))
+ has_block_id = true;
ecc_address = le64_to_cpu(ecc_data->ecc_address);
- ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
+ ecc_syndrome = le64_to_cpu(ecc_data->ecc_syndrom);
memory_wrapper_idx = ecc_data->memory_wrapper_idx;
- gaudi2_print_event(hdev, event_type, !ecc_data->is_critical,
- "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.",
- ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical);
+ if (has_block_id) {
+ block_id = le16_to_cpu(ecc_data->block_id);
+ gaudi2_print_event(hdev, event_type, !ecc_data->is_critical,
+ "ECC error detected. address: %#llx. Syndrome: %#llx. wrapper id %u. block id %#x. critical %u.",
+ ecc_address, ecc_syndrome, memory_wrapper_idx, block_id,
+ ecc_data->is_critical);
+ } else {
+ gaudi2_print_event(hdev, event_type, !ecc_data->is_critical,
+ "ECC error detected. address: %#llx. Syndrome: %#llx. wrapper id %u. critical %u.",
+ ecc_address, ecc_syndrome, memory_wrapper_idx, ecc_data->is_critical);
+ }
return !!ecc_data->is_critical;
}
diff --git a/drivers/accel/habanalabs/include/common/cpucp_if.h b/drivers/accel/habanalabs/include/common/cpucp_if.h
index 33807b839c37..ef7d32224066 100644
--- a/drivers/accel/habanalabs/include/common/cpucp_if.h
+++ b/drivers/accel/habanalabs/include/common/cpucp_if.h
@@ -69,7 +69,8 @@ struct hl_eq_ecc_data {
__le64 ecc_syndrom;
__u8 memory_wrapper_idx;
__u8 is_critical;
- __u8 pad[6];
+ __le16 block_id;
+ __u8 pad[4];
};
enum hl_sm_sei_cause {