diff options
3 files changed, 199 insertions, 1 deletions
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index eb91519aacd7..b1ee6feb0caf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -229,6 +229,9 @@ enum hclge_opcode_type { HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513, HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514, HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515, + HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580, + HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581, + HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584, HCLGE_IGU_EGU_TNL_INT_EN = 0x1803, HCLGE_IGU_COMMON_INT_EN = 0x1806, HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index 81ad9b415556..77deea0beeba 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = { { /* sentinel */ } }; +static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = { + { .int_msk = 0, .msg = "rocee qmm ovf: sgid invalid err" }, + { .int_msk = 0x4, .msg = "rocee qmm ovf: sgid ovf err" }, + { .int_msk = 0x8, .msg = "rocee qmm ovf: smac invalid err" }, + { .int_msk = 0xC, .msg = "rocee qmm ovf: smac ovf err" }, + { .int_msk = 0x10, .msg = "rocee qmm ovf: cqc invalid err" }, + { .int_msk = 0x11, .msg = "rocee qmm ovf: cqc ovf err" }, + { .int_msk = 0x12, .msg = "rocee qmm ovf: cqc hopnum err" }, + { .int_msk = 0x13, .msg = "rocee qmm ovf: cqc ba0 err" }, + { .int_msk = 0x14, .msg = "rocee qmm ovf: srqc invalid err" }, + { .int_msk = 0x15, .msg = "rocee qmm ovf: srqc ovf err" }, + { .int_msk = 0x16, .msg = "rocee qmm ovf: srqc hopnum err" }, + { .int_msk = 0x17, .msg = "rocee qmm ovf: srqc ba0 err" }, + { .int_msk = 0x18, .msg = "rocee qmm ovf: mpt invalid err" }, + { .int_msk = 0x19, .msg = "rocee qmm ovf: mpt ovf err" }, + { .int_msk = 0x1A, .msg = "rocee qmm ovf: mpt hopnum err" }, + { .int_msk = 0x1B, .msg = "rocee qmm ovf: mpt ba0 err" }, + { .int_msk = 0x1C, .msg = "rocee qmm ovf: qpc invalid err" }, + { .int_msk = 0x1D, .msg = "rocee qmm ovf: qpc ovf err" }, + { .int_msk = 0x1E, .msg = "rocee qmm ovf: qpc hopnum err" }, + { .int_msk = 0x1F, .msg = "rocee qmm ovf: qpc ba0 err" }, + { /* sentinel */ } +}; + static void hclge_log_error(struct device *dev, char *reg, const struct hclge_hw_error *err, u32 err_sts) @@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev) return ret; } +static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev) +{ + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc[2]; + int ret; + + /* read overflow error status */ + ret = hclge_cmd_query_error(hdev, &desc[0], + HCLGE_ROCEE_PF_RAS_INT_CMD, + 0, 0, 0); + if (ret) { + dev_err(dev, "failed(%d) to query ROCEE OVF error sts\n", ret); + return ret; + } + + /* log overflow error */ + if (le32_to_cpu(desc[0].data[0]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) { + const struct hclge_hw_error *err; + u32 err_sts; + + err = &hclge_rocee_qmm_ovf_err_int[0]; + err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK & + le32_to_cpu(desc[0].data[0]); + while (err->msg) { + if (err->int_msk == err_sts) { + dev_warn(dev, "%s [error status=0x%x] found\n", + err->msg, + le32_to_cpu(desc[0].data[0])); + break; + } + err++; + } + } + + if (le32_to_cpu(desc[0].data[1]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) { + dev_warn(dev, "ROCEE TSP OVF [error status=0x%x] found\n", + le32_to_cpu(desc[0].data[1])); + } + + if (le32_to_cpu(desc[0].data[2]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) { + dev_warn(dev, "ROCEE SCC OVF [error status=0x%x] found\n", + le32_to_cpu(desc[0].data[2])); + } + + return 0; +} + +static int hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev) +{ + enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET; + struct hnae3_ae_dev *ae_dev = hdev->ae_dev; + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc[2]; + unsigned int status; + int ret; + + /* read RAS error interrupt status */ + ret = hclge_cmd_query_error(hdev, &desc[0], + HCLGE_QUERY_CLEAR_ROCEE_RAS_INT, + 0, 0, 0); + if (ret) { + dev_err(dev, "failed(%d) to query ROCEE RAS INT SRC\n", ret); + /* reset everything for now */ + HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET); + return ret; + } + + status = le32_to_cpu(desc[0].data[0]); + + if (status & HCLGE_ROCEE_RERR_INT_MASK) + dev_warn(dev, "ROCEE RAS AXI rresp error\n"); + + if (status & HCLGE_ROCEE_BERR_INT_MASK) + dev_warn(dev, "ROCEE RAS AXI bresp error\n"); + + if (status & HCLGE_ROCEE_ECC_INT_MASK) { + dev_warn(dev, "ROCEE RAS 2bit ECC error\n"); + reset_type = HNAE3_GLOBAL_RESET; + } + + if (status & HCLGE_ROCEE_OVF_INT_MASK) { + ret = hclge_log_rocee_ovf_error(hdev); + if (ret) { + dev_err(dev, "failed(%d) to process ovf error\n", ret); + /* reset everything for now */ + HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET); + return ret; + } + } + + /* clear error status */ + hclge_cmd_reuse_desc(&desc[0], false); + ret = hclge_cmd_send(&hdev->hw, &desc[0], 1); + if (ret) { + dev_err(dev, "failed(%d) to clear ROCEE RAS error\n", ret); + /* reset everything for now */ + reset_type = HNAE3_GLOBAL_RESET; + } + + HCLGE_SET_DEFAULT_RESET_REQUEST(reset_type); + + return ret; +} + +static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en) +{ + struct device *dev = &hdev->pdev->dev; + struct hclge_desc desc; + int ret; + + if (hdev->pdev->revision < 0x21 || !hnae3_dev_roce_supported(hdev)) + return 0; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_CONFIG_ROCEE_RAS_INT_EN, false); + if (en) { + /* enable ROCEE hw error interrupts */ + desc.data[0] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN); + desc.data[1] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN); + + hclge_log_and_clear_rocee_ras_error(hdev); + } + desc.data[2] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN_MASK); + desc.data[3] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN_MASK); + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) + dev_err(dev, "failed(%d) to config ROCEE RAS interrupt\n", ret); + + return ret; +} + +static int hclge_handle_rocee_ras_error(struct hnae3_ae_dev *ae_dev) +{ + struct hclge_dev *hdev = ae_dev->priv; + + if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || + hdev->pdev->revision < 0x21) + return HNAE3_NONE_RESET; + + return hclge_log_and_clear_rocee_ras_error(hdev); +} + static const struct hclge_hw_blk hw_blk[] = { { .msk = BIT(0), .name = "IGU_EGU", @@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = { int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) { const struct hclge_hw_blk *module = hw_blk; + struct device *dev = &hdev->pdev->dev; int ret = 0; while (module->name) { @@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state) module++; } + ret = hclge_config_rocee_ras_interrupt(hdev, state); + if (ret) + dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret); + return ret; } @@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev) "HNS Non-Fatal RAS error(status=0x%x) identified\n", status); hclge_handle_all_ras_errors(hdev); - return PCI_ERS_RESULT_NEED_RESET; + } else { + if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || + hdev->pdev->revision < 0x21) + return PCI_ERS_RESULT_RECOVERED; + } + + if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) { + dev_warn(dev, "ROCEE uncorrected RAS error identified\n"); + hclge_handle_rocee_ras_error(ae_dev); } + if (status & HCLGE_RAS_REG_NFE_MASK || + status & HCLGE_RAS_REG_ROCEE_ERR_MASK) + return PCI_ERS_RESULT_NEED_RESET; + return PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index d2077f2795fa..51a7d4eb066a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -8,6 +8,7 @@ #define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00 #define HCLGE_RAS_REG_NFE_MASK 0xFF00 +#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000 #define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800 #define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00 @@ -83,6 +84,17 @@ #define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0) #define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0) +#define HCLGE_ROCEE_RAS_NFE_INT_EN 0xF +#define HCLGE_ROCEE_RAS_CE_INT_EN 0x1 +#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK 0xF +#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK 0x1 +#define HCLGE_ROCEE_RERR_INT_MASK BIT(0) +#define HCLGE_ROCEE_BERR_INT_MASK BIT(1) +#define HCLGE_ROCEE_ECC_INT_MASK BIT(2) +#define HCLGE_ROCEE_OVF_INT_MASK BIT(3) +#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000 +#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F + enum hclge_err_int_type { HCLGE_ERR_INT_MSIX = 0, HCLGE_ERR_INT_RAS_CE = 1, |