From b6c7256688264f5096a2cfaaaa56d01ea686b367 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Fri, 9 Jun 2023 04:01:48 -0700 Subject: RDMA/bnxt_re: Add firmware stall check detection Every completion will update last_seen value in the unit of jiffies. last_seen field will be used to know if firmware is alive and is useful to detect firmware stall. Non blocking interface __wait_for_resp will have logic to detect firmware stall. After every 10 second interval if __wait_for_resp has not received completion for a given command it will check for firmware stall condition. If current jiffies is greater than last_seen jiffies + RCFW_FW_STALL_TIMEOUT_SEC * HZ, it is a firmware stall. Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/1686308514-11996-12-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 44 ++++++++++++++++++++++++------ drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 5 ++-- 2 files changed, 39 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 4e5f66e5c714..349fbed003d3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -112,11 +112,13 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; - /* Non zero means command completed */ wait_event_timeout(cmdq->waitq, !test_bit(cbit, cmdq->cmdq_bitmap), - msecs_to_jiffies(10000)); + msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC + * 1000)); if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; @@ -126,6 +128,11 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; + /* Firmware stall is detected */ + if (time_after(jiffies, cmdq->last_seen + + (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) + return -ENODEV; + } while (true); }; @@ -154,6 +161,8 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode) do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; udelay(1); @@ -184,9 +193,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, hwq = &cmdq->hwq; pdev = rcfw->pdev; - if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags)) - return -ETIMEDOUT; - /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ @@ -285,14 +291,21 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, do { if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; usleep_range(1000, 1001); bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); if (!test_bit(cbit, cmdq->cmdq_bitmap)) return 0; - if (jiffies_to_msecs(jiffies - issue_time) > 10000) - return -ETIMEDOUT; + if (jiffies_to_msecs(jiffies - issue_time) > + (RCFW_FW_STALL_TIMEOUT_SEC * 1000)) { + /* Firmware stall is detected */ + if (time_after(jiffies, cmdq->last_seen + + (RCFW_FW_STALL_TIMEOUT_SEC * HZ))) + return -ENODEV; + } } while (true); }; @@ -308,6 +321,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, /* Prevent posting if f/w is not in a state to process */ if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) return -ENXIO; + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { @@ -375,7 +390,6 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, /* timed out */ dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); - set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags); return rc; } @@ -383,6 +397,8 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); crsqe = &rcfw->crsqe_tbl[cbit]; crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); return -ETIMEDOUT; } @@ -533,6 +549,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, cookie &= RCFW_MAX_COOKIE_VALUE; cbit = cookie % rcfw->cmdq_depth; crsqe = &rcfw->crsqe_tbl[cbit]; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "QPLIB: Unreponsive rcfw channel detected.!!")) { + dev_info(&pdev->dev, + "rcfw timedout: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock_irqrestore(&hwq->lock, flags); + return rc; + } + if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) dev_warn(&pdev->dev, "CMD bit %d was not requested\n", cbit); @@ -582,6 +609,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) * reading any further. */ dma_rmb(); + rcfw->cmdq.last_seen = jiffies; type = creqe->type & CREQ_BASE_TYPE_MASK; switch (type) { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 6ed81c180821..54576f12c8da 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -51,6 +51,7 @@ #define RCFW_DBR_PCI_BAR_REGION 2 #define RCFW_DBR_BASE_PAGE_SHIFT 12 +#define RCFW_FW_STALL_TIMEOUT_SEC 40 /* Cmdq contains a fix number of a 16-Byte slots */ struct bnxt_qplib_cmdqe { @@ -128,7 +129,6 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) #define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) #define RCFW_CMD_IS_BLOCKING 0x8000 -#define RCFW_BLOCKED_CMD_WAIT_COUNT 20000000UL /* 20 sec */ /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { @@ -170,7 +170,7 @@ struct bnxt_qplib_qp_node { #define FIRMWARE_INITIALIZED_FLAG (0) #define FIRMWARE_FIRST_FLAG (31) -#define FIRMWARE_TIMED_OUT (3) +#define FIRMWARE_STALL_DETECTED (3) #define ERR_DEVICE_DETACHED (4) struct bnxt_qplib_cmdq_mbox { @@ -185,6 +185,7 @@ struct bnxt_qplib_cmdq_ctx { wait_queue_head_t waitq; unsigned long flags; unsigned long *cmdq_bitmap; + unsigned long last_seen; u32 seq_num; }; -- cgit v1.2.3