From 08c7f09356e45d093d1867c7a3c6ac6526e2f98b Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Sun, 7 May 2023 11:29:29 -0700 Subject: RDMA/bnxt_re: Fix the page_size used during the MR creation Driver populates the list of pages used for Memory region wrongly when page size is more than system page size. This is causing a failure when some of the applications that creates MR with page size as 2M. Since HW can support multiple page sizes, pass the correct page size while creating the MR. Also, driver need not adjust the number of pages when HW Queues are created with user memory. It should work with the number of dma blocks returned by ib_umem_num_dma_blocks. Fix this calculation also. Fixes: 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory allocation") Fixes: f6919d56388c ("RDMA/bnxt_re: Code refactor while populating user MRs") Link: https://lore.kernel.org/r/1683484169-9539-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_res.c | 12 ++---------- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 7 +++---- 2 files changed, 5 insertions(+), 14 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 126d4f26f75a..81b0c5e879f9 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -215,17 +215,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, return -EINVAL; hwq_attr->sginfo->npages = npages; } else { - unsigned long sginfo_num_pages = ib_umem_num_dma_blocks( - hwq_attr->sginfo->umem, hwq_attr->sginfo->pgsize); - + npages = ib_umem_num_dma_blocks(hwq_attr->sginfo->umem, + hwq_attr->sginfo->pgsize); hwq->is_user = true; - npages = sginfo_num_pages; - npages = (npages * PAGE_SIZE) / - BIT_ULL(hwq_attr->sginfo->pgshft); - if ((sginfo_num_pages * PAGE_SIZE) % - BIT_ULL(hwq_attr->sginfo->pgshft)) - if (!npages) - npages++; } if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 1714a1e23113..b967a17a44be 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -617,16 +617,15 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, /* Free the hwq if it already exist, must be a rereg */ if (mr->hwq.max_elements) bnxt_qplib_free_hwq(res, &mr->hwq); - /* Use system PAGE_SIZE */ hwq_attr.res = res; hwq_attr.depth = pages; - hwq_attr.stride = buf_pg_size; + hwq_attr.stride = sizeof(dma_addr_t); hwq_attr.type = HWQ_TYPE_MR; hwq_attr.sginfo = &sginfo; hwq_attr.sginfo->umem = umem; hwq_attr.sginfo->npages = pages; - hwq_attr.sginfo->pgsize = PAGE_SIZE; - hwq_attr.sginfo->pgshft = PAGE_SHIFT; + hwq_attr.sginfo->pgsize = buf_pg_size; + hwq_attr.sginfo->pgshft = ilog2(buf_pg_size); rc = bnxt_qplib_alloc_init_hwq(&mr->hwq, &hwq_attr); if (rc) { dev_err(&res->pdev->dev, -- cgit v1.2.3 From 17eabd6a044b57b2c1b5459eb9d11af3ea909e63 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 15 May 2023 15:10:57 -0500 Subject: RDMA/rxe: Fix double unlock in rxe_qp.c A recent patch can cause a double spin_unlock_bh() in rxe_qp_to_attr() at line 715 in rxe_qp.c. Move the 2nd unlock into the if statement. Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Link: https://lore.kernel.org/r/20230515201056.1591140-1-rpearsonhpe@gmail.com Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/27773078-40ce-414f-8b97-781954da9f25@kili.mountain Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c5451a4488ca..245dd36638c7 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -712,8 +712,9 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) if (qp->attr.sq_draining) { spin_unlock_bh(&qp->state_lock); cond_resched(); + } else { + spin_unlock_bh(&qp->state_lock); } - spin_unlock_bh(&qp->state_lock); return 0; } -- cgit v1.2.3 From b5f3fe27c54b1fe77d4dd834ccd48a6d694f872e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 May 2023 11:50:56 +0800 Subject: RDMA/rxe: Convert spin_{lock_bh,unlock_bh} to spin_{lock_irqsave,unlock_irqrestore} We need to call spin_lock_irqsave()/spin_unlock_irqrestore() for state_lock in rxe, otherwsie the callchain: ib_post_send_mad -> spin_lock_irqsave -> ib_post_send -> rxe_post_send -> spin_lock_bh -> spin_unlock_bh -> spin_unlock_irqrestore Causes below traces during run block nvmeof-mp/001 test due to mismatched spinlock nesting: WARNING: CPU: 0 PID: 94794 at kernel/softirq.c:376 __local_bh_enable_ip+0xc2/0x140 [ ... ] CPU: 0 PID: 94794 Comm: kworker/u4:1 Tainted: G E 6.4.0-rc1 #9 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 Workqueue: rdma_cm cma_work_handler [rdma_cm] RIP: 0010:__local_bh_enable_ip+0xc2/0x140 Code: 48 85 c0 74 72 5b 41 5c 5d 31 c0 89 c2 89 c1 89 c6 89 c7 41 89 c0 e9 bd 0e 11 01 65 8b 05 f2 65 72 48 85 c0 0f 85 76 ff ff ff <0f> 0b e9 6f ff ff ff e8 d2 39 1c 00 eb 80 4c 89 e7 e8 68 ad 0a 00 RSP: 0018:ffffb7cf818539f0 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000201 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000201 RDI: ffffffffc0f25f79 RBP: ffffb7cf81853a00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffc0f25f79 R13: ffff8db1f0fa6000 R14: ffff8db2c63ff000 R15: 00000000000000e8 FS: 0000000000000000(0000) GS:ffff8db33bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559758db0f20 CR3: 0000000105124000 CR4: 00000000003506f0 Call Trace: _raw_spin_unlock_bh+0x31/0x40 rxe_post_send+0x59/0x8b0 [rdma_rxe] ib_send_mad+0x26b/0x470 [ib_core] ib_post_send_mad+0x150/0xb40 [ib_core] ? cm_form_tid+0x5b/0x90 [ib_cm] ib_send_cm_req+0x7c8/0xb70 [ib_cm] rdma_connect_locked+0x433/0x940 [rdma_cm] nvme_rdma_cm_handler+0x5d7/0x9c0 [nvme_rdma] cma_cm_event_handler+0x4f/0x170 [rdma_cm] cma_work_handler+0x6a/0xe0 [rdma_cm] process_one_work+0x2a9/0x580 worker_thread+0x52/0x3f0 ? __pfx_worker_thread+0x10/0x10 kthread+0x109/0x140 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 0 PID: 94794 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x37/0x60 [ ... ] CPU: 0 PID: 94794 Comm: kworker/u4:1 Tainted: G W E 6.4.0-rc1 #9 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.15.0-0-g2dd4b9b-rebuilt.opensuse.org 04/01/2014 Workqueue: rdma_cm cma_work_handler [rdma_cm] RIP: 0010:warn_bogus_irq_restore+0x37/0x60 Code: fb 01 77 36 83 e3 01 74 0e 48 8b 5d f8 c9 31 f6 89 f7 e9 ac ea 01 00 48 c7 c7 e0 52 33 b9 c6 05 bb 1c 69 01 01 e8 39 24 f0 fe <0f> 0b 48 8b 5d f8 c9 31 f6 89 f7 e9 89 ea 01 00 0f b6 f3 48 c7 c7 RSP: 0018:ffffb7cf81853a58 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb7cf81853a60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8db2cfb1a9e8 R13: ffff8db2cfb1a9d8 R14: ffff8db2c63ff000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8db33bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559758db0f20 CR3: 0000000105124000 CR4: 00000000003506f0 Call Trace: _raw_spin_unlock_irqrestore+0x91/0xa0 ib_send_mad+0x1e3/0x470 [ib_core] ib_post_send_mad+0x150/0xb40 [ib_core] ? cm_form_tid+0x5b/0x90 [ib_cm] ib_send_cm_req+0x7c8/0xb70 [ib_cm] rdma_connect_locked+0x433/0x940 [rdma_cm] nvme_rdma_cm_handler+0x5d7/0x9c0 [nvme_rdma] cma_cm_event_handler+0x4f/0x170 [rdma_cm] cma_work_handler+0x6a/0xe0 [rdma_cm] process_one_work+0x2a9/0x580 worker_thread+0x52/0x3f0 ? __pfx_worker_thread+0x10/0x10 kthread+0x109/0x140 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Link: https://lore.kernel.org/r/20230510035056.881196-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 26 +++++++++++++++---------- drivers/infiniband/sw/rxe/rxe_net.c | 7 ++++--- drivers/infiniband/sw/rxe/rxe_qp.c | 36 ++++++++++++++++++++++------------- drivers/infiniband/sw/rxe/rxe_recv.c | 9 +++++---- drivers/infiniband/sw/rxe/rxe_req.c | 30 ++++++++++++++++------------- drivers/infiniband/sw/rxe/rxe_resp.c | 14 ++++++++------ drivers/infiniband/sw/rxe/rxe_verbs.c | 25 ++++++++++++------------ 7 files changed, 86 insertions(+), 61 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index db18ace74d2b..f46c5a5fd0ae 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -115,15 +115,16 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + unsigned long flags; rxe_dbg_qp(qp, "retransmit timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->comp.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) @@ -481,11 +482,13 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) static void comp_check_sq_drain_done(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_SQD)) { if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -499,7 +502,7 @@ static void comp_check_sq_drain_done(struct rxe_qp *qp) return; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static inline enum comp_state complete_ack(struct rxe_qp *qp, @@ -625,13 +628,15 @@ static void free_pkt(struct rxe_pkt_info *pkt) */ static void reset_retry_timer(struct rxe_qp *qp) { + unsigned long flags; + if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) { - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) >= IB_QPS_RTS && psn_compare(qp->req.psn, qp->comp.psn) > 0) mod_timer(&qp->retrans_timer, jiffies + qp->qp_timeout_jiffies); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } } @@ -643,18 +648,19 @@ int rxe_completer(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; enum comp_state state; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_resp_pkts(qp); flush_send_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->comp.timeout) { qp->comp.timeout_retry = 1; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 2bc7361152ea..a38fab19bed1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -412,15 +412,16 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int err; int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 245dd36638c7..61a2eb77d999 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -300,6 +300,7 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + unsigned long flags; rxe_get(pd); rxe_get(rcq); @@ -325,10 +326,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, if (err) goto err2; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; @@ -492,24 +493,28 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_sched_task(&qp->resp.task); rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } /* caller should hold qp->state_lock */ @@ -555,14 +560,16 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.cur_qp_state = attr->qp_state; if (mask & IB_QP_STATE) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); err = __qp_chk_state(qp, attr, mask); if (!err) { qp->attr.qp_state = attr->qp_state; rxe_dbg_qp(qp, "state -> %s\n", qps2str[attr->qp_state]); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (err) return err; @@ -688,6 +695,8 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, /* called by the query qp verb */ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { + unsigned long flags; + *attr = qp->attr; attr->rq_psn = qp->resp.psn; @@ -708,12 +717,12 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) /* Applications that get this state typically spin on it. * Yield the processor */ - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->attr.sq_draining) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); } else { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } return 0; @@ -737,10 +746,11 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->valid = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; if (qp_type(qp) == IB_QPT_RC) { diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 2f953cc74256..5861e4244049 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -14,6 +14,7 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct rxe_qp *qp) { unsigned int pkt_type; + unsigned long flags; if (unlikely(!qp->valid)) return -EINVAL; @@ -38,19 +39,19 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return -EINVAL; } - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (pkt->mask & RXE_REQ_MASK) { if (unlikely(qp_state(qp) < IB_QPS_RTR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } else { if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 65134a9aefe7..5fe7cbae3031 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -99,17 +99,18 @@ static void req_retry(struct rxe_qp *qp) void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + unsigned long flags; rxe_dbg_qp(qp, "nak timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; rxe_sched_task(&qp->req.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void req_check_sq_drain_done(struct rxe_qp *qp) @@ -118,8 +119,9 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) unsigned int index; unsigned int cons; struct rxe_send_wqe *wqe; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_SQD) { q = qp->sq.queue; index = qp->req.wqe_index; @@ -140,7 +142,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) break; qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -154,7 +156,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) return; } while (0); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) @@ -173,6 +175,7 @@ static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) { struct rxe_send_wqe *wqe; + unsigned long flags; req_check_sq_drain_done(qp); @@ -180,13 +183,13 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) if (wqe == NULL) return NULL; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely((qp_state(qp) == IB_QPS_SQD) && (wqe->state != wqe_state_processing))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return NULL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; @@ -676,16 +679,17 @@ int rxe_requester(struct rxe_qp *qp) struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (wqe) goto err; else @@ -700,10 +704,10 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wait_psn = 0; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); /* we come here if the retransmit timer has fired * or if the rnr timer has fired. If the retransmit diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 68f6cd188d8e..1da044f6b7d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1047,6 +1047,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, struct ib_uverbs_wc *uwc = &cqe.uibwc; struct rxe_recv_wqe *wqe = qp->resp.wqe; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; if (!wqe) goto finish; @@ -1137,12 +1138,12 @@ static enum resp_states do_complete(struct rxe_qp *qp, return RESPST_ERR_CQ_OVERFLOW; finish: - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_ERR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return RESPST_CHK_RESOURCE; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(!pkt)) return RESPST_DONE; @@ -1468,18 +1469,19 @@ int rxe_responder(struct rxe_qp *qp) enum resp_states state; struct rxe_pkt_info *pkt = NULL; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_req_pkts(qp); flush_recv_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index dea605b7f683..4d8f6b8051ff 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -904,10 +904,10 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (!err) rxe_sched_task(&qp->req.task); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->comp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } @@ -917,22 +917,23 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, { struct rxe_qp *qp = to_rqp(ibqp); int err; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ @@ -1008,22 +1009,22 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, struct rxe_rq *rq = &qp->rq; unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; @@ -1044,10 +1045,10 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->resp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } -- cgit v1.2.3 From 866422cdddcdf59d8c68e9472d49ba1be29b5fcf Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Thu, 11 May 2023 11:51:03 +0000 Subject: RDMA/efa: Fix unsupported page sizes in device Device uses 4KB size blocks for user pages indirect list while the driver creates those blocks with the size of PAGE_SIZE of the kernel. On kernels with PAGE_SIZE different than 4KB (ARM RHEL), this leads to a failure on register MR with indirect list because of the miss communication between driver and device. Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Link: https://lore.kernel.org/r/20230511115103.13876-1-ynachum@amazon.com Reviewed-by: Firas Jahjah Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 8eca6c14d0cf..2a195c4b0f17 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1403,7 +1403,7 @@ static int pbl_continuous_initialize(struct efa_dev *dev, */ static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) { - u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE); struct scatterlist *sgl; int sg_dma_cnt, err; -- cgit v1.2.3 From 58caa2a51ad4fd21763696cc6c4defc9fc1b4b4f Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 May 2023 17:22:43 +0800 Subject: RDMA/hns: Fix timeout attr in query qp for HIP08 On HIP08, the queried timeout attr is different from the timeout attr configured by the user. It is found by rdma-core testcase test_rdmacm_async_traffic: ====================================================================== FAIL: test_rdmacm_async_traffic (tests.test_rdmacm.CMTestCase) ---------------------------------------------------------------------- Traceback (most recent call last): File "./tests/test_rdmacm.py", line 33, in test_rdmacm_async_traffic self.two_nodes_rdmacm_traffic(CMAsyncConnection, self.rdmacm_traffic, File "./tests/base.py", line 382, in two_nodes_rdmacm_traffic raise(res) AssertionError Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Link: https://lore.kernel.org/r/20230512092245.344442-2-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 17 ++++++++++++++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 ++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 84f1167de1d9..3a1c90406ed9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5012,7 +5012,6 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) { #define QP_ACK_TIMEOUT_MAX_HIP08 20 -#define QP_ACK_TIMEOUT_OFFSET 10 #define QP_ACK_TIMEOUT_MAX 31 if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { @@ -5021,7 +5020,7 @@ static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) "local ACK timeout shall be 0 to 20.\n"); return false; } - *timeout += QP_ACK_TIMEOUT_OFFSET; + *timeout += HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; } else if (hr_dev->pci_dev->revision > PCI_REVISION_ID_HIP08) { if (*timeout > QP_ACK_TIMEOUT_MAX) { ibdev_warn(&hr_dev->ib_dev, @@ -5307,6 +5306,18 @@ out: return ret; } +static u8 get_qp_timeout_attr(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_qp_context *context) +{ + u8 timeout; + + timeout = (u8)hr_reg_read(context, QPC_AT); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + timeout -= HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; + + return timeout; +} + static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) @@ -5384,7 +5395,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->max_dest_rd_atomic = 1 << hr_reg_read(&context, QPC_RR_MAX); qp_attr->min_rnr_timer = (u8)hr_reg_read(&context, QPC_MIN_RNR_TIME); - qp_attr->timeout = (u8)hr_reg_read(&context, QPC_AT); + qp_attr->timeout = get_qp_timeout_attr(hr_dev, &context); qp_attr->retry_cnt = hr_reg_read(&context, QPC_RETRY_NUM_INIT); qp_attr->rnr_retry = hr_reg_read(&context, QPC_RNR_NUM_INIT); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 1b44d2434ab4..7033eae2407c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -44,6 +44,8 @@ #define HNS_ROCE_V2_MAX_XRCD_NUM 0x1000000 #define HNS_ROCE_V2_RSV_XRCD_NUM 0 +#define HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08 10 + #define HNS_ROCE_V3_SCCC_SZ 64 #define HNS_ROCE_V3_GMV_ENTRY_SZ 32 -- cgit v1.2.3 From 7f3969b14f356dd65fa95b3528eb05c32e68bc06 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Fri, 12 May 2023 17:22:44 +0800 Subject: RDMA/hns: Fix base address table allocation For hns, the specification of an entry like resource (E.g. WQE/CQE/EQE) depends on BT page size, buf page size and hopnum. For user mode, the buf page size depends on UMEM. Therefore, the actual specification is controlled by BT page size and hopnum. The current BT page size and hopnum are obtained from firmware. This makes the driver inflexible and introduces unnecessary constraints. Resource allocation failures occur in many scenarios. This patch will calculate whether the BT page size set by firmware is sufficient before allocating BT, and increase the BT page size if it is insufficient. Fixes: 1133401412a9 ("RDMA/hns: Optimize base address table config flow for qp buffer") Link: https://lore.kernel.org/r/20230512092245.344442-3-huangjunxian6@hisilicon.com Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_mr.c | 43 +++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 37a5cf62f88b..14376490ac22 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -33,6 +33,7 @@ #include #include +#include #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" @@ -909,6 +910,44 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev, return page_cnt; } +static u64 cal_pages_per_l1ba(unsigned int ba_per_bt, unsigned int hopnum) +{ + return int_pow(ba_per_bt, hopnum - 1); +} + +static unsigned int cal_best_bt_pg_sz(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, + unsigned int pg_shift) +{ + unsigned long cap = hr_dev->caps.page_size_cap; + struct hns_roce_buf_region *re; + unsigned int pgs_per_l1ba; + unsigned int ba_per_bt; + unsigned int ba_num; + int i; + + for_each_set_bit_from(pg_shift, &cap, sizeof(cap) * BITS_PER_BYTE) { + if (!(BIT(pg_shift) & cap)) + continue; + + ba_per_bt = BIT(pg_shift) / BA_BYTE_LEN; + ba_num = 0; + for (i = 0; i < mtr->hem_cfg.region_count; i++) { + re = &mtr->hem_cfg.region[i]; + if (re->hopnum == 0) + continue; + + pgs_per_l1ba = cal_pages_per_l1ba(ba_per_bt, re->hopnum); + ba_num += DIV_ROUND_UP(re->count, pgs_per_l1ba); + } + + if (ba_num <= ba_per_bt) + return pg_shift; + } + + return 0; +} + static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, unsigned int ba_page_shift) { @@ -917,6 +956,10 @@ static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, hns_roce_hem_list_init(&mtr->hem_list); if (!cfg->is_direct) { + ba_page_shift = cal_best_bt_pg_sz(hr_dev, mtr, ba_page_shift); + if (!ba_page_shift) + return -ERANGE; + ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, cfg->region, cfg->region_count, ba_page_shift); -- cgit v1.2.3 From 56518a603fd2bf74762d176ac980572db84a3e14 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Fri, 12 May 2023 17:22:45 +0800 Subject: RDMA/hns: Modify the value of long message loopback slice Long message loopback slice is used for achieving traffic balance between QPs. It prevents the problem that QPs with large traffic occupying the hardware pipeline for a long time and QPs with small traffic cannot be scheduled. Currently, its maximum value is set to 16K, which means only after a QP sends 16K will the second QP be scheduled. This value is too large, which will lead to unbalanced traffic scheduling, and thus it needs to be modified. The setting range of the long message loopback slice is modified to be from 1024 (the lower limit supported by hardware) to mtu. Actual testing shows that this value can significantly reduce error in hardware traffic scheduling. This solution is compatible with both HIP08 and HIP09. The modified lp_pktn_ini has a maximum value of 2 (when mtu is 256), so the range checking code for lp_pktn_ini is no longer necessary and needs to be deleted. Fixes: 0e60778efb07 ("RDMA/hns: Modify the value of MAX_LP_MSG_LEN to meet hardware compatibility") Link: https://lore.kernel.org/r/20230512092245.344442-4-huangjunxian6@hisilicon.com Signed-off-by: Yangyang Li Signed-off-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 3a1c90406ed9..d4c6b9bc0a4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4583,11 +4583,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, mtu = ib_mtu_enum_to_int(ib_mtu); if (WARN_ON(mtu <= 0)) return -EINVAL; -#define MAX_LP_MSG_LEN 16384 - /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 16KB */ - lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); - if (WARN_ON(lp_pktn_ini >= 0xF)) - return -EINVAL; +#define MIN_LP_MSG_LEN 1024 + /* mtu * (2 ^ lp_pktn_ini) should be in the range of 1024 to mtu */ + lp_pktn_ini = ilog2(max(mtu, MIN_LP_MSG_LEN) / mtu); if (attr_mask & IB_QP_PATH_MTU) { hr_reg_write(context, QPC_MTU, ib_mtu); -- cgit v1.2.3 From 349e3c0cf239cc01d58a1e6c749e171de014cd6a Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:10:59 -0700 Subject: RDMA/bnxt_re: Fix a possible memory leak Inside bnxt_qplib_create_cq(), when the check for NULL DPI fails, driver returns directly without freeing the memory allocated inside bnxt_qplib_alloc_init_hwq() routine. Fixed this by moving the check for NULL DPI before invoking bnxt_qplib_alloc_init_hwq(). Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684397461-23082-2-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index f139d4cd1712..8974f6235cfa 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2056,6 +2056,12 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) u32 pg_sz_lvl; int rc; + if (!cq->dpi) { + dev_err(&rcfw->pdev->dev, + "FP: CREATE_CQ failed due to NULL DPI\n"); + return -EINVAL; + } + hwq_attr.res = res; hwq_attr.depth = cq->max_wqe; hwq_attr.stride = sizeof(struct cq_base); @@ -2069,11 +2075,6 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) CMDQ_BASE_OPCODE_CREATE_CQ, sizeof(req)); - if (!cq->dpi) { - dev_err(&rcfw->pdev->dev, - "FP: CREATE_CQ failed due to NULL DPI\n"); - return -EINVAL; - } req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->hwq.max_elements); -- cgit v1.2.3 From 0fa0d520e2a878cb4c94c4dc84395905d3f14f54 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:11:00 -0700 Subject: RDMA/bnxt_re: Fix return value of bnxt_re_process_raw_qp_pkt_rx bnxt_re_process_raw_qp_pkt_rx() always return 0 and ignores the return value of bnxt_re_post_send_shadow_qp(). Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Link: https://lore.kernel.org/r/1684397461-23082-3-git-send-email-selvin.xavier@broadcom.com Reviewed-by: Hongguang Gao Reviewed-by: Ajit Khaparde Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e86afecfbe46..b1c36412025f 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3341,9 +3341,7 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp, udwr.remote_qkey = gsi_sqp->qplib_qp.qkey; /* post data received in the send queue */ - rc = bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); - - return 0; + return bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); } static void bnxt_re_process_res_rawqp1_wc(struct ib_wc *wc, -- cgit v1.2.3 From dd5fb04857d7346c9ea4901ec3ebe5b90b0e8868 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 18 May 2023 01:11:01 -0700 Subject: RDMA/bnxt_re: Do not enable congestion control on VFs Congestion control needs to be enabled only on the PFs. FW fails the command if issued on VFs. Avoid sending the command on VFs. Fixes: f13bcef04ba0 ("RDMA/bnxt_re: Enable congestion control by default") Link: https://lore.kernel.org/r/1684397461-23082-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Kalesh AP Reviewed-by: Selvin Thyparampil Xavier Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b9e2f89337e8..e34eccd178d0 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1336,6 +1336,10 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) { struct bnxt_qplib_cc_param cc_param = {}; + /* Do not enable congestion control on VFs */ + if (rdev->is_virtfn) + return; + /* Currently enabling only for GenP5 adapters */ if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) return; -- cgit v1.2.3 From c8f304d75f6c6cc679a73f89591f9a915da38f09 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 22 May 2023 10:56:53 -0500 Subject: RDMA/irdma: Prevent QP use after free There is a window where the poll cq may use a QP that has been freed. This can happen if a CQE is polled before irdma_clean_cqes() can clear the CQE's related to the QP and the destroy QP races to free the QP memory. then the QP structures are used in irdma_poll_cq. Fix this by moving the clearing of CQE's before the reference is removed and the QP is destroyed. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20230522155654.1309-3-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index ab5cdf782785..68ce3cd40002 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -522,11 +522,6 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (!iwqp->user_mode) cancel_delayed_work_sync(&iwqp->dwork_flush); - irdma_qp_rem_ref(&iwqp->ibqp); - wait_for_completion(&iwqp->free_qp); - irdma_free_lsmm_rsrc(iwqp); - irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); - if (!iwqp->user_mode) { if (iwqp->iwscq) { irdma_clean_cqes(iwqp, iwqp->iwscq); @@ -534,6 +529,12 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_clean_cqes(iwqp, iwqp->iwrcq); } } + + irdma_qp_rem_ref(&iwqp->ibqp); + wait_for_completion(&iwqp->free_qp); + irdma_free_lsmm_rsrc(iwqp); + irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); + irdma_remove_push_mmap_entries(iwqp); irdma_free_qp_rsrc(iwqp); -- cgit v1.2.3 From 5842d1d9c1b0d17e0c29eae65ae1f245f83682dd Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 22 May 2023 10:56:54 -0500 Subject: RDMA/irdma: Fix Local Invalidate fencing If the local invalidate fence is indicated in the WR, only the read fence is currently being set in WQE. Fix this to set both the read and local fence in the WQE. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20230522155654.1309-4-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 68ce3cd40002..eaa12c124598 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3292,6 +3292,7 @@ static int irdma_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; + info.local_fence = info.read_fence; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; -- cgit v1.2.3 From 3bf3a7c6985c625f64e73baefdaa36f1c2045a29 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 Apr 2023 01:02:42 +0000 Subject: RDMA/rtrs: Fix the last iu->buf leak in err path The last iu->buf will leak if ib_dma_mapping_error() fails. Fixes: c0894b3ea69d ("RDMA/rtrs: core: lib functions shared between client and server modules") Link: https://lore.kernel.org/r/1682384563-2-3-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: Guoqing Jiang Acked-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4bf9d868cc52..3696f367ff51 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -37,8 +37,10 @@ struct rtrs_iu *rtrs_iu_alloc(u32 iu_num, size_t size, gfp_t gfp_mask, goto err; iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, dir); - if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) + if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) { + kfree(iu->buf); goto err; + } iu->cqe.done = done; iu->size = size; -- cgit v1.2.3 From 9c29c8c7df0688f358d2df5ddd16c97c2f7292b4 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 Apr 2023 01:02:43 +0000 Subject: RDMA/rtrs: Fix rxe_dealloc_pd warning In current design: 1. PD and clt_path->s.dev are shared among connections. 2. every con[n]'s cleanup phase will call destroy_con_cq_qp() 3. clt_path->s.dev will be always decreased in destroy_con_cq_qp(), and when clt_path->s.dev become zero, it will destroy PD. 4. when con[1] failed to create, con[1] will not take clt_path->s.dev, but it try to decreased clt_path->s.dev So, in case create_cm(con[0]) succeeds but create_cm(con[1]) fails, destroy_con_cq_qp(con[1]) will be called first which will destroy the PD while this PD is still taken by con[0]. Here, we refactor the error path of create_cm() and init_conns(), so that we do the cleanup in the order they are created. The warning occurs when destroying RXE PD whose reference count is not zero. rnbd_client L597: Mapping device /dev/nvme0n1 on session client, (access_mode: rw, nr_poll_queues: 0) ------------[ cut here ]------------ WARNING: CPU: 0 PID: 26407 at drivers/infiniband/sw/rxe/rxe_pool.c:256 __rxe_cleanup+0x13a/0x170 [rdma_rxe] Modules linked in: rpcrdma rdma_ucm ib_iser rnbd_client libiscsi rtrs_client scsi_transport_iscsi rtrs_core rdma_cm iw_cm ib_cm crc32_generic rdma_rxe udp_tunnel ib_uverbs ib_core kmem device_dax nd_pmem dax_pmem nd_vme crc32c_intel fuse nvme_core nfit libnvdimm dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 26407 Comm: rnbd-client.sh Kdump: loaded Not tainted 6.2.0-rc6-roce-flush+ #53 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__rxe_cleanup+0x13a/0x170 [rdma_rxe] Code: 45 84 e4 0f 84 5a ff ff ff 48 89 ef e8 5f 18 71 f9 84 c0 75 90 be c8 00 00 00 48 89 ef e8 be 89 1f fa 85 c0 0f 85 7b ff ff ff <0f> 0b 41 bc ea ff ff ff e9 71 ff ff ff e8 84 7f 1f fa e9 d0 fe ff RSP: 0018:ffffb09880b6f5f0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff99401f15d6a8 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffbac8234b RDI: 00000000ffffffff RBP: ffff99401f15d6d0 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000002d82 R11: 0000000000000000 R12: 0000000000000001 R13: ffff994101eff208 R14: ffffb09880b6f6a0 R15: 00000000fffffe00 FS: 00007fe113904740(0000) GS:ffff99413bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff6cde656c8 CR3: 000000001f108004 CR4: 00000000001706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rxe_dealloc_pd+0x16/0x20 [rdma_rxe] ib_dealloc_pd_user+0x4b/0x80 [ib_core] rtrs_ib_dev_put+0x79/0xd0 [rtrs_core] destroy_con_cq_qp+0x8a/0xa0 [rtrs_client] init_path+0x1e7/0x9a0 [rtrs_client] ? __pfx_autoremove_wake_function+0x10/0x10 ? lock_is_held_type+0xd7/0x130 ? rcu_read_lock_sched_held+0x43/0x80 ? pcpu_alloc+0x3dd/0x7d0 ? rtrs_clt_init_stats+0x18/0x40 [rtrs_client] rtrs_clt_open+0x24f/0x5a0 [rtrs_client] ? __pfx_rnbd_clt_link_ev+0x10/0x10 [rnbd_client] rnbd_clt_map_device+0x6a5/0xe10 [rnbd_client] Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Link: https://lore.kernel.org/r/1682384563-2-4-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: Jack Wang Tested-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 55 ++++++++++++++-------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index edb2e3a25880..cfb50bfe53c3 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2040,6 +2040,7 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +/* The caller should do the cleanup in case of error */ static int create_cm(struct rtrs_clt_con *con) { struct rtrs_path *s = con->c.path; @@ -2062,14 +2063,14 @@ static int create_cm(struct rtrs_clt_con *con) err = rdma_set_reuseaddr(cm_id, 1); if (err != 0) { rtrs_err(s, "Set address reuse failed, err: %d\n", err); - goto destroy_cm; + return err; } err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, (struct sockaddr *)&clt_path->s.dst_addr, RTRS_CONNECT_TIMEOUT_MS); if (err) { rtrs_err(s, "Failed to resolve address, err: %d\n", err); - goto destroy_cm; + return err; } /* * Combine connection status and session events. This is needed @@ -2084,29 +2085,15 @@ static int create_cm(struct rtrs_clt_con *con) if (err == 0) err = -ETIMEDOUT; /* Timedout or interrupted */ - goto errr; - } - if (con->cm_err < 0) { - err = con->cm_err; - goto errr; + return err; } - if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) { + if (con->cm_err < 0) + return con->cm_err; + if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) /* Device removal */ - err = -ECONNABORTED; - goto errr; - } + return -ECONNABORTED; return 0; - -errr: - stop_cm(con); - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); -destroy_cm: - destroy_cm(con); - - return err; } static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path) @@ -2334,7 +2321,7 @@ static void rtrs_clt_close_work(struct work_struct *work) static int init_conns(struct rtrs_clt_path *clt_path) { unsigned int cid; - int err; + int err, i; /* * On every new session connections increase reconnect counter @@ -2350,10 +2337,8 @@ static int init_conns(struct rtrs_clt_path *clt_path) goto destroy; err = create_cm(to_clt_con(clt_path->s.con[cid])); - if (err) { - destroy_con(to_clt_con(clt_path->s.con[cid])); + if (err) goto destroy; - } } err = alloc_path_reqs(clt_path); if (err) @@ -2364,15 +2349,21 @@ static int init_conns(struct rtrs_clt_path *clt_path) return 0; destroy: - while (cid--) { - struct rtrs_clt_con *con = to_clt_con(clt_path->s.con[cid]); + /* Make sure we do the cleanup in the order they are created */ + for (i = 0; i <= cid; i++) { + struct rtrs_clt_con *con; - stop_cm(con); + if (!clt_path->s.con[i]) + break; - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); - destroy_cm(con); + con = to_clt_con(clt_path->s.con[i]); + if (con->c.cm_id) { + stop_cm(con); + mutex_lock(&con->con_mutex); + destroy_con_cq_qp(con); + mutex_unlock(&con->con_mutex); + destroy_cm(con); + } destroy_con(con); } /* -- cgit v1.2.3 From 9a3763e87379c97a78b7c6c6f40720b1e877174f Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 17 May 2023 12:22:42 -0500 Subject: RDMA/rxe: Fix packet length checks In rxe_net.c a received packet, from udp or loopback, is passed to rxe_rcv() in rxe_recv.c as a udp packet. I.e. skb->data is pointing at the udp header. But rxe_rcv() makes length checks to verify the packet is long enough to hold the roce headers as if it were a roce packet. I.e. skb->data pointing at the bth header. A runt packet would appear to have 8 more bytes than it actually does which may lead to incorrect behavior. This patch calls skb_pull() to adjust the skb to point at the bth header before calling rxe_rcv() which fixes this error. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230517172242.1806340-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a38fab19bed1..cd59666158b1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -159,6 +159,9 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; @@ -401,6 +404,9 @@ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) return -EIO; } + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; -- cgit v1.2.3 From b00683422fd79dd07c9b75efdce1660e5e19150e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 17 May 2023 16:15:10 -0500 Subject: RDMA/rxe: Fix ref count error in check_rkey() There is a reference count error in error path code and a potential race in check_rkey() in rxe_resp.c. When looking up the rkey for a memory window the reference to the mw from rxe_lookup_mw() is dropped before a reference is taken on the mr referenced by the mw. If the mr is destroyed immediately after the call to rxe_put(mw) the mr pointer is unprotected and may end up pointing at freed memory. The rxe_get(mr) call should take place before the rxe_put(mw) call. All errors in check_rkey() call rxe_put(mw) if mw is not NULL but it was already called after the above. The mw pointer should be set to NULL after the rxe_put(mw) call to prevent this from happening. Fixes: cdd0b85675ae ("RDMA/rxe: Implement memory access through MWs") Link: https://lore.kernel.org/r/20230517211509.1819998-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 1da044f6b7d4..ee68306555b9 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -489,8 +489,9 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (mw->access & IB_ZERO_BASED) qp->resp.offset = mw->addr; - rxe_put(mw); rxe_get(mr); + rxe_put(mw); + mw = NULL; } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { -- cgit v1.2.3 From 18e7e3e4217083a682e2c7282011c70c8a1ba070 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 29 May 2023 11:35:26 -0400 Subject: RDMA/bnxt_re: Fix reporting active_{speed,width} attributes After commit 6d758147c7b8 ("RDMA/bnxt_re: Use auxiliary driver interface") the active_{speed, width} attributes are reported incorrectly, This is happening because ib_get_eth_speed() is called only once from bnxt_re_ib_init() - Fix this issue by calling ib_get_eth_speed() from bnxt_re_query_port(). Fixes: 6d758147c7b8 ("RDMA/bnxt_re: Use auxiliary driver interface") Link: https://lore.kernel.org/r/20230529153525.87254-1-kheib@redhat.com Signed-off-by: Kamal Heib Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 -- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 7 ++++--- drivers/infiniband/hw/bnxt_re/main.c | 2 -- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5a2baf49ecaa..2c95e6f3d47a 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -135,8 +135,6 @@ struct bnxt_re_dev { struct delayed_work worker; u8 cur_prio_map; - u16 active_speed; - u8 active_width; /* FP Notification Queue (CQ & SRQ) */ struct tasklet_struct nq_task; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index b1c36412025f..952811c40c54 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -199,6 +199,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -228,10 +229,10 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, port_attr->sm_sl = 0; port_attr->subnet_timeout = 0; port_attr->init_type_reply = 0; - port_attr->active_speed = rdev->active_speed; - port_attr->active_width = rdev->active_width; + rc = ib_get_eth_speed(&rdev->ibdev, port_num, &port_attr->active_speed, + &port_attr->active_width); - return 0; + return rc; } int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e34eccd178d0..3073398a2183 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1077,8 +1077,6 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } dev_info(rdev_to_dev(rdev), "Device registered with IB successfully"); - ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, - &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); event = netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev) ? -- cgit v1.2.3 From 2a62b6210ce876c596086ab8fd4c8a0c3d10611a Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 2 Jun 2023 11:54:08 +0800 Subject: RDMA/rxe: Fix the use-before-initialization error of resp_pkts In the following: Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106 assign_lock_key kernel/locking/lockdep.c:982 [inline] register_lock_class+0xdb6/0x1120 kernel/locking/lockdep.c:1295 __lock_acquire+0x10a/0x5df0 kernel/locking/lockdep.c:4951 lock_acquire kernel/locking/lockdep.c:5691 [inline] lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5656 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 skb_dequeue+0x20/0x180 net/core/skbuff.c:3639 drain_resp_pkts drivers/infiniband/sw/rxe/rxe_comp.c:555 [inline] rxe_completer+0x250d/0x3cc0 drivers/infiniband/sw/rxe/rxe_comp.c:652 rxe_qp_do_cleanup+0x1be/0x820 drivers/infiniband/sw/rxe/rxe_qp.c:761 execute_in_process_context+0x3b/0x150 kernel/workqueue.c:3473 __rxe_cleanup+0x21e/0x370 drivers/infiniband/sw/rxe/rxe_pool.c:233 rxe_create_qp+0x3f6/0x5f0 drivers/infiniband/sw/rxe/rxe_verbs.c:583 This is a use-before-initialization problem. It happens because rxe_qp_do_cleanup is called during error unwind before the struct has been fully initialized. Move the initialization of the skb earlier. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230602035408.741534-1-yanjun.zhu@intel.com Reported-by: syzbot+eba589d8f49c73d356da@syzkaller.appspotmail.com Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 61a2eb77d999..a0f206431cf8 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -176,6 +176,9 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); + skb_queue_head_init(&qp->req_pkts); + skb_queue_head_init(&qp->resp_pkts); + atomic_set(&qp->ssn, 0); atomic_set(&qp->skb_out, 0); } @@ -234,8 +237,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - skb_queue_head_init(&qp->req_pkts); - rxe_init_task(&qp->req.task, qp, rxe_requester); rxe_init_task(&qp->comp.task, qp, rxe_completer); @@ -279,8 +280,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, } } - skb_queue_head_init(&qp->resp_pkts); - rxe_init_task(&qp->resp.task, qp, rxe_responder); qp->resp.opcode = OPCODE_NONE; -- cgit v1.2.3 From ee4d269eccfea6c17b18281bef482700d898e86f Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 5 Jun 2023 13:33:17 +0300 Subject: RDMA/mlx5: Initiate dropless RQ for RAW Ethernet functions Delay drop data is initiated for PFs that have the capability of rq_delay_drop and are in roce profile. However, PFs with RAW ethernet profile do not initiate delay drop data on function load, causing kernel panic if delay drop struct members are accessed later on in case a dropless RQ is created. Thus, stage the delay drop initialization as part of RAW ethernet PF loading process. Fixes: b5ca15ad7e61 ("IB/mlx5: Add proper representors support") Signed-off-by: Maher Sanalla Reviewed-by: Maor Gottlieb Link: https://lore.kernel.org/r/2e9d386785043d48c38711826eb910315c1de141.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5d45de223c43..f0b394ed7452 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4275,6 +4275,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, mlx5_ib_restrack_init, NULL), -- cgit v1.2.3 From e1f4a52ac171dd863fe89055e749ef5e0a0bc5ce Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 5 Jun 2023 13:33:18 +0300 Subject: RDMA/mlx5: Create an indirect flow table for steering anchor A misbehaved user can create a steering anchor that points to a kernel flow table and then destroy the anchor without freeing the associated STC. This creates a problem as the kernel can't destroy the flow table since there is still a reference to it. As a result, this can exhaust all available flow table resources, preventing other users from using the RDMA device. To prevent this problem, a solution is implemented where a special flow table with two steering rules is created when a user creates a steering anchor for the first time. The rules include one that drops all traffic and another that points to the kernel flow table. If the steering anchor is destroyed, only the rule pointing to the kernel's flow table is removed. Any traffic reaching the special flow table after that is dropped. Since the special flow table is not destroyed when the steering anchor is destroyed, any issues are prevented from occurring. The remaining resources are only destroyed when the RDMA device is destroyed, which happens after all DEVX objects are freed, including the STCs, thus mitigating the issue. Fixes: 0c6ab0ca9a66 ("RDMA/mlx5: Expose steering anchor to userspace") Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Link: https://lore.kernel.org/r/b4a88a871d651fa4e8f98d552553c1cfe9ba2cd6.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 276 ++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/fs.h | 16 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 ++ 3 files changed, 296 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 3008632a6c20..1e419e080b53 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -695,8 +695,6 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) - ft_attr.uid = MLX5_SHARED_RESOURCE_UID; ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; @@ -2025,6 +2023,237 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject, return 0; } +static int steering_anchor_create_ft(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + + if (ft_prio->anchor.ft) + return 0; + + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (!ns) + return -EOPNOTSUPP; + + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; + ft_attr.prio = 0; + ft_attr.max_fte = 2; + ft_attr.level = 1; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return PTR_ERR(ft); + + ft_prio->anchor.ft = ft; + + return 0; +} + +static void steering_anchor_destroy_ft(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.ft) { + mlx5_destroy_flow_table(ft_prio->anchor.ft); + ft_prio->anchor.ft = NULL; + } +} + +static int +steering_anchor_create_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_drop) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + + ft_prio->anchor.fg_drop = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_drop) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_drop); + ft_prio->anchor.fg_drop = NULL; + } +} + +static int +steering_anchor_create_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_goto_table) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + ft_prio->anchor.fg_goto_table = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_goto_table) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_goto_table); + ft_prio->anchor.fg_goto_table = NULL; + } +} + +static int +steering_anchor_create_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if (ft_prio->anchor.rule_drop) + return 0; + + flow_act.fg = ft_prio->anchor.fg_drop; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + NULL, 0); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_drop = handle; + + return 0; +} + +static void steering_anchor_destroy_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_drop) { + mlx5_del_flow_rules(ft_prio->anchor.rule_drop); + ft_prio->anchor.rule_drop = NULL; + } +} + +static int +steering_anchor_create_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if (ft_prio->anchor.rule_goto_table) + return 0; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.fg = ft_prio->anchor.fg_goto_table; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = ft_prio->flow_table; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + &dest, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_goto_table = handle; + + return 0; +} + +static void +steering_anchor_destroy_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_goto_table) { + mlx5_del_flow_rules(ft_prio->anchor.rule_goto_table); + ft_prio->anchor.rule_goto_table = NULL; + } +} + +static int steering_anchor_create_res(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + int err; + + err = steering_anchor_create_ft(dev, ft_prio, ns_type); + if (err) + return err; + + err = steering_anchor_create_fg_drop(ft_prio); + if (err) + goto destroy_ft; + + err = steering_anchor_create_fg_goto_table(ft_prio); + if (err) + goto destroy_fg_drop; + + err = steering_anchor_create_rule_drop(ft_prio); + if (err) + goto destroy_fg_goto_table; + + err = steering_anchor_create_rule_goto_table(ft_prio); + if (err) + goto destroy_rule_drop; + + return 0; + +destroy_rule_drop: + steering_anchor_destroy_rule_drop(ft_prio); +destroy_fg_goto_table: + steering_anchor_destroy_fg_goto_table(ft_prio); +destroy_fg_drop: + steering_anchor_destroy_fg_drop(ft_prio); +destroy_ft: + steering_anchor_destroy_ft(ft_prio); + + return err; +} + +static void mlx5_steering_anchor_destroy_res(struct mlx5_ib_flow_prio *ft_prio) +{ + steering_anchor_destroy_rule_goto_table(ft_prio); + steering_anchor_destroy_rule_drop(ft_prio); + steering_anchor_destroy_fg_goto_table(ft_prio); + steering_anchor_destroy_fg_drop(ft_prio); + steering_anchor_destroy_ft(ft_prio); +} + static int steering_anchor_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -2035,6 +2264,9 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return -EBUSY; mutex_lock(&obj->dev->flow_db->lock); + if (!--obj->ft_prio->anchor.rule_goto_table_ref) + steering_anchor_destroy_rule_goto_table(obj->ft_prio); + put_flow_table(obj->dev, obj->ft_prio, true); mutex_unlock(&obj->dev->flow_db->lock); @@ -2042,6 +2274,24 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return 0; } +static void fs_cleanup_anchor(struct mlx5_ib_flow_prio *prio, + int count) +{ + while (count--) + mlx5_steering_anchor_destroy_res(&prio[count]); +} + +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) +{ + fs_cleanup_anchor(dev->flow_db->prios, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->egress_prios, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->sniffer, MLX5_IB_NUM_SNIFFER_FTS); + fs_cleanup_anchor(dev->flow_db->egress, MLX5_IB_NUM_EGRESS_FTS); + fs_cleanup_anchor(dev->flow_db->fdb, MLX5_IB_NUM_FDB_FTS); + fs_cleanup_anchor(dev->flow_db->rdma_rx, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->rdma_tx, MLX5_IB_NUM_FLOW_FT); +} + static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *obj) { @@ -2182,21 +2432,31 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return -ENOMEM; mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); if (IS_ERR(ft_prio)) { - mutex_unlock(&dev->flow_db->lock); err = PTR_ERR(ft_prio); goto free_obj; } ft_prio->refcount++; - ft_id = mlx5_flow_table_id(ft_prio->flow_table); - mutex_unlock(&dev->flow_db->lock); + + if (!ft_prio->anchor.rule_goto_table_ref) { + err = steering_anchor_create_res(dev, ft_prio, ns_type); + if (err) + goto put_flow_table; + } + + ft_prio->anchor.rule_goto_table_ref++; + + ft_id = mlx5_flow_table_id(ft_prio->anchor.ft); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, &ft_id, sizeof(ft_id)); if (err) - goto put_flow_table; + goto destroy_res; + + mutex_unlock(&dev->flow_db->lock); uobj->object = obj; obj->dev = dev; @@ -2205,8 +2465,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return 0; +destroy_res: + --ft_prio->anchor.rule_goto_table_ref; + mlx5_steering_anchor_destroy_res(ft_prio); put_flow_table: - mutex_lock(&dev->flow_db->lock); put_flow_table(dev, ft_prio, true); mutex_unlock(&dev->flow_db->lock); free_obj: diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index ad320adaf321..b9734904f5f0 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); #else static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) { @@ -21,9 +22,24 @@ static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) mutex_init(&dev->flow_db->lock); return 0; } + +inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} #endif + static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { + /* When a steering anchor is created, a special flow table is also + * created for the user to reference. Since the user can reference it, + * the kernel cannot trust that when the user destroys the steering + * anchor, they no longer reference the flow table. + * + * To address this issue, when a user destroys a steering anchor, only + * the flow steering rule in the table is destroyed, but the table + * itself is kept to deal with the above scenario. The remaining + * resources are only removed when the RDMA device is destroyed, which + * is a safe assumption that all references are gone. + */ + mlx5_ib_fs_cleanup_anchor(dev); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index efa4dc6e7dee..91fc0cdf377d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -237,8 +237,19 @@ enum { #define MLX5_IB_NUM_SNIFFER_FTS 2 #define MLX5_IB_NUM_EGRESS_FTS 1 #define MLX5_IB_NUM_FDB_FTS MLX5_BY_PASS_NUM_REGULAR_PRIOS + +struct mlx5_ib_anchor { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg_goto_table; + struct mlx5_flow_group *fg_drop; + struct mlx5_flow_handle *rule_goto_table; + struct mlx5_flow_handle *rule_drop; + unsigned int rule_goto_table_ref; +}; + struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; + struct mlx5_ib_anchor anchor; unsigned int refcount; }; -- cgit v1.2.3 From c2ea687e5e0e29802714a981a1ccdc17c2c4b2be Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:19 +0300 Subject: RDMA/mlx5: Fix Q-counters per vport allocation Previously Q-counters data was being allocated over the PF for all of the available vports, however that isn't necessary. Since each VF or SF has a Q-counter allocated for itself. So we only need to allocate two counters data structures, one for the device counters, and one for all the other vports to expose the representors, since they only need to read from it in order to determine mainly counters numbers and names, so they can all share. This in turn also solves a bug we previously had where we couldn't switch the device to switchdev mode when there were more than 128 SF/VFs configured, since that is the maximum amount of Q-counters available for a single port Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/f54671df16e2227a069b229b33b62cd9ee24c475.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 1c06920505d2..3d7ef81a50b8 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -209,7 +209,8 @@ static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev, !vport_qcounters_supported(dev)) || !port_num) return &dev->port[0].cnts; - return &dev->port[port_num - 1].cnts; + return is_mdev_switchdev_mode(dev->mdev) ? + &dev->port[1].cnts : &dev->port[port_num - 1].cnts; } /** @@ -262,7 +263,7 @@ static struct rdma_hw_stats * mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - const struct mlx5_ib_counters *cnts = &dev->port[port_num - 1].cnts; + const struct mlx5_ib_counters *cnts = get_counters(dev, port_num); return do_alloc_stats(cnts); } @@ -725,11 +726,11 @@ err: static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int i, j; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); @@ -761,15 +762,22 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int err = 0; int i; bool is_shared; MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + + /* + * In switchdev we need to allocate two ports, one that is used for + * the device Q_counters and it is essentially the real Q_counters of + * this device, while the other is used as a helper for PF to be able to + * query all other vports. + */ + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); for (i = 0; i < num_cnt_ports; i++) { err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts, i); -- cgit v1.2.3 From e80ef139488fb4f481c877a81f6ba54f1f0ee416 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:20 +0300 Subject: RDMA/mlx5: Remove vport Q-counters dependency on normal Q-counters Previously the Q-counters initialization assumed that the vport Q-counters structures and the normal Q-counters structures are identical in size, and hence when a Q-counter was added to normal Q-counters structure but not to the vport Q-counters struct it would lead to that counter name being NULL in switchdev mode, which could cause the kernel crash below. Currently break the dependency between those two structure and always use the appropriate struct size, in order to remove the assumption that both structure sizes are equal. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 20c64a067 P4D 20c64a067 PUD 20152b067 PMD 0 Oops: 0000 [#1] SMP CPU: 19 PID: 11717 Comm: devlink Tainted: G OE 6.2.0_mlnx #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:strlen+0x0/0x20 Code: 66 2e 0f 1f 84 00 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 48 89 f8 74 10 48 83 c7 01 80 3f 00 75 f7 48 29 c7 48 89 RSP: 0018:ffffc9000318b618 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000002c00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888211918110 R09: ffff888211918000 R10: 000000000000001e R11: ffff888211918000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881038ec250 FS: 00007fa53342fe80(0000) GS:ffff88885fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000002042b2003 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: kernfs_name_hash+0x12/0x80 kernfs_find_ns+0x35/0xb0 kernfs_remove_by_name_ns+0x46/0xc0 remove_files.isra.1+0x30/0x70 internal_create_group+0x253/0x380 internal_create_groups.part.4+0x3e/0xa0 setup_port+0x27a/0x8c0 [ib_core] ib_setup_port_attrs+0x9d/0x300 [ib_core] ib_register_device+0x48e/0x550 [ib_core] __mlx5_ib_add+0x2b/0x80 [mlx5_ib] mlx5_ib_vport_rep_load+0x141/0x360 [mlx5_ib] mlx5_esw_offloads_rep_load+0x48/0xa0 [mlx5_core] esw_offloads_enable+0x41e/0xd10 [mlx5_core] mlx5_eswitch_enable_locked+0x1e3/0x340 [mlx5_core] ? __cond_resched+0x15/0x30 mlx5_devlink_eswitch_mode_set+0x204/0x3c0 [mlx5_core] devlink_nl_cmd_eswitch_set_doit+0x8d/0x100 genl_family_rcv_msg_doit.isra.19+0xea/0x110 genl_rcv_msg+0x19b/0x290 ? devlink_nl_cmd_region_read_dumpit+0x760/0x760 ? devlink_nl_cmd_port_param_get_doit+0x30/0x30 ? devlink_put+0x50/0x50 ? genl_get_cmd_both+0x60/0x60 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1be/0x2a0 netlink_sendmsg+0x361/0x4d0 sock_sendmsg+0x30/0x40 __sys_sendto+0x11a/0x150 ? handle_mm_fault+0x101/0x2b0 ? do_user_addr_fault+0x21d/0x720 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fa533611cba Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 c3 0f 1f 44 00 00 55 48 83 ec 30 44 89 4c RSP: 002b:00007ffdb6a898a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000daab00 RCX: 00007fa533611cba RDX: 0000000000000038 RSI: 0000000000daab00 RDI: 0000000000000003 RBP: 0000000000daa910 R08: 00007fa533822000 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) mlx_compat(OE) mlxfw(OE) memtrack(OE) pci_hyperv_intf nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_filter iptable_nat dns_resolver nf_nat br_netfilter nfs bridge stp llc lockd grace fscache netfs rfkill overlay iTCO_wdt iTCO_vendor_support kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel i2c_i801 sunrpc lpc_ich sha512_ssse3 pcspkr i2c_smbus mfd_core drm sch_fq_codel i2c_core ip_tables fuse crc32c_intel serio_raw virtio_net net_failover failover [last unloaded: mlxfw] CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:strlen+0x0/0x20 Code: 66 2e 0f 1f 84 00 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 48 89 f8 74 10 48 83 c7 01 80 3f 00 75 f7 48 29 c7 48 89 RSP: 0018:ffffc9000318b618 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000002c00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888211918110 R09: ffff888211918000 R10: 000000000000001e R11: ffff888211918000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881038ec250 FS: 00007fa53342fe80(0000) GS:ffff88885fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000002042b2003 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception ]--- Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/016777b7f16eb6bb178999ff59097d0c0f91f68a.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 58 ++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 3d7ef81a50b8..f40d9c61e30b 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -576,43 +576,53 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, bool is_vport = is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF; const struct mlx5_ib_counter *names; - int j = 0, i; + int j = 0, i, size; names = is_vport ? vport_basic_q_cnts : basic_q_cnts; - for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = basic_q_cnts[i].offset; + offsets[j] = names[i].offset; } names = is_vport ? vport_out_of_seq_q_cnts : out_of_seq_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { - for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = out_of_seq_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_retrans_q_cnts : retrans_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { - for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = retrans_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_extended_err_cnts : extended_err_cnts; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { - for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = extended_err_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_roce_accl_cnts : roce_accl_cnts; + size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) { - for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = roce_accl_cnts[i].offset; + offsets[j] = names[i].offset; } } @@ -662,25 +672,37 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, struct mlx5_ib_counters *cnts, u32 port_num) { - u32 num_counters, num_op_counters = 0; + bool is_vport = is_mdev_switchdev_mode(dev->mdev) && + port_num != MLX5_VPORT_PF; + u32 num_counters, num_op_counters = 0, size; - num_counters = ARRAY_SIZE(basic_q_cnts); + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + num_counters = size; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) - num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) - num_counters += ARRAY_SIZE(retrans_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) - num_counters += ARRAY_SIZE(extended_err_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) - num_counters += ARRAY_SIZE(roce_accl_cnts); + num_counters += size; cnts->num_q_counters = num_counters; - if (is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF) + if (is_vport) goto skip_non_qcounters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { -- cgit v1.2.3 From 2de43f5b5137e03ef64a2c3f064a01992830c67f Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:21 +0300 Subject: RDMA/mlx5: Fix Q-counters query in LAG mode Previously we used the core device associated to the IB device in order to do the Q-counters query to the FW, but in LAG mode it is possible that the core device isn't the one that created this VF. Hence instead of using the core device to query the Q-counters we use the ESW core device which is guaranteed to be that of the VF. Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/778d7d7a24892348d0bdef17d2e5f9e044717e86.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index f40d9c61e30b..93257fa5aae8 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -330,6 +330,7 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, { u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + struct mlx5_core_dev *mdev; __be32 val; int ret, i; @@ -337,12 +338,16 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, dev->port[port_num].rep->vport == MLX5_VPORT_UPLINK) return 0; + mdev = mlx5_eswitch_get_core_dev(dev->port[port_num].rep->esw); + if (!mdev) + return -EOPNOTSUPP; + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); MLX5_SET(query_q_counter_in, in, other_vport, 1); MLX5_SET(query_q_counter_in, in, vport_number, dev->port[port_num].rep->vport); MLX5_SET(query_q_counter_in, in, aggregate, 1); - ret = mlx5_cmd_exec_inout(dev->mdev, query_q_counter, in, out); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); if (ret) return ret; -- cgit v1.2.3 From 58030c76cce473b6cfd630bbecb97215def0dff8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 5 Jun 2023 13:33:23 +0300 Subject: RDMA/cma: Always set static rate to 0 for RoCE Set static rate to 0 as it should be discovered by path query and has no meaning for RoCE. This also avoid of using the rtnl lock and ethtool API, which is a bottleneck when try to setup many rdma-cm connections at the same time, especially with multiple processes. Fixes: 3c86aa70bf67 ("RDMA/cm: Add RDMA CM support for IBoE devices") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/f72a4f8b667b803aee9fa794069f61afb5839ce4.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93a1c48d0c32..6b3f4384e46a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3295,7 +3295,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate @@ -4964,7 +4964,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (!ndev) return -ENODEV; - ib.rec.rate = iboe_get_rate(ndev); + ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); -- cgit v1.2.3 From 0cadb4db79e1d9eea66711c4031e435c2191907e Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 5 Jun 2023 13:33:24 +0300 Subject: RDMA/uverbs: Restrict usage of privileged QKEYs According to the IB specification rel-1.6, section 3.5.3: "QKEYs with the most significant bit set are considered controlled QKEYs, and a HCA does not allow a consumer to arbitrarily specify a controlled QKEY." Thus, block non-privileged users from setting such a QKEY. Cc: stable@vger.kernel.org Fixes: bc38a6abdd5a ("[PATCH] IB uverbs: core implementation") Signed-off-by: Edward Srouji Link: https://lore.kernel.org/r/c00c809ddafaaf87d6f6cb827978670989a511b3.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4796f6a8828c..e836c9c477f6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1850,8 +1850,13 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mtu = cmd->base.path_mtu; if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) attr->path_mig_state = cmd->base.path_mig_state; - if (cmd->base.attr_mask & IB_QP_QKEY) + if (cmd->base.attr_mask & IB_QP_QKEY) { + if (cmd->base.qkey & IB_QP_SET_QKEY && !capable(CAP_NET_RAW)) { + ret = -EPERM; + goto release_qp; + } attr->qkey = cmd->base.qkey; + } if (cmd->base.attr_mask & IB_QP_RQ_PSN) attr->rq_psn = cmd->base.rq_psn; if (cmd->base.attr_mask & IB_QP_SQ_PSN) -- cgit v1.2.3 From 62fab312fa1683e812e605db20d4f22de3e3fb2f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Jun 2023 13:33:25 +0300 Subject: IB/uverbs: Fix to consider event queue closing also upon non-blocking mode Fix ib_uverbs_event_read() to consider event queue closing also upon non-blocking mode. Once the queue is closed (e.g. hot-plug flow) all the existing events are cleaned-up as part of ib_uverbs_free_event_queue(). An application that uses the non-blocking FD mode should get -EIO in that case to let it knows that the device was removed already. Otherwise, it can loose the indication that the device was removed and won't recover. As part of that, refactor the code to have a single flow with regards to 'is_closed' for both blocking and non-blocking modes. Fixes: 14e23bd6d221 ("RDMA/core: Fix locking in ib_uverbs_event_read") Reviewed-by: Maor Gottlieb Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/97b00116a1e1e13f8dc4ec38a5ea81cf8c030210.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_main.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fbace69672ca..7c9c79c13941 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -222,8 +222,12 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); while (list_empty(&ev_queue->event_list)) { - spin_unlock_irq(&ev_queue->lock); + if (ev_queue->is_closed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; @@ -233,12 +237,6 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, return -ERESTARTSYS; spin_lock_irq(&ev_queue->lock); - - /* If device was disassociated and no event exists set an error */ - if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) { - spin_unlock_irq(&ev_queue->lock); - return -EIO; - } } event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); -- cgit v1.2.3 From 617f5db1a626f18d5cbb7c7faf7bf8f9ea12be78 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 5 Jun 2023 13:33:26 +0300 Subject: RDMA/mlx5: Fix affinity assignment The cited commit aimed to ensure that Virtual Functions (VFs) assign a queue affinity to a Queue Pair (QP) to distribute traffic when the LAG master creates a hardware LAG. If the affinity was set while the hardware was not in LAG, the firmware would ignore the affinity value. However, this commit unintentionally assigned an affinity to QPs on the LAG master's VPORT even if the RDMA device was not marked as LAG-enabled. In most cases, this was not an issue because when the hardware entered hardware LAG configuration, the RDMA device of the LAG master would be destroyed and a new one would be created, marked as LAG-enabled. The problem arises when a user configures Equal-Cost Multipath (ECMP). In ECMP mode, traffic can be directed to different physical ports based on the queue affinity, which is intended for use by VPORTS other than the E-Switch manager. ECMP mode is supported only if both E-Switch managers are in switchdev mode and the appropriate route is configured via IP. In this configuration, the RDMA device is not destroyed, and we retain the RDMA device that is not marked as LAG-enabled. To ensure correct behavior, Send Queues (SQs) opened by the E-Switch manager through verbs should be assigned strict affinity. This means they will only be able to communicate through the native physical port associated with the E-Switch manager. This will prevent the firmware from assigning affinity and will not allow the SQs to be remapped in case of failover. Fixes: 802dcc7fc5ec ("RDMA/mlx5: Support TX port affinity for VF drivers in LAG mode") Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Link: https://lore.kernel.org/r/425b05f4da840bc684b0f7e8ebf61aeb5cef09b0.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +++ drivers/infiniband/hw/mlx5/qp.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 91fc0cdf377d..2dfa6f49a6f4 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1598,6 +1598,9 @@ static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) MLX5_CAP_PORT_SELECTION(dev->mdev, port_select_flow_table_bypass)) return 0; + if (mlx5_lag_is_lacp_owner(dev->mdev) && !dev->lag_active) + return 0; + return dev->lag_active || (MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 && MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity)); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 70ca8ffa9256..78b96bfb4e6a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1237,6 +1237,9 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); + if (!mlx5_ib_lag_should_assign_affinity(dev) && + mlx5_lag_is_lacp_owner(dev->mdev)) + MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); if (qp->flags & IB_QP_CREATE_SOURCE_QPN) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); -- cgit v1.2.3 From 691b0480933f0ce88a81ed1d1a0aff340ff6293a Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:29 -0700 Subject: IB/isert: Fix dead lock in ib_isert - When a iSER session is released, ib_isert module is taking a mutex lock and releasing all pending connections. As part of this, ib_isert is destroying rdma cm_id. To destroy cm_id, rdma_cm module is sending CM events to CMA handler of ib_isert. This handler is taking same mutex lock. Hence it leads to deadlock between ib_isert & rdma_cm modules. - For fix, created local list of pending connections and release the connection outside of mutex lock. Calltrace: --------- [ 1229.791410] INFO: task kworker/10:1:642 blocked for more than 120 seconds. [ 1229.791416] Tainted: G OE --------- - - 4.18.0-372.9.1.el8.x86_64 #1 [ 1229.791418] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1229.791419] task:kworker/10:1 state:D stack: 0 pid: 642 ppid: 2 flags:0x80004000 [ 1229.791424] Workqueue: ib_cm cm_work_handler [ib_cm] [ 1229.791436] Call Trace: [ 1229.791438] __schedule+0x2d1/0x830 [ 1229.791445] ? select_idle_sibling+0x23/0x6f0 [ 1229.791449] schedule+0x35/0xa0 [ 1229.791451] schedule_preempt_disabled+0xa/0x10 [ 1229.791453] __mutex_lock.isra.7+0x310/0x420 [ 1229.791456] ? select_task_rq_fair+0x351/0x990 [ 1229.791459] isert_cma_handler+0x224/0x330 [ib_isert] [ 1229.791463] ? ttwu_queue_wakelist+0x159/0x170 [ 1229.791466] cma_cm_event_handler+0x25/0xd0 [rdma_cm] [ 1229.791474] cma_ib_handler+0xa7/0x2e0 [rdma_cm] [ 1229.791478] cm_process_work+0x22/0xf0 [ib_cm] [ 1229.791483] cm_work_handler+0xf4/0xf30 [ib_cm] [ 1229.791487] ? move_linked_works+0x6e/0xa0 [ 1229.791490] process_one_work+0x1a7/0x360 [ 1229.791491] ? create_worker+0x1a0/0x1a0 [ 1229.791493] worker_thread+0x30/0x390 [ 1229.791494] ? create_worker+0x1a0/0x1a0 [ 1229.791495] kthread+0x10a/0x120 [ 1229.791497] ? set_kthread_struct+0x40/0x40 [ 1229.791499] ret_from_fork+0x1f/0x40 [ 1229.791739] INFO: task targetcli:28666 blocked for more than 120 seconds. [ 1229.791740] Tainted: G OE --------- - - 4.18.0-372.9.1.el8.x86_64 #1 [ 1229.791741] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1229.791742] task:targetcli state:D stack: 0 pid:28666 ppid: 5510 flags:0x00004080 [ 1229.791743] Call Trace: [ 1229.791744] __schedule+0x2d1/0x830 [ 1229.791746] schedule+0x35/0xa0 [ 1229.791748] schedule_preempt_disabled+0xa/0x10 [ 1229.791749] __mutex_lock.isra.7+0x310/0x420 [ 1229.791751] rdma_destroy_id+0x15/0x20 [rdma_cm] [ 1229.791755] isert_connect_release+0x115/0x130 [ib_isert] [ 1229.791757] isert_free_np+0x87/0x140 [ib_isert] [ 1229.791761] iscsit_del_np+0x74/0x120 [iscsi_target_mod] [ 1229.791776] lio_target_np_driver_store+0xe9/0x140 [iscsi_target_mod] [ 1229.791784] configfs_write_file+0xb2/0x110 [ 1229.791788] vfs_write+0xa5/0x1a0 [ 1229.791792] ksys_write+0x4f/0xb0 [ 1229.791794] do_syscall_64+0x5b/0x1a0 [ 1229.791798] entry_SYSCALL_64_after_hwframe+0x65/0xca Fixes: bd3792205aae ("iser-target: Fix pending connections handling in target stack shutdown sequnce") Reviewed-by: Sagi Grimberg Signed-off-by: Selvin Xavier Signed-off-by: Saravanan Vajravel Link: https://lore.kernel.org/r/20230606102531.162967-2-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f290cd49698e..b4809d237250 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2431,6 +2431,7 @@ isert_free_np(struct iscsi_np *np) { struct isert_np *isert_np = np->np_context; struct isert_conn *isert_conn, *n; + LIST_HEAD(drop_conn_list); if (isert_np->cm_id) rdma_destroy_id(isert_np->cm_id); @@ -2450,7 +2451,7 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } @@ -2461,11 +2462,16 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } mutex_unlock(&isert_np->mutex); + list_for_each_entry_safe(isert_conn, n, &drop_conn_list, node) { + list_del_init(&isert_conn->node); + isert_connect_release(isert_conn); + } + np->np_context = NULL; kfree(isert_np); } -- cgit v1.2.3 From 7651e2d6c5b359a28c2d4c904fec6608d1021ca8 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:30 -0700 Subject: IB/isert: Fix possible list corruption in CMA handler When ib_isert module receives connection error event, it is releasing the isert session and removes corresponding list node but it doesn't take appropriate mutex lock to remove the list node. This can lead to linked list corruption Fixes: bd3792205aae ("iser-target: Fix pending connections handling in target stack shutdown sequnce") Signed-off-by: Selvin Xavier Signed-off-by: Saravanan Vajravel Link: https://lore.kernel.org/r/20230606102531.162967-3-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b4809d237250..00a7303c8cc6 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -657,9 +657,13 @@ static int isert_connect_error(struct rdma_cm_id *cma_id) { struct isert_conn *isert_conn = cma_id->qp->qp_context; + struct isert_np *isert_np = cma_id->context; ib_drain_qp(isert_conn->qp); + + mutex_lock(&isert_np->mutex); list_del_init(&isert_conn->node); + mutex_unlock(&isert_np->mutex); isert_conn->cm_id = NULL; isert_put_conn(isert_conn); -- cgit v1.2.3 From 699826f4e30ab76a62c238c86fbef7e826639c8d Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:31 -0700 Subject: IB/isert: Fix incorrect release of isert connection The ib_isert module is releasing the isert connection both in isert_wait_conn() handler as well as isert_free_conn() handler. In isert_wait_conn() handler, it is expected to wait for iSCSI session logout operation to complete. It should free the isert connection only in isert_free_conn() handler. When a bunch of iSER target is cleared, this issue can lead to use-after-free memory issue as isert conn is twice released Fixes: b02efbfc9a05 ("iser-target: Fix implicit termination of connections") Reviewed-by: Sagi Grimberg Signed-off-by: Saravanan Vajravel Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/20230606102531.162967-4-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 00a7303c8cc6..92e1e7587af8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2570,8 +2570,6 @@ static void isert_wait_conn(struct iscsit_conn *conn) isert_put_unsol_pending_cmds(conn); isert_wait4cmds(conn); isert_wait4logout(isert_conn); - - queue_work(isert_release_wq, &isert_conn->release_work); } static void isert_free_conn(struct iscsit_conn *conn) -- cgit v1.2.3 From 0c7e314a6352664e12ec465f576cf039e95f8369 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 12 Jun 2023 10:50:33 -0500 Subject: RDMA/rxe: Fix rxe_cq_post A recent patch replaced a tasklet execution of cq->comp_handler by a direct call. While this made sense it let changes to cq->notify state be unprotected and assumed that the cq completion machinery and the ulp done callbacks were reentrant. The result is that in some cases completion events can be lost. This patch moves the cq->comp_handler call inside of the spinlock in rxe_cq_post which solves both issues. This is compatible with the matching code in the request notify verb. Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c") Link: https://lore.kernel.org/r/20230612155032.17036-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 20ff0c0c4605..6ca2a05b6a2a 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -113,8 +113,6 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); - spin_unlock_irqrestore(&cq->cq_lock, flags); - if ((cq->notify == IB_CQ_NEXT_COMP) || (cq->notify == IB_CQ_SOLICITED && solicited)) { cq->notify = 0; @@ -122,6 +120,8 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } + spin_unlock_irqrestore(&cq->cq_lock, flags); + return 0; } -- cgit v1.2.3