From 3bf3a7c6985c625f64e73baefdaa36f1c2045a29 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 Apr 2023 01:02:42 +0000 Subject: RDMA/rtrs: Fix the last iu->buf leak in err path The last iu->buf will leak if ib_dma_mapping_error() fails. Fixes: c0894b3ea69d ("RDMA/rtrs: core: lib functions shared between client and server modules") Link: https://lore.kernel.org/r/1682384563-2-3-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: Guoqing Jiang Acked-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4bf9d868cc52..3696f367ff51 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -37,8 +37,10 @@ struct rtrs_iu *rtrs_iu_alloc(u32 iu_num, size_t size, gfp_t gfp_mask, goto err; iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, dir); - if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) + if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) { + kfree(iu->buf); goto err; + } iu->cqe.done = done; iu->size = size; -- cgit v1.2.3 From 9c29c8c7df0688f358d2df5ddd16c97c2f7292b4 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 25 Apr 2023 01:02:43 +0000 Subject: RDMA/rtrs: Fix rxe_dealloc_pd warning In current design: 1. PD and clt_path->s.dev are shared among connections. 2. every con[n]'s cleanup phase will call destroy_con_cq_qp() 3. clt_path->s.dev will be always decreased in destroy_con_cq_qp(), and when clt_path->s.dev become zero, it will destroy PD. 4. when con[1] failed to create, con[1] will not take clt_path->s.dev, but it try to decreased clt_path->s.dev So, in case create_cm(con[0]) succeeds but create_cm(con[1]) fails, destroy_con_cq_qp(con[1]) will be called first which will destroy the PD while this PD is still taken by con[0]. Here, we refactor the error path of create_cm() and init_conns(), so that we do the cleanup in the order they are created. The warning occurs when destroying RXE PD whose reference count is not zero. rnbd_client L597: Mapping device /dev/nvme0n1 on session client, (access_mode: rw, nr_poll_queues: 0) ------------[ cut here ]------------ WARNING: CPU: 0 PID: 26407 at drivers/infiniband/sw/rxe/rxe_pool.c:256 __rxe_cleanup+0x13a/0x170 [rdma_rxe] Modules linked in: rpcrdma rdma_ucm ib_iser rnbd_client libiscsi rtrs_client scsi_transport_iscsi rtrs_core rdma_cm iw_cm ib_cm crc32_generic rdma_rxe udp_tunnel ib_uverbs ib_core kmem device_dax nd_pmem dax_pmem nd_vme crc32c_intel fuse nvme_core nfit libnvdimm dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 26407 Comm: rnbd-client.sh Kdump: loaded Not tainted 6.2.0-rc6-roce-flush+ #53 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__rxe_cleanup+0x13a/0x170 [rdma_rxe] Code: 45 84 e4 0f 84 5a ff ff ff 48 89 ef e8 5f 18 71 f9 84 c0 75 90 be c8 00 00 00 48 89 ef e8 be 89 1f fa 85 c0 0f 85 7b ff ff ff <0f> 0b 41 bc ea ff ff ff e9 71 ff ff ff e8 84 7f 1f fa e9 d0 fe ff RSP: 0018:ffffb09880b6f5f0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff99401f15d6a8 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffbac8234b RDI: 00000000ffffffff RBP: ffff99401f15d6d0 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000002d82 R11: 0000000000000000 R12: 0000000000000001 R13: ffff994101eff208 R14: ffffb09880b6f6a0 R15: 00000000fffffe00 FS: 00007fe113904740(0000) GS:ffff99413bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff6cde656c8 CR3: 000000001f108004 CR4: 00000000001706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rxe_dealloc_pd+0x16/0x20 [rdma_rxe] ib_dealloc_pd_user+0x4b/0x80 [ib_core] rtrs_ib_dev_put+0x79/0xd0 [rtrs_core] destroy_con_cq_qp+0x8a/0xa0 [rtrs_client] init_path+0x1e7/0x9a0 [rtrs_client] ? __pfx_autoremove_wake_function+0x10/0x10 ? lock_is_held_type+0xd7/0x130 ? rcu_read_lock_sched_held+0x43/0x80 ? pcpu_alloc+0x3dd/0x7d0 ? rtrs_clt_init_stats+0x18/0x40 [rtrs_client] rtrs_clt_open+0x24f/0x5a0 [rtrs_client] ? __pfx_rnbd_clt_link_ev+0x10/0x10 [rnbd_client] rnbd_clt_map_device+0x6a5/0xe10 [rnbd_client] Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Link: https://lore.kernel.org/r/1682384563-2-4-git-send-email-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: Jack Wang Tested-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 55 ++++++++++++++-------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index edb2e3a25880..cfb50bfe53c3 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2040,6 +2040,7 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +/* The caller should do the cleanup in case of error */ static int create_cm(struct rtrs_clt_con *con) { struct rtrs_path *s = con->c.path; @@ -2062,14 +2063,14 @@ static int create_cm(struct rtrs_clt_con *con) err = rdma_set_reuseaddr(cm_id, 1); if (err != 0) { rtrs_err(s, "Set address reuse failed, err: %d\n", err); - goto destroy_cm; + return err; } err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, (struct sockaddr *)&clt_path->s.dst_addr, RTRS_CONNECT_TIMEOUT_MS); if (err) { rtrs_err(s, "Failed to resolve address, err: %d\n", err); - goto destroy_cm; + return err; } /* * Combine connection status and session events. This is needed @@ -2084,29 +2085,15 @@ static int create_cm(struct rtrs_clt_con *con) if (err == 0) err = -ETIMEDOUT; /* Timedout or interrupted */ - goto errr; - } - if (con->cm_err < 0) { - err = con->cm_err; - goto errr; + return err; } - if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) { + if (con->cm_err < 0) + return con->cm_err; + if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) /* Device removal */ - err = -ECONNABORTED; - goto errr; - } + return -ECONNABORTED; return 0; - -errr: - stop_cm(con); - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); -destroy_cm: - destroy_cm(con); - - return err; } static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path) @@ -2334,7 +2321,7 @@ static void rtrs_clt_close_work(struct work_struct *work) static int init_conns(struct rtrs_clt_path *clt_path) { unsigned int cid; - int err; + int err, i; /* * On every new session connections increase reconnect counter @@ -2350,10 +2337,8 @@ static int init_conns(struct rtrs_clt_path *clt_path) goto destroy; err = create_cm(to_clt_con(clt_path->s.con[cid])); - if (err) { - destroy_con(to_clt_con(clt_path->s.con[cid])); + if (err) goto destroy; - } } err = alloc_path_reqs(clt_path); if (err) @@ -2364,15 +2349,21 @@ static int init_conns(struct rtrs_clt_path *clt_path) return 0; destroy: - while (cid--) { - struct rtrs_clt_con *con = to_clt_con(clt_path->s.con[cid]); + /* Make sure we do the cleanup in the order they are created */ + for (i = 0; i <= cid; i++) { + struct rtrs_clt_con *con; - stop_cm(con); + if (!clt_path->s.con[i]) + break; - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); - destroy_cm(con); + con = to_clt_con(clt_path->s.con[i]); + if (con->c.cm_id) { + stop_cm(con); + mutex_lock(&con->con_mutex); + destroy_con_cq_qp(con); + mutex_unlock(&con->con_mutex); + destroy_cm(con); + } destroy_con(con); } /* -- cgit v1.2.3 From 9a3763e87379c97a78b7c6c6f40720b1e877174f Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 17 May 2023 12:22:42 -0500 Subject: RDMA/rxe: Fix packet length checks In rxe_net.c a received packet, from udp or loopback, is passed to rxe_rcv() in rxe_recv.c as a udp packet. I.e. skb->data is pointing at the udp header. But rxe_rcv() makes length checks to verify the packet is long enough to hold the roce headers as if it were a roce packet. I.e. skb->data pointing at the bth header. A runt packet would appear to have 8 more bytes than it actually does which may lead to incorrect behavior. This patch calls skb_pull() to adjust the skb to point at the bth header before calling rxe_rcv() which fixes this error. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230517172242.1806340-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a38fab19bed1..cd59666158b1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -159,6 +159,9 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; @@ -401,6 +404,9 @@ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) return -EIO; } + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; -- cgit v1.2.3 From b00683422fd79dd07c9b75efdce1660e5e19150e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 17 May 2023 16:15:10 -0500 Subject: RDMA/rxe: Fix ref count error in check_rkey() There is a reference count error in error path code and a potential race in check_rkey() in rxe_resp.c. When looking up the rkey for a memory window the reference to the mw from rxe_lookup_mw() is dropped before a reference is taken on the mr referenced by the mw. If the mr is destroyed immediately after the call to rxe_put(mw) the mr pointer is unprotected and may end up pointing at freed memory. The rxe_get(mr) call should take place before the rxe_put(mw) call. All errors in check_rkey() call rxe_put(mw) if mw is not NULL but it was already called after the above. The mw pointer should be set to NULL after the rxe_put(mw) call to prevent this from happening. Fixes: cdd0b85675ae ("RDMA/rxe: Implement memory access through MWs") Link: https://lore.kernel.org/r/20230517211509.1819998-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 1da044f6b7d4..ee68306555b9 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -489,8 +489,9 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (mw->access & IB_ZERO_BASED) qp->resp.offset = mw->addr; - rxe_put(mw); rxe_get(mr); + rxe_put(mw); + mw = NULL; } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { -- cgit v1.2.3 From 18e7e3e4217083a682e2c7282011c70c8a1ba070 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 29 May 2023 11:35:26 -0400 Subject: RDMA/bnxt_re: Fix reporting active_{speed,width} attributes After commit 6d758147c7b8 ("RDMA/bnxt_re: Use auxiliary driver interface") the active_{speed, width} attributes are reported incorrectly, This is happening because ib_get_eth_speed() is called only once from bnxt_re_ib_init() - Fix this issue by calling ib_get_eth_speed() from bnxt_re_query_port(). Fixes: 6d758147c7b8 ("RDMA/bnxt_re: Use auxiliary driver interface") Link: https://lore.kernel.org/r/20230529153525.87254-1-kheib@redhat.com Signed-off-by: Kamal Heib Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 -- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 7 ++++--- drivers/infiniband/hw/bnxt_re/main.c | 2 -- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5a2baf49ecaa..2c95e6f3d47a 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -135,8 +135,6 @@ struct bnxt_re_dev { struct delayed_work worker; u8 cur_prio_map; - u16 active_speed; - u8 active_width; /* FP Notification Queue (CQ & SRQ) */ struct tasklet_struct nq_task; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index b1c36412025f..952811c40c54 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -199,6 +199,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -228,10 +229,10 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, port_attr->sm_sl = 0; port_attr->subnet_timeout = 0; port_attr->init_type_reply = 0; - port_attr->active_speed = rdev->active_speed; - port_attr->active_width = rdev->active_width; + rc = ib_get_eth_speed(&rdev->ibdev, port_num, &port_attr->active_speed, + &port_attr->active_width); - return 0; + return rc; } int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e34eccd178d0..3073398a2183 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1077,8 +1077,6 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } dev_info(rdev_to_dev(rdev), "Device registered with IB successfully"); - ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, - &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); event = netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev) ? -- cgit v1.2.3 From 2a62b6210ce876c596086ab8fd4c8a0c3d10611a Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 2 Jun 2023 11:54:08 +0800 Subject: RDMA/rxe: Fix the use-before-initialization error of resp_pkts In the following: Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106 assign_lock_key kernel/locking/lockdep.c:982 [inline] register_lock_class+0xdb6/0x1120 kernel/locking/lockdep.c:1295 __lock_acquire+0x10a/0x5df0 kernel/locking/lockdep.c:4951 lock_acquire kernel/locking/lockdep.c:5691 [inline] lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5656 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x3d/0x60 kernel/locking/spinlock.c:162 skb_dequeue+0x20/0x180 net/core/skbuff.c:3639 drain_resp_pkts drivers/infiniband/sw/rxe/rxe_comp.c:555 [inline] rxe_completer+0x250d/0x3cc0 drivers/infiniband/sw/rxe/rxe_comp.c:652 rxe_qp_do_cleanup+0x1be/0x820 drivers/infiniband/sw/rxe/rxe_qp.c:761 execute_in_process_context+0x3b/0x150 kernel/workqueue.c:3473 __rxe_cleanup+0x21e/0x370 drivers/infiniband/sw/rxe/rxe_pool.c:233 rxe_create_qp+0x3f6/0x5f0 drivers/infiniband/sw/rxe/rxe_verbs.c:583 This is a use-before-initialization problem. It happens because rxe_qp_do_cleanup is called during error unwind before the struct has been fully initialized. Move the initialization of the skb earlier. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230602035408.741534-1-yanjun.zhu@intel.com Reported-by: syzbot+eba589d8f49c73d356da@syzkaller.appspotmail.com Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 61a2eb77d999..a0f206431cf8 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -176,6 +176,9 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); + skb_queue_head_init(&qp->req_pkts); + skb_queue_head_init(&qp->resp_pkts); + atomic_set(&qp->ssn, 0); atomic_set(&qp->skb_out, 0); } @@ -234,8 +237,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - skb_queue_head_init(&qp->req_pkts); - rxe_init_task(&qp->req.task, qp, rxe_requester); rxe_init_task(&qp->comp.task, qp, rxe_completer); @@ -279,8 +280,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, } } - skb_queue_head_init(&qp->resp_pkts); - rxe_init_task(&qp->resp.task, qp, rxe_responder); qp->resp.opcode = OPCODE_NONE; -- cgit v1.2.3 From ee4d269eccfea6c17b18281bef482700d898e86f Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 5 Jun 2023 13:33:17 +0300 Subject: RDMA/mlx5: Initiate dropless RQ for RAW Ethernet functions Delay drop data is initiated for PFs that have the capability of rq_delay_drop and are in roce profile. However, PFs with RAW ethernet profile do not initiate delay drop data on function load, causing kernel panic if delay drop struct members are accessed later on in case a dropless RQ is created. Thus, stage the delay drop initialization as part of RAW ethernet PF loading process. Fixes: b5ca15ad7e61 ("IB/mlx5: Add proper representors support") Signed-off-by: Maher Sanalla Reviewed-by: Maor Gottlieb Link: https://lore.kernel.org/r/2e9d386785043d48c38711826eb910315c1de141.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5d45de223c43..f0b394ed7452 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4275,6 +4275,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, mlx5_ib_restrack_init, NULL), -- cgit v1.2.3 From e1f4a52ac171dd863fe89055e749ef5e0a0bc5ce Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 5 Jun 2023 13:33:18 +0300 Subject: RDMA/mlx5: Create an indirect flow table for steering anchor A misbehaved user can create a steering anchor that points to a kernel flow table and then destroy the anchor without freeing the associated STC. This creates a problem as the kernel can't destroy the flow table since there is still a reference to it. As a result, this can exhaust all available flow table resources, preventing other users from using the RDMA device. To prevent this problem, a solution is implemented where a special flow table with two steering rules is created when a user creates a steering anchor for the first time. The rules include one that drops all traffic and another that points to the kernel flow table. If the steering anchor is destroyed, only the rule pointing to the kernel's flow table is removed. Any traffic reaching the special flow table after that is dropped. Since the special flow table is not destroyed when the steering anchor is destroyed, any issues are prevented from occurring. The remaining resources are only destroyed when the RDMA device is destroyed, which happens after all DEVX objects are freed, including the STCs, thus mitigating the issue. Fixes: 0c6ab0ca9a66 ("RDMA/mlx5: Expose steering anchor to userspace") Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Link: https://lore.kernel.org/r/b4a88a871d651fa4e8f98d552553c1cfe9ba2cd6.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/fs.c | 276 ++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/fs.h | 16 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 ++ 3 files changed, 296 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 3008632a6c20..1e419e080b53 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -695,8 +695,6 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) - ft_attr.uid = MLX5_SHARED_RESOURCE_UID; ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; @@ -2025,6 +2023,237 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject, return 0; } +static int steering_anchor_create_ft(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + + if (ft_prio->anchor.ft) + return 0; + + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (!ns) + return -EOPNOTSUPP; + + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; + ft_attr.prio = 0; + ft_attr.max_fte = 2; + ft_attr.level = 1; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return PTR_ERR(ft); + + ft_prio->anchor.ft = ft; + + return 0; +} + +static void steering_anchor_destroy_ft(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.ft) { + mlx5_destroy_flow_table(ft_prio->anchor.ft); + ft_prio->anchor.ft = NULL; + } +} + +static int +steering_anchor_create_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_drop) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + + ft_prio->anchor.fg_drop = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_drop) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_drop); + ft_prio->anchor.fg_drop = NULL; + } +} + +static int +steering_anchor_create_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_goto_table) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + ft_prio->anchor.fg_goto_table = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_goto_table) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_goto_table); + ft_prio->anchor.fg_goto_table = NULL; + } +} + +static int +steering_anchor_create_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if (ft_prio->anchor.rule_drop) + return 0; + + flow_act.fg = ft_prio->anchor.fg_drop; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + NULL, 0); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_drop = handle; + + return 0; +} + +static void steering_anchor_destroy_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_drop) { + mlx5_del_flow_rules(ft_prio->anchor.rule_drop); + ft_prio->anchor.rule_drop = NULL; + } +} + +static int +steering_anchor_create_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if (ft_prio->anchor.rule_goto_table) + return 0; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.fg = ft_prio->anchor.fg_goto_table; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = ft_prio->flow_table; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + &dest, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_goto_table = handle; + + return 0; +} + +static void +steering_anchor_destroy_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_goto_table) { + mlx5_del_flow_rules(ft_prio->anchor.rule_goto_table); + ft_prio->anchor.rule_goto_table = NULL; + } +} + +static int steering_anchor_create_res(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + int err; + + err = steering_anchor_create_ft(dev, ft_prio, ns_type); + if (err) + return err; + + err = steering_anchor_create_fg_drop(ft_prio); + if (err) + goto destroy_ft; + + err = steering_anchor_create_fg_goto_table(ft_prio); + if (err) + goto destroy_fg_drop; + + err = steering_anchor_create_rule_drop(ft_prio); + if (err) + goto destroy_fg_goto_table; + + err = steering_anchor_create_rule_goto_table(ft_prio); + if (err) + goto destroy_rule_drop; + + return 0; + +destroy_rule_drop: + steering_anchor_destroy_rule_drop(ft_prio); +destroy_fg_goto_table: + steering_anchor_destroy_fg_goto_table(ft_prio); +destroy_fg_drop: + steering_anchor_destroy_fg_drop(ft_prio); +destroy_ft: + steering_anchor_destroy_ft(ft_prio); + + return err; +} + +static void mlx5_steering_anchor_destroy_res(struct mlx5_ib_flow_prio *ft_prio) +{ + steering_anchor_destroy_rule_goto_table(ft_prio); + steering_anchor_destroy_rule_drop(ft_prio); + steering_anchor_destroy_fg_goto_table(ft_prio); + steering_anchor_destroy_fg_drop(ft_prio); + steering_anchor_destroy_ft(ft_prio); +} + static int steering_anchor_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -2035,6 +2264,9 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return -EBUSY; mutex_lock(&obj->dev->flow_db->lock); + if (!--obj->ft_prio->anchor.rule_goto_table_ref) + steering_anchor_destroy_rule_goto_table(obj->ft_prio); + put_flow_table(obj->dev, obj->ft_prio, true); mutex_unlock(&obj->dev->flow_db->lock); @@ -2042,6 +2274,24 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return 0; } +static void fs_cleanup_anchor(struct mlx5_ib_flow_prio *prio, + int count) +{ + while (count--) + mlx5_steering_anchor_destroy_res(&prio[count]); +} + +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) +{ + fs_cleanup_anchor(dev->flow_db->prios, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->egress_prios, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->sniffer, MLX5_IB_NUM_SNIFFER_FTS); + fs_cleanup_anchor(dev->flow_db->egress, MLX5_IB_NUM_EGRESS_FTS); + fs_cleanup_anchor(dev->flow_db->fdb, MLX5_IB_NUM_FDB_FTS); + fs_cleanup_anchor(dev->flow_db->rdma_rx, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->rdma_tx, MLX5_IB_NUM_FLOW_FT); +} + static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *obj) { @@ -2182,21 +2432,31 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return -ENOMEM; mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); if (IS_ERR(ft_prio)) { - mutex_unlock(&dev->flow_db->lock); err = PTR_ERR(ft_prio); goto free_obj; } ft_prio->refcount++; - ft_id = mlx5_flow_table_id(ft_prio->flow_table); - mutex_unlock(&dev->flow_db->lock); + + if (!ft_prio->anchor.rule_goto_table_ref) { + err = steering_anchor_create_res(dev, ft_prio, ns_type); + if (err) + goto put_flow_table; + } + + ft_prio->anchor.rule_goto_table_ref++; + + ft_id = mlx5_flow_table_id(ft_prio->anchor.ft); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, &ft_id, sizeof(ft_id)); if (err) - goto put_flow_table; + goto destroy_res; + + mutex_unlock(&dev->flow_db->lock); uobj->object = obj; obj->dev = dev; @@ -2205,8 +2465,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return 0; +destroy_res: + --ft_prio->anchor.rule_goto_table_ref; + mlx5_steering_anchor_destroy_res(ft_prio); put_flow_table: - mutex_lock(&dev->flow_db->lock); put_flow_table(dev, ft_prio, true); mutex_unlock(&dev->flow_db->lock); free_obj: diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index ad320adaf321..b9734904f5f0 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); #else static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) { @@ -21,9 +22,24 @@ static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) mutex_init(&dev->flow_db->lock); return 0; } + +inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} #endif + static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { + /* When a steering anchor is created, a special flow table is also + * created for the user to reference. Since the user can reference it, + * the kernel cannot trust that when the user destroys the steering + * anchor, they no longer reference the flow table. + * + * To address this issue, when a user destroys a steering anchor, only + * the flow steering rule in the table is destroyed, but the table + * itself is kept to deal with the above scenario. The remaining + * resources are only removed when the RDMA device is destroyed, which + * is a safe assumption that all references are gone. + */ + mlx5_ib_fs_cleanup_anchor(dev); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index efa4dc6e7dee..91fc0cdf377d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -237,8 +237,19 @@ enum { #define MLX5_IB_NUM_SNIFFER_FTS 2 #define MLX5_IB_NUM_EGRESS_FTS 1 #define MLX5_IB_NUM_FDB_FTS MLX5_BY_PASS_NUM_REGULAR_PRIOS + +struct mlx5_ib_anchor { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg_goto_table; + struct mlx5_flow_group *fg_drop; + struct mlx5_flow_handle *rule_goto_table; + struct mlx5_flow_handle *rule_drop; + unsigned int rule_goto_table_ref; +}; + struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; + struct mlx5_ib_anchor anchor; unsigned int refcount; }; -- cgit v1.2.3 From c2ea687e5e0e29802714a981a1ccdc17c2c4b2be Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:19 +0300 Subject: RDMA/mlx5: Fix Q-counters per vport allocation Previously Q-counters data was being allocated over the PF for all of the available vports, however that isn't necessary. Since each VF or SF has a Q-counter allocated for itself. So we only need to allocate two counters data structures, one for the device counters, and one for all the other vports to expose the representors, since they only need to read from it in order to determine mainly counters numbers and names, so they can all share. This in turn also solves a bug we previously had where we couldn't switch the device to switchdev mode when there were more than 128 SF/VFs configured, since that is the maximum amount of Q-counters available for a single port Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/f54671df16e2227a069b229b33b62cd9ee24c475.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 1c06920505d2..3d7ef81a50b8 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -209,7 +209,8 @@ static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev, !vport_qcounters_supported(dev)) || !port_num) return &dev->port[0].cnts; - return &dev->port[port_num - 1].cnts; + return is_mdev_switchdev_mode(dev->mdev) ? + &dev->port[1].cnts : &dev->port[port_num - 1].cnts; } /** @@ -262,7 +263,7 @@ static struct rdma_hw_stats * mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - const struct mlx5_ib_counters *cnts = &dev->port[port_num - 1].cnts; + const struct mlx5_ib_counters *cnts = get_counters(dev, port_num); return do_alloc_stats(cnts); } @@ -725,11 +726,11 @@ err: static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int i, j; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); @@ -761,15 +762,22 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int err = 0; int i; bool is_shared; MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + + /* + * In switchdev we need to allocate two ports, one that is used for + * the device Q_counters and it is essentially the real Q_counters of + * this device, while the other is used as a helper for PF to be able to + * query all other vports. + */ + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); for (i = 0; i < num_cnt_ports; i++) { err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts, i); -- cgit v1.2.3 From e80ef139488fb4f481c877a81f6ba54f1f0ee416 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:20 +0300 Subject: RDMA/mlx5: Remove vport Q-counters dependency on normal Q-counters Previously the Q-counters initialization assumed that the vport Q-counters structures and the normal Q-counters structures are identical in size, and hence when a Q-counter was added to normal Q-counters structure but not to the vport Q-counters struct it would lead to that counter name being NULL in switchdev mode, which could cause the kernel crash below. Currently break the dependency between those two structure and always use the appropriate struct size, in order to remove the assumption that both structure sizes are equal. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 20c64a067 P4D 20c64a067 PUD 20152b067 PMD 0 Oops: 0000 [#1] SMP CPU: 19 PID: 11717 Comm: devlink Tainted: G OE 6.2.0_mlnx #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:strlen+0x0/0x20 Code: 66 2e 0f 1f 84 00 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 48 89 f8 74 10 48 83 c7 01 80 3f 00 75 f7 48 29 c7 48 89 RSP: 0018:ffffc9000318b618 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000002c00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888211918110 R09: ffff888211918000 R10: 000000000000001e R11: ffff888211918000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881038ec250 FS: 00007fa53342fe80(0000) GS:ffff88885fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000002042b2003 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: kernfs_name_hash+0x12/0x80 kernfs_find_ns+0x35/0xb0 kernfs_remove_by_name_ns+0x46/0xc0 remove_files.isra.1+0x30/0x70 internal_create_group+0x253/0x380 internal_create_groups.part.4+0x3e/0xa0 setup_port+0x27a/0x8c0 [ib_core] ib_setup_port_attrs+0x9d/0x300 [ib_core] ib_register_device+0x48e/0x550 [ib_core] __mlx5_ib_add+0x2b/0x80 [mlx5_ib] mlx5_ib_vport_rep_load+0x141/0x360 [mlx5_ib] mlx5_esw_offloads_rep_load+0x48/0xa0 [mlx5_core] esw_offloads_enable+0x41e/0xd10 [mlx5_core] mlx5_eswitch_enable_locked+0x1e3/0x340 [mlx5_core] ? __cond_resched+0x15/0x30 mlx5_devlink_eswitch_mode_set+0x204/0x3c0 [mlx5_core] devlink_nl_cmd_eswitch_set_doit+0x8d/0x100 genl_family_rcv_msg_doit.isra.19+0xea/0x110 genl_rcv_msg+0x19b/0x290 ? devlink_nl_cmd_region_read_dumpit+0x760/0x760 ? devlink_nl_cmd_port_param_get_doit+0x30/0x30 ? devlink_put+0x50/0x50 ? genl_get_cmd_both+0x60/0x60 netlink_rcv_skb+0x54/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1be/0x2a0 netlink_sendmsg+0x361/0x4d0 sock_sendmsg+0x30/0x40 __sys_sendto+0x11a/0x150 ? handle_mm_fault+0x101/0x2b0 ? do_user_addr_fault+0x21d/0x720 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7fa533611cba Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 c3 0f 1f 44 00 00 55 48 83 ec 30 44 89 4c RSP: 002b:00007ffdb6a898a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000000daab00 RCX: 00007fa533611cba RDX: 0000000000000038 RSI: 0000000000daab00 RDI: 0000000000000003 RBP: 0000000000daa910 R08: 00007fa533822000 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_core(OE) mlxdevm(OE) ib_uverbs(OE) ib_core(OE) mlx_compat(OE) mlxfw(OE) memtrack(OE) pci_hyperv_intf nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_filter iptable_nat dns_resolver nf_nat br_netfilter nfs bridge stp llc lockd grace fscache netfs rfkill overlay iTCO_wdt iTCO_vendor_support kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel i2c_i801 sunrpc lpc_ich sha512_ssse3 pcspkr i2c_smbus mfd_core drm sch_fq_codel i2c_core ip_tables fuse crc32c_intel serio_raw virtio_net net_failover failover [last unloaded: mlxfw] CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:strlen+0x0/0x20 Code: 66 2e 0f 1f 84 00 00 00 00 00 48 01 fe eb 0f 0f b6 07 38 d0 74 10 48 83 c7 01 84 c0 74 05 48 39 f7 75 ec 31 c0 c3 48 89 f8 c3 <80> 3f 00 48 89 f8 74 10 48 83 c7 01 80 3f 00 75 f7 48 29 c7 48 89 RSP: 0018:ffffc9000318b618 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000002c00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888211918110 R09: ffff888211918000 R10: 000000000000001e R11: ffff888211918000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: ffff8881038ec250 FS: 00007fa53342fe80(0000) GS:ffff88885fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000002042b2003 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception ]--- Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/016777b7f16eb6bb178999ff59097d0c0f91f68a.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 58 ++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 3d7ef81a50b8..f40d9c61e30b 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -576,43 +576,53 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, bool is_vport = is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF; const struct mlx5_ib_counter *names; - int j = 0, i; + int j = 0, i, size; names = is_vport ? vport_basic_q_cnts : basic_q_cnts; - for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = basic_q_cnts[i].offset; + offsets[j] = names[i].offset; } names = is_vport ? vport_out_of_seq_q_cnts : out_of_seq_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { - for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = out_of_seq_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_retrans_q_cnts : retrans_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { - for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = retrans_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_extended_err_cnts : extended_err_cnts; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { - for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = extended_err_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_roce_accl_cnts : roce_accl_cnts; + size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) { - for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = roce_accl_cnts[i].offset; + offsets[j] = names[i].offset; } } @@ -662,25 +672,37 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, struct mlx5_ib_counters *cnts, u32 port_num) { - u32 num_counters, num_op_counters = 0; + bool is_vport = is_mdev_switchdev_mode(dev->mdev) && + port_num != MLX5_VPORT_PF; + u32 num_counters, num_op_counters = 0, size; - num_counters = ARRAY_SIZE(basic_q_cnts); + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + num_counters = size; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) - num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) - num_counters += ARRAY_SIZE(retrans_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) - num_counters += ARRAY_SIZE(extended_err_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) - num_counters += ARRAY_SIZE(roce_accl_cnts); + num_counters += size; cnts->num_q_counters = num_counters; - if (is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF) + if (is_vport) goto skip_non_qcounters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { -- cgit v1.2.3 From 2de43f5b5137e03ef64a2c3f064a01992830c67f Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 5 Jun 2023 13:33:21 +0300 Subject: RDMA/mlx5: Fix Q-counters query in LAG mode Previously we used the core device associated to the IB device in order to do the Q-counters query to the FW, but in LAG mode it is possible that the core device isn't the one that created this VF. Hence instead of using the core device to query the Q-counters we use the ESW core device which is guaranteed to be that of the VF. Fixes: d22467a71ebe ("RDMA/mlx5: Expand switchdev Q-counters to expose representor statistics") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://lore.kernel.org/r/778d7d7a24892348d0bdef17d2e5f9e044717e86.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index f40d9c61e30b..93257fa5aae8 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -330,6 +330,7 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, { u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + struct mlx5_core_dev *mdev; __be32 val; int ret, i; @@ -337,12 +338,16 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, dev->port[port_num].rep->vport == MLX5_VPORT_UPLINK) return 0; + mdev = mlx5_eswitch_get_core_dev(dev->port[port_num].rep->esw); + if (!mdev) + return -EOPNOTSUPP; + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); MLX5_SET(query_q_counter_in, in, other_vport, 1); MLX5_SET(query_q_counter_in, in, vport_number, dev->port[port_num].rep->vport); MLX5_SET(query_q_counter_in, in, aggregate, 1); - ret = mlx5_cmd_exec_inout(dev->mdev, query_q_counter, in, out); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); if (ret) return ret; -- cgit v1.2.3 From 58030c76cce473b6cfd630bbecb97215def0dff8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 5 Jun 2023 13:33:23 +0300 Subject: RDMA/cma: Always set static rate to 0 for RoCE Set static rate to 0 as it should be discovered by path query and has no meaning for RoCE. This also avoid of using the rtnl lock and ethtool API, which is a bottleneck when try to setup many rdma-cm connections at the same time, especially with multiple processes. Fixes: 3c86aa70bf67 ("RDMA/cm: Add RDMA CM support for IBoE devices") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/f72a4f8b667b803aee9fa794069f61afb5839ce4.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93a1c48d0c32..6b3f4384e46a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3295,7 +3295,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate @@ -4964,7 +4964,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (!ndev) return -ENODEV; - ib.rec.rate = iboe_get_rate(ndev); + ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); -- cgit v1.2.3 From 0cadb4db79e1d9eea66711c4031e435c2191907e Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 5 Jun 2023 13:33:24 +0300 Subject: RDMA/uverbs: Restrict usage of privileged QKEYs According to the IB specification rel-1.6, section 3.5.3: "QKEYs with the most significant bit set are considered controlled QKEYs, and a HCA does not allow a consumer to arbitrarily specify a controlled QKEY." Thus, block non-privileged users from setting such a QKEY. Cc: stable@vger.kernel.org Fixes: bc38a6abdd5a ("[PATCH] IB uverbs: core implementation") Signed-off-by: Edward Srouji Link: https://lore.kernel.org/r/c00c809ddafaaf87d6f6cb827978670989a511b3.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4796f6a8828c..e836c9c477f6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1850,8 +1850,13 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mtu = cmd->base.path_mtu; if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) attr->path_mig_state = cmd->base.path_mig_state; - if (cmd->base.attr_mask & IB_QP_QKEY) + if (cmd->base.attr_mask & IB_QP_QKEY) { + if (cmd->base.qkey & IB_QP_SET_QKEY && !capable(CAP_NET_RAW)) { + ret = -EPERM; + goto release_qp; + } attr->qkey = cmd->base.qkey; + } if (cmd->base.attr_mask & IB_QP_RQ_PSN) attr->rq_psn = cmd->base.rq_psn; if (cmd->base.attr_mask & IB_QP_SQ_PSN) -- cgit v1.2.3 From 62fab312fa1683e812e605db20d4f22de3e3fb2f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Jun 2023 13:33:25 +0300 Subject: IB/uverbs: Fix to consider event queue closing also upon non-blocking mode Fix ib_uverbs_event_read() to consider event queue closing also upon non-blocking mode. Once the queue is closed (e.g. hot-plug flow) all the existing events are cleaned-up as part of ib_uverbs_free_event_queue(). An application that uses the non-blocking FD mode should get -EIO in that case to let it knows that the device was removed already. Otherwise, it can loose the indication that the device was removed and won't recover. As part of that, refactor the code to have a single flow with regards to 'is_closed' for both blocking and non-blocking modes. Fixes: 14e23bd6d221 ("RDMA/core: Fix locking in ib_uverbs_event_read") Reviewed-by: Maor Gottlieb Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/97b00116a1e1e13f8dc4ec38a5ea81cf8c030210.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_main.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fbace69672ca..7c9c79c13941 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -222,8 +222,12 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); while (list_empty(&ev_queue->event_list)) { - spin_unlock_irq(&ev_queue->lock); + if (ev_queue->is_closed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; @@ -233,12 +237,6 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, return -ERESTARTSYS; spin_lock_irq(&ev_queue->lock); - - /* If device was disassociated and no event exists set an error */ - if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) { - spin_unlock_irq(&ev_queue->lock); - return -EIO; - } } event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); -- cgit v1.2.3 From 617f5db1a626f18d5cbb7c7faf7bf8f9ea12be78 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 5 Jun 2023 13:33:26 +0300 Subject: RDMA/mlx5: Fix affinity assignment The cited commit aimed to ensure that Virtual Functions (VFs) assign a queue affinity to a Queue Pair (QP) to distribute traffic when the LAG master creates a hardware LAG. If the affinity was set while the hardware was not in LAG, the firmware would ignore the affinity value. However, this commit unintentionally assigned an affinity to QPs on the LAG master's VPORT even if the RDMA device was not marked as LAG-enabled. In most cases, this was not an issue because when the hardware entered hardware LAG configuration, the RDMA device of the LAG master would be destroyed and a new one would be created, marked as LAG-enabled. The problem arises when a user configures Equal-Cost Multipath (ECMP). In ECMP mode, traffic can be directed to different physical ports based on the queue affinity, which is intended for use by VPORTS other than the E-Switch manager. ECMP mode is supported only if both E-Switch managers are in switchdev mode and the appropriate route is configured via IP. In this configuration, the RDMA device is not destroyed, and we retain the RDMA device that is not marked as LAG-enabled. To ensure correct behavior, Send Queues (SQs) opened by the E-Switch manager through verbs should be assigned strict affinity. This means they will only be able to communicate through the native physical port associated with the E-Switch manager. This will prevent the firmware from assigning affinity and will not allow the SQs to be remapped in case of failover. Fixes: 802dcc7fc5ec ("RDMA/mlx5: Support TX port affinity for VF drivers in LAG mode") Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Link: https://lore.kernel.org/r/425b05f4da840bc684b0f7e8ebf61aeb5cef09b0.1685960567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +++ drivers/infiniband/hw/mlx5/qp.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 91fc0cdf377d..2dfa6f49a6f4 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1598,6 +1598,9 @@ static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) MLX5_CAP_PORT_SELECTION(dev->mdev, port_select_flow_table_bypass)) return 0; + if (mlx5_lag_is_lacp_owner(dev->mdev) && !dev->lag_active) + return 0; + return dev->lag_active || (MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 && MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity)); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 70ca8ffa9256..78b96bfb4e6a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1237,6 +1237,9 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); + if (!mlx5_ib_lag_should_assign_affinity(dev) && + mlx5_lag_is_lacp_owner(dev->mdev)) + MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); if (qp->flags & IB_QP_CREATE_SOURCE_QPN) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); -- cgit v1.2.3 From 691b0480933f0ce88a81ed1d1a0aff340ff6293a Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:29 -0700 Subject: IB/isert: Fix dead lock in ib_isert - When a iSER session is released, ib_isert module is taking a mutex lock and releasing all pending connections. As part of this, ib_isert is destroying rdma cm_id. To destroy cm_id, rdma_cm module is sending CM events to CMA handler of ib_isert. This handler is taking same mutex lock. Hence it leads to deadlock between ib_isert & rdma_cm modules. - For fix, created local list of pending connections and release the connection outside of mutex lock. Calltrace: --------- [ 1229.791410] INFO: task kworker/10:1:642 blocked for more than 120 seconds. [ 1229.791416] Tainted: G OE --------- - - 4.18.0-372.9.1.el8.x86_64 #1 [ 1229.791418] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1229.791419] task:kworker/10:1 state:D stack: 0 pid: 642 ppid: 2 flags:0x80004000 [ 1229.791424] Workqueue: ib_cm cm_work_handler [ib_cm] [ 1229.791436] Call Trace: [ 1229.791438] __schedule+0x2d1/0x830 [ 1229.791445] ? select_idle_sibling+0x23/0x6f0 [ 1229.791449] schedule+0x35/0xa0 [ 1229.791451] schedule_preempt_disabled+0xa/0x10 [ 1229.791453] __mutex_lock.isra.7+0x310/0x420 [ 1229.791456] ? select_task_rq_fair+0x351/0x990 [ 1229.791459] isert_cma_handler+0x224/0x330 [ib_isert] [ 1229.791463] ? ttwu_queue_wakelist+0x159/0x170 [ 1229.791466] cma_cm_event_handler+0x25/0xd0 [rdma_cm] [ 1229.791474] cma_ib_handler+0xa7/0x2e0 [rdma_cm] [ 1229.791478] cm_process_work+0x22/0xf0 [ib_cm] [ 1229.791483] cm_work_handler+0xf4/0xf30 [ib_cm] [ 1229.791487] ? move_linked_works+0x6e/0xa0 [ 1229.791490] process_one_work+0x1a7/0x360 [ 1229.791491] ? create_worker+0x1a0/0x1a0 [ 1229.791493] worker_thread+0x30/0x390 [ 1229.791494] ? create_worker+0x1a0/0x1a0 [ 1229.791495] kthread+0x10a/0x120 [ 1229.791497] ? set_kthread_struct+0x40/0x40 [ 1229.791499] ret_from_fork+0x1f/0x40 [ 1229.791739] INFO: task targetcli:28666 blocked for more than 120 seconds. [ 1229.791740] Tainted: G OE --------- - - 4.18.0-372.9.1.el8.x86_64 #1 [ 1229.791741] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1229.791742] task:targetcli state:D stack: 0 pid:28666 ppid: 5510 flags:0x00004080 [ 1229.791743] Call Trace: [ 1229.791744] __schedule+0x2d1/0x830 [ 1229.791746] schedule+0x35/0xa0 [ 1229.791748] schedule_preempt_disabled+0xa/0x10 [ 1229.791749] __mutex_lock.isra.7+0x310/0x420 [ 1229.791751] rdma_destroy_id+0x15/0x20 [rdma_cm] [ 1229.791755] isert_connect_release+0x115/0x130 [ib_isert] [ 1229.791757] isert_free_np+0x87/0x140 [ib_isert] [ 1229.791761] iscsit_del_np+0x74/0x120 [iscsi_target_mod] [ 1229.791776] lio_target_np_driver_store+0xe9/0x140 [iscsi_target_mod] [ 1229.791784] configfs_write_file+0xb2/0x110 [ 1229.791788] vfs_write+0xa5/0x1a0 [ 1229.791792] ksys_write+0x4f/0xb0 [ 1229.791794] do_syscall_64+0x5b/0x1a0 [ 1229.791798] entry_SYSCALL_64_after_hwframe+0x65/0xca Fixes: bd3792205aae ("iser-target: Fix pending connections handling in target stack shutdown sequnce") Reviewed-by: Sagi Grimberg Signed-off-by: Selvin Xavier Signed-off-by: Saravanan Vajravel Link: https://lore.kernel.org/r/20230606102531.162967-2-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f290cd49698e..b4809d237250 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2431,6 +2431,7 @@ isert_free_np(struct iscsi_np *np) { struct isert_np *isert_np = np->np_context; struct isert_conn *isert_conn, *n; + LIST_HEAD(drop_conn_list); if (isert_np->cm_id) rdma_destroy_id(isert_np->cm_id); @@ -2450,7 +2451,7 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } @@ -2461,11 +2462,16 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } mutex_unlock(&isert_np->mutex); + list_for_each_entry_safe(isert_conn, n, &drop_conn_list, node) { + list_del_init(&isert_conn->node); + isert_connect_release(isert_conn); + } + np->np_context = NULL; kfree(isert_np); } -- cgit v1.2.3 From 7651e2d6c5b359a28c2d4c904fec6608d1021ca8 Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:30 -0700 Subject: IB/isert: Fix possible list corruption in CMA handler When ib_isert module receives connection error event, it is releasing the isert session and removes corresponding list node but it doesn't take appropriate mutex lock to remove the list node. This can lead to linked list corruption Fixes: bd3792205aae ("iser-target: Fix pending connections handling in target stack shutdown sequnce") Signed-off-by: Selvin Xavier Signed-off-by: Saravanan Vajravel Link: https://lore.kernel.org/r/20230606102531.162967-3-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b4809d237250..00a7303c8cc6 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -657,9 +657,13 @@ static int isert_connect_error(struct rdma_cm_id *cma_id) { struct isert_conn *isert_conn = cma_id->qp->qp_context; + struct isert_np *isert_np = cma_id->context; ib_drain_qp(isert_conn->qp); + + mutex_lock(&isert_np->mutex); list_del_init(&isert_conn->node); + mutex_unlock(&isert_np->mutex); isert_conn->cm_id = NULL; isert_put_conn(isert_conn); -- cgit v1.2.3 From 699826f4e30ab76a62c238c86fbef7e826639c8d Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Tue, 6 Jun 2023 03:25:31 -0700 Subject: IB/isert: Fix incorrect release of isert connection The ib_isert module is releasing the isert connection both in isert_wait_conn() handler as well as isert_free_conn() handler. In isert_wait_conn() handler, it is expected to wait for iSCSI session logout operation to complete. It should free the isert connection only in isert_free_conn() handler. When a bunch of iSER target is cleared, this issue can lead to use-after-free memory issue as isert conn is twice released Fixes: b02efbfc9a05 ("iser-target: Fix implicit termination of connections") Reviewed-by: Sagi Grimberg Signed-off-by: Saravanan Vajravel Signed-off-by: Selvin Xavier Link: https://lore.kernel.org/r/20230606102531.162967-4-saravanan.vajravel@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/isert/ib_isert.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 00a7303c8cc6..92e1e7587af8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2570,8 +2570,6 @@ static void isert_wait_conn(struct iscsit_conn *conn) isert_put_unsol_pending_cmds(conn); isert_wait4cmds(conn); isert_wait4logout(isert_conn); - - queue_work(isert_release_wq, &isert_conn->release_work); } static void isert_free_conn(struct iscsit_conn *conn) -- cgit v1.2.3 From 0c7e314a6352664e12ec465f576cf039e95f8369 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Mon, 12 Jun 2023 10:50:33 -0500 Subject: RDMA/rxe: Fix rxe_cq_post A recent patch replaced a tasklet execution of cq->comp_handler by a direct call. While this made sense it let changes to cq->notify state be unprotected and assumed that the cq completion machinery and the ulp done callbacks were reentrant. The result is that in some cases completion events can be lost. This patch moves the cq->comp_handler call inside of the spinlock in rxe_cq_post which solves both issues. This is compatible with the matching code in the request notify verb. Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c") Link: https://lore.kernel.org/r/20230612155032.17036-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 20ff0c0c4605..6ca2a05b6a2a 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -113,8 +113,6 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); - spin_unlock_irqrestore(&cq->cq_lock, flags); - if ((cq->notify == IB_CQ_NEXT_COMP) || (cq->notify == IB_CQ_SOLICITED && solicited)) { cq->notify = 0; @@ -122,6 +120,8 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } + spin_unlock_irqrestore(&cq->cq_lock, flags); + return 0; } -- cgit v1.2.3