From e8d67fa5696e2fcaf956dae36d11e6eff5246101 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean
Date: Thu, 30 May 2019 00:51:26 +0300
Subject: net: dsa: sja1105: Don't store frame type in skb->cb

Due to a confusion I thought that eth_type_trans() was called by the
network stack, whereas it can actually be called by network drivers to
figure out the skb protocol and next packet_type handlers.

In light of the above, it is not safe to store the frame type from the
DSA tagger's .filter callback (first entry point on RX path), since
GRO is yet to be invoked on the received traffic.  Hence it is very
likely that the skb->cb will actually get overwritten between
eth_type_trans() and the actual DSA packet_type handler.

Of course, what this patch fixes is the actual overwriting of the
SJA1105_SKB_CB(skb)->type field from the GRO layer, which made all
frames be seen as SJA1105_FRAME_TYPE_NORMAL (0).

Fixes: 227d07a07ef1 ("net: dsa: sja1105: Add support for traffic through standalone ports")
Signed-off-by: Vladimir Oltean
Signed-off-by: David S. Miller
---
 net/dsa/tag_sja1105.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 969402c7dbf1..d43737e6c3fb 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -28,14 +28,10 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb)
  */
 static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev)
 {
-	if (sja1105_is_link_local(skb)) {
-		SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_LINK_LOCAL;
+	if (sja1105_is_link_local(skb))
 		return true;
-	}
-	if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) {
-		SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_NORMAL;
+	if (!dsa_port_is_vlan_filtering(dev->dsa_ptr))
 		return true;
-	}
 	return false;
 }
@@ -84,7 +80,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
 
 	skb->offload_fwd_mark = 1;
 
-	if (SJA1105_SKB_CB(skb)->type == SJA1105_FRAME_TYPE_LINK_LOCAL) {
+	if (sja1105_is_link_local(skb)) {
 		/* Management traffic path. Switch embeds the switch ID and
 		 * port ID into bytes of the destination MAC, courtesy of
 		 * the incl_srcpt options.
--
cgit v1.2.3

From afa0925c6fcc6a8f610e996ca09bc3215048033c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Fri, 31 May 2019 12:37:23 -0400
Subject: packet: unconditionally free po->rollover

Rollover used to use a complex RCU mechanism for assignment, which had
a race condition.  The below patch fixed the bug and greatly
simplified the logic.

The feature depends on fanout, but the state is private to the socket.
fanout_release() returns f only when the last member leaves and the
fanout struct is to be freed.

Destroy rollover unconditionally, regardless of fanout state.

Fixes: 57f015f5eccf2 ("packet: fix crash in fanout_demux_rollover()")
Reported-by: syzbot
Diagnosed-by: Dmitry Vyukov
Signed-off-by: Willem de Bruijn
Signed-off-by: David S. Miller
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fbc775fbf712..d4889bf7248e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3014,8 +3014,8 @@ static int packet_release(struct socket *sock)
 
 	synchronize_net();
 
+	kfree(po->rollover);
 	if (f) {
-		kfree(po->rollover);
 		fanout_release_data(f);
 		kfree(f);
 	}
--
cgit v1.2.3
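A note on why the one-line move above is safe: kfree(NULL) is a no-op,
so the socket's private rollover state can be released without
consulting the fanout state at all.  A minimal userspace sketch of the
same teardown pattern (all names here are illustrative, not taken from
the kernel):

	#include <stdlib.h>

	struct fanout { int refcnt; };         /* shared group state */

	struct sock_state {
		void *rollover;                /* private, may be NULL */
	};

	/* Mirrors the fixed packet_release(): private state is freed
	 * unconditionally (free(NULL) does nothing), while the shared
	 * fanout is freed only when the caller got back a non-NULL f,
	 * i.e. when the last group member left.
	 */
	static void sock_state_release(struct sock_state *s, struct fanout *f)
	{
		free(s->rollover);
		s->rollover = NULL;
		if (f)
			free(f);
	}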
From 27393f8c6efc03b8e0b64134721b0d337fca0a80 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 4 Jun 2019 12:00:11 -0700
Subject: Revert "net/tls: avoid NULL-deref on resync during device removal"

This reverts commit 38030d7cb77963ba84cdbe034806e2b81245339f.
Unfortunately the RX resync may get called from soft IRQ, so we can't
take the rwsem to protect from the device disappearing.

Signed-off-by: Jakub Kicinski
Signed-off-by: David S. Miller
---
 net/tls/tls_device.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b95c408fd771..49b3a2ff8ef3 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -553,8 +553,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
-	struct net_device *netdev;
 	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
@@ -568,15 +568,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
-		seq += TLS_HEADER_SIZE - 1;
-		down_read(&device_offload_lock);
-		netdev = tls_ctx->netdev;
-		if (netdev)
-			netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq,
-							      rcd_sn);
-		up_read(&device_offload_lock);
-	}
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
+						      seq + TLS_HEADER_SIZE - 1,
+						      rcd_sn);
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
--
cgit v1.2.3

From e52972c11d6b1262964db96d65934196db621685 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 4 Jun 2019 12:00:12 -0700
Subject: net/tls: replace the sleeping lock around RX resync with a bit lock

Commit 38030d7cb779 ("net/tls: avoid NULL-deref on resync during
device removal") tried to fix a potential NULL-dereference by taking
the context rwsem.  Unfortunately the RX resync may get called from
soft IRQ, so we can't use the rwsem to protect from the device
disappearing.

Because we are guaranteed there can be only one resync at a time (it's
called from strparser) use a bit to indicate resync is busy and make
device removal wait for the bit to get cleared.

Note that there is a leftover "flags" field in struct tls_context
already.

Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload")
Signed-off-by: Jakub Kicinski
Signed-off-by: David S. Miller
---
 net/tls/tls_device.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 49b3a2ff8ef3..1f9cf57d9754 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -550,10 +550,22 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 	}
 }
 
+static void tls_device_resync_rx(struct tls_context *tls_ctx,
+				 struct sock *sk, u32 seq, u64 rcd_sn)
+{
+	struct net_device *netdev;
+
+	if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags)))
+		return;
+	netdev = READ_ONCE(tls_ctx->netdev);
+	if (netdev)
+		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn);
+	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
+}
+
 void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct net_device *netdev = tls_ctx->netdev;
 	struct tls_offload_context_rx *rx_ctx;
 	u32 is_req_pending;
 	s64 resync_req;
@@ -568,10 +580,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
 	is_req_pending = resync_req;
 
 	if (unlikely(is_req_pending) && req_seq == seq &&
-	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
-		netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
-						      seq + TLS_HEADER_SIZE - 1,
-						      rcd_sn);
+	    atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) {
+		seq += TLS_HEADER_SIZE - 1;
+		tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
+	}
 }
 
 static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
@@ -972,7 +984,10 @@ static int tls_device_down(struct net_device *netdev)
 		if (ctx->rx_conf == TLS_HW)
 			netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
 							TLS_OFFLOAD_CTX_DIR_RX);
-		ctx->netdev = NULL;
+		WRITE_ONCE(ctx->netdev, NULL);
+		smp_mb__before_atomic(); /* pairs with test_and_set_bit() */
+		while (test_bit(TLS_RX_SYNC_RUNNING, &ctx->flags))
+			usleep_range(10, 200);
 		dev_put(netdev);
 		list_del_init(&ctx->list);
--
cgit v1.2.3
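The synchronization scheme introduced above is a reusable pattern: the
non-sleeping hot path takes a bit lock around a READ_ONCE() of the
device pointer, while the sleeping removal path clears the pointer and
then spins until the bit drops.  A self-contained userspace
approximation with C11 atomics - the kernel uses
test_and_set_bit()/clear_bit_unlock() and usleep_range(), and the
struct and function names below are invented for the sketch:

	#include <stdatomic.h>
	#include <stddef.h>
	#include <unistd.h>

	struct dev { void (*resync)(void); };

	static _Atomic(struct dev *) netdev;  /* stands in for ctx->netdev */
	static atomic_int sync_running;       /* stands in for TLS_RX_SYNC_RUNNING */

	static void rx_resync(void)           /* atomic context: must not sleep */
	{
		if (atomic_exchange(&sync_running, 1))
			return;               /* resyncs are serialized anyway */
		struct dev *d = atomic_load(&netdev);
		if (d)
			d->resync();          /* safe: removal waits for us */
		atomic_store(&sync_running, 0);
	}

	static void device_down(struct dev *d) /* process context: may sleep */
	{
		atomic_store(&netdev, NULL);   /* resync sees NULL from now on */
		while (atomic_load(&sync_running))
			usleep(100);           /* drain an in-flight resync */
		/* nothing can reach d's ops anymore; safe to free d */
	}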
From 82ba25c6de200d7a9e9c970c998cdd6dfa8637ae Mon Sep 17 00:00:00 2001
From: Tim Beale
Date: Tue, 4 Jun 2019 13:56:23 +1200
Subject: udp: only choose unbound UDP socket for multicast when not in a VRF

By default, packets received in another VRF should not be passed to an
unbound socket in the default VRF.  This patch updates the IPv4 UDP
multicast logic to match the unicast VRF logic (in compute_score()),
as well as the IPv6 mcast logic (in __udp_v6_is_mcast_sock()).

The particular case I noticed was DHCP discover packets going to the
255.255.255.255 address, which are handled by
__udp4_lib_mcast_deliver().  The previous code meant that running
multiple different DHCP server or relay agent instances across VRFs
did not work correctly - any server/relay agent in the default VRF
received DHCP discover packets for all other VRFs.

Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF")
Signed-off-by: Tim Beale
Reviewed-by: David Ahern
Signed-off-by: David S. Miller
---
 net/ipv4/udp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8fb250ed53d4..efe9283a8a95 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -538,8 +538,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	    (inet->inet_dport != rmt_port && inet->inet_dport) ||
 	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
 	    ipv6_only_sock(sk) ||
-	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
-	     sk->sk_bound_dev_if != sdif))
+	    !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
 		return false;
 	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
 		return false;
--
cgit v1.2.3
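For context, the semantics of the helper adopted above (paraphrased
from the inet_bound_dev_eq() logic that the unicast path already used;
treat this as a sketch rather than the exact kernel source):

	#include <stdbool.h>

	/* dif:  index of the interface the packet arrived on
	 * sdif: index of the enslaving L3 master (VRF) device, 0 if none
	 */
	static bool bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
				 int dif, int sdif)
	{
		if (!bound_dev_if)
			/* unbound socket: only accept a packet coming in
			 * through a VRF if the l3mdev_accept sysctl
			 * explicitly allows it
			 */
			return !sdif || l3mdev_accept;
		return bound_dev_if == dif || bound_dev_if == sdif;
	}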
From fdf71426e7c548d6e4f5e51e0238732d60e05f5f Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Tue, 4 Jun 2019 11:44:06 +0200
Subject: net: fix indirect calls helpers for ptype list hooks.

As Eric noted, the current wrapper for the ptype func hook inside
__netif_receive_skb_list_ptype() has no chance of avoiding the
indirect call: we enter such a code path only for protocols other than
ipv4 and ipv6.  Instead we can wrap the list_func invocation.

v1 -> v2:
 - use the correct fix tag

Fixes: f5737cbadb7d ("net: use indirect calls helpers for ptype hook")
Suggested-by: Eric Dumazet
Signed-off-by: Paolo Abeni
Acked-by: Edward Cree
Reviewed-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 66f7508825bd..1c4593ec4409 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5025,12 +5025,12 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head,
 	if (list_empty(head))
 		return;
 	if (pt_prev->list_func != NULL)
-		pt_prev->list_func(head, pt_prev, orig_dev);
+		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
+				   ip_list_rcv, head, pt_prev, orig_dev);
 	else
 		list_for_each_entry_safe(skb, next, head, list) {
 			skb_list_del_init(skb);
-			INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
-					   skb->dev, pt_prev, orig_dev);
+			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 		}
 }
--
cgit v1.2.3
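For reference, the INDIRECT_CALL_*() helpers avoid the retpoline cost
by comparing the function pointer against the expected targets and
making a direct call on a match.  Simplified from
include/linux/indirect_call_wrapper.h (the real macros also wrap the
comparison in likely()):

	#define INDIRECT_CALL_1(f, f1, ...) \
		((f) == (f1) ? (f1)(__VA_ARGS__) : (f)(__VA_ARGS__))

	#define INDIRECT_CALL_2(f, f2, f1, ...) \
		((f) == (f2) ? (f2)(__VA_ARGS__) : \
			       INDIRECT_CALL_1(f, f1, __VA_ARGS__))

INDIRECT_CALL_INET() expands to one of these depending on whether IPv6
is built in, which is why the wrapper only pays off on paths where the
ipv4/ipv6 handlers are the common case - the point of the fix above.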
From 0a90478b93a46bdcd56ba33c37566a993e455d54 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Sun, 2 Jun 2019 19:10:24 +0800
Subject: ipv4: not do cache for local delivery if bc_forwarding is enabled

With the topo:

    h1 ---| rp1           |
          |     route  rp3|--- h3 (192.168.200.1)
    h2 ---| rp2           |

If rp1 bc_forwarding is set while rp2 bc_forwarding is not, after
doing "ping 192.168.200.255" on h1, then ping 192.168.200.255 on h2,
the packets can still be forwarded.

This issue was caused by the input route cache.  It should only do the
cache for either bc forwarding or local delivery.  Otherwise, local
delivery can use the route cache for bc forwarding of other
interfaces.

This patch is to fix it by not doing cache for local delivery if
all.bc_forwarding is enabled.

Note that we don't fix it by checking route cache local flag after
rt_cache_valid() in "local_input:" and "ip_mkroute_input", as the
common route code shouldn't be touched for bc_forwarding.

Fixes: 5cbf777cfdf6 ("route: add support for directed broadcast forwarding")
Reported-by: Jianlin Shi
Signed-off-by: Xin Long
Signed-off-by: David S. Miller
---
 net/ipv4/route.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11ddc276776e..91bf75b8ac1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1985,7 +1985,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	u32		itag = 0;
 	struct rtable	*rth;
 	struct flowi4	fl4;
-	bool do_cache;
+	bool do_cache = true;
 
 	/* IP on this device is disabled. */
 
@@ -2062,6 +2062,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type == RTN_BROADCAST) {
 		if (IN_DEV_BFORWARD(in_dev))
 			goto make_route;
+		/* not do cache if bc_forwarding is enabled */
+		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
+			do_cache = false;
 		goto brd_input;
 	}
 
@@ -2099,18 +2102,15 @@ brd_input:
 	RT_CACHE_STAT_INC(in_brd);
 
 local_input:
-	do_cache = false;
-	if (res->fi) {
-		if (!itag) {
-			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+	do_cache &= res->fi && !itag;
+	if (do_cache) {
+		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
 
-			rth = rcu_dereference(nhc->nhc_rth_input);
-			if (rt_cache_valid(rth)) {
-				skb_dst_set_noref(skb, &rth->dst);
-				err = 0;
-				goto out;
-			}
-			do_cache = true;
+		rth = rcu_dereference(nhc->nhc_rth_input);
+		if (rt_cache_valid(rth)) {
+			skb_dst_set_noref(skb, &rth->dst);
+			err = 0;
+			goto out;
 		}
 	}
--
cgit v1.2.3
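Condensed, the caching rule the patch implements looks like the sketch
below; the helper and parameter names are invented here, and the real
decision is spread across ip_route_input_slow() as shown in the diff:

	#include <stdbool.h>

	static bool may_cache_input_route(bool dev_bc_forwarding,
					  bool all_bc_forwarding,
					  bool is_broadcast)
	{
		if (!is_broadcast)
			return true;
		if (dev_bc_forwarding)
			return true;   /* forwarding path caches per nexthop */
		/* local delivery of a broadcast: if any interface may
		 * forward broadcasts, skip the shared per-nexthop cache
		 * so the two paths cannot pick up each other's entries
		 */
		return !all_bc_forwarding;
	}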
From b50e058746ba29f517e27299447831ab3d93f896 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun
Date: Mon, 3 Jun 2019 08:48:19 -0400
Subject: net: rds: fix memory leak when unload rds_rdma
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When KASAN is enabled, after several rds connections are created and
"rmmod rds_rdma" is run, the following will appear.

"
BUG rds_ib_incoming (Not tainted): Objects remaining in
rds_ib_incoming on __kmem_cache_shutdown()

Call Trace:
 dump_stack+0x71/0xab
 slab_err+0xad/0xd0
 __kmem_cache_shutdown+0x17d/0x370
 shutdown_cache+0x17/0x130
 kmem_cache_destroy+0x1df/0x210
 rds_ib_recv_exit+0x11/0x20 [rds_rdma]
 rds_ib_exit+0x7a/0x90 [rds_rdma]
 __x64_sys_delete_module+0x224/0x2c0
 ? __ia32_sys_delete_module+0x2c0/0x2c0
 do_syscall_64+0x73/0x190
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
"

This is an rds connection memory leak.  The root cause is:

When "rmmod rds_rdma" is run, rds_ib_remove_one will call
rds_ib_dev_shutdown to drop the rds connections.
rds_ib_dev_shutdown will call rds_conn_drop to drop rds connections as
below.

"
rds_conn_path_drop(&conn->c_path[0], false);
"

In the above, destroy is set to false.

void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
{
	atomic_set(&cp->cp_state, RDS_CONN_ERROR);

	rcu_read_lock();
	if (!destroy && rds_destroy_pending(cp->cp_conn)) {
		rcu_read_unlock();
		return;
	}
	queue_work(rds_wq, &cp->cp_down_w);
	rcu_read_unlock();
}

In the above function, destroy is set to false.  rds_destroy_pending
is called.  This does not move rds connections to ib_nodev_conns.  So
destroy is set to true to move rds connections to ib_nodev_conns.  In
rds_ib_unregister_client, flush_workqueue is called to make rds_wq
finish shutting down rds connections.  The function
rds_ib_destroy_nodev_conns is called to shutdown rds connections
finally.  Then rds_ib_recv_exit is called to destroy slab.

void rds_ib_recv_exit(void)
{
	kmem_cache_destroy(rds_ib_incoming_slab);
	kmem_cache_destroy(rds_ib_frag_slab);
}

The above slab memory leak will not occur again.

From tests,

256 rds connections
[root@ca-dev14 ~]# time rmmod rds_rdma

real    0m16.522s
user    0m0.000s
sys     0m8.152s

512 rds connections
[root@ca-dev14 ~]# time rmmod rds_rdma

real    0m32.054s
user    0m0.000s
sys     0m15.568s

To rmmod rds_rdma with 256 rds connections, about 16 seconds are
needed.  And with 512 rds connections, about 32 seconds are needed.

From ftrace, when one rds connection is destroyed:

"
 19)               |  rds_conn_destroy [rds]() {
 19)   7.782 us    |    rds_conn_path_drop [rds]();
 15)               |  rds_shutdown_worker [rds]() {
 15)               |    rds_conn_shutdown [rds]() {
 15)   1.651 us    |      rds_send_path_reset [rds]();
 15)   7.195 us    |    }
 15) + 11.434 us   |  }
 19)   2.285 us    |    rds_cong_remove_conn [rds]();
 19) * 24062.76 us |  }
"

So if many rds connections will be destroyed, the function
rds_ib_destroy_nodev_conns uses most of the time.

Suggested-by: Håkon Bugge
Signed-off-by: Zhu Yanjun
Signed-off-by: David S. Miller
---
 net/rds/ib.c      | 2 +-
 net/rds/ib_recv.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib.c b/net/rds/ib.c
index 2da9b75bad16..b8d581b779b2 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -87,7 +87,7 @@ static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
 
 	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
 	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
-		rds_conn_drop(ic->conn);
+		rds_conn_path_drop(&ic->conn->c_path[0], true);
 	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
 }
 
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 8946c89d7392..3cae88cbdaa0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -168,6 +168,7 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
 		list_del(&inc->ii_cache_entry);
 		WARN_ON(!list_empty(&inc->ii_frags));
 		kmem_cache_free(rds_ib_incoming_slab, inc);
+		atomic_dec(&rds_ib_allocation);
 	}
 
 	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
@@ -1057,6 +1058,8 @@ out:
 
 void rds_ib_recv_exit(void)
 {
+	WARN_ON(atomic_read(&rds_ib_allocation));
+
 	kmem_cache_destroy(rds_ib_incoming_slab);
 	kmem_cache_destroy(rds_ib_frag_slab);
 }
--
cgit v1.2.3
From 0a8dd9f67cd0da7dc284f48b032ce00db1a68791 Mon Sep 17 00:00:00 2001
From: Neil Horman
Date: Mon, 3 Jun 2019 16:32:59 -0400
Subject: Fix memory leak in sctp_process_init

syzbot found the following leak in sctp_process_init

BUG: memory leak
unreferenced object 0xffff88810ef68400 (size 1024):
  comm "syz-executor273", pid 7046, jiffies 4294945598 (age 28.770s)
  hex dump (first 32 bytes):
    1d de 28 8d de 0b 1b e3 b5 c2 f9 68 fd 1a 97 25  ..(........h...%
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000a02cebbd>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline]
    [<00000000a02cebbd>] slab_post_alloc_hook mm/slab.h:439 [inline]
    [<00000000a02cebbd>] slab_alloc mm/slab.c:3326 [inline]
    [<00000000a02cebbd>] __do_kmalloc mm/slab.c:3658 [inline]
    [<00000000a02cebbd>] __kmalloc_track_caller+0x15d/0x2c0 mm/slab.c:3675
    [<000000009e6245e6>] kmemdup+0x27/0x60 mm/util.c:119
    [<00000000dfdc5d2d>] kmemdup include/linux/string.h:432 [inline]
    [<00000000dfdc5d2d>] sctp_process_init+0xa7e/0xc20 net/sctp/sm_make_chunk.c:2437
    [<00000000b58b62f8>] sctp_cmd_process_init net/sctp/sm_sideeffect.c:682 [inline]
    [<00000000b58b62f8>] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1384 [inline]
    [<00000000b58b62f8>] sctp_side_effects net/sctp/sm_sideeffect.c:1194 [inline]
    [<00000000b58b62f8>] sctp_do_sm+0xbdc/0x1d60 net/sctp/sm_sideeffect.c:1165
    [<0000000044e11f96>] sctp_assoc_bh_rcv+0x13c/0x200 net/sctp/associola.c:1074
    [<00000000ec43804d>] sctp_inq_push+0x7f/0xb0 net/sctp/inqueue.c:95
    [<00000000726aa954>] sctp_backlog_rcv+0x5e/0x2a0 net/sctp/input.c:354
    [<00000000d9e249a8>] sk_backlog_rcv include/net/sock.h:950 [inline]
    [<00000000d9e249a8>] __release_sock+0xab/0x110 net/core/sock.c:2418
    [<00000000acae44fa>] release_sock+0x37/0xd0 net/core/sock.c:2934
    [<00000000963cc9ae>] sctp_sendmsg+0x2c0/0x990 net/sctp/socket.c:2122
    [<00000000a7fc7565>] inet_sendmsg+0x64/0x120 net/ipv4/af_inet.c:802
    [<00000000b732cbd3>] sock_sendmsg_nosec net/socket.c:652 [inline]
    [<00000000b732cbd3>] sock_sendmsg+0x54/0x70 net/socket.c:671
    [<00000000274c57ab>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2292
    [<000000008252aedb>] __sys_sendmsg+0x80/0xf0 net/socket.c:2330
    [<00000000f7bf23d1>] __do_sys_sendmsg net/socket.c:2339 [inline]
    [<00000000f7bf23d1>] __se_sys_sendmsg net/socket.c:2337 [inline]
    [<00000000f7bf23d1>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2337
    [<00000000a8b4131f>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:3

The problem was that the peer.cookie value points to an skb allocated
area on the first pass through this function, at which point it is
overwritten with a heap allocated value, but in certain cases, where a
COOKIE_ECHO chunk is included in the packet, a second pass through
sctp_process_init is made, where the cookie value is re-allocated,
leaking the first allocation.

Fix is to always allocate the cookie value, and free it when we are
done using it.

Signed-off-by: Neil Horman
Reported-by: syzbot+f7e9153b037eac9b1df8@syzkaller.appspotmail.com
CC: Marcelo Ricardo Leitner
CC: "David S. Miller"
CC: netdev@vger.kernel.org
Acked-by: Marcelo Ricardo Leitner
Signed-off-by: David S. Miller
---
 net/sctp/sm_make_chunk.c | 13 +++----------
 net/sctp/sm_sideeffect.c |  5 +++++
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 92331e1195c1..f17908f5c4f3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2312,7 +2312,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	union sctp_addr addr;
 	struct sctp_af *af;
 	int src_match = 0;
-	char *cookie;
 
 	/* We must include the address that the INIT packet came from.
 	 * This is the only address that matters for an INIT packet.
@@ -2416,14 +2415,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 	/* Peer Rwnd : Current calculated value of the peer's rwnd. */
 	asoc->peer.rwnd = asoc->peer.i.a_rwnd;
 
-	/* Copy cookie in case we need to resend COOKIE-ECHO.
-	 */
-	cookie = asoc->peer.cookie;
-	if (cookie) {
-		asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp);
-		if (!asoc->peer.cookie)
-			goto clean_up;
-	}
-
 	/* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
 	 * high (for example, implementations MAY use the size of the receiver
 	 * advertised window).
@@ -2592,7 +2583,9 @@ do_addr_param:
 	case SCTP_PARAM_STATE_COOKIE:
 		asoc->peer.cookie_len =
 			ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
-		asoc->peer.cookie = param.cookie->body;
+		asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp);
+		if (!asoc->peer.cookie)
+			retval = 0;
 		break;
 
 	case SCTP_PARAM_HEARTBEAT_INFO:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9b50da548db2..a554d6d15d1b 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -883,6 +883,11 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
 			asoc->rto_initial;
 	}
 
+	if (sctp_state(asoc, ESTABLISHED)) {
+		kfree(asoc->peer.cookie);
+		asoc->peer.cookie = NULL;
+	}
+
 	if (sctp_state(asoc, ESTABLISHED) ||
 	    sctp_state(asoc, CLOSED) ||
 	    sctp_state(asoc, SHUTDOWN_RECEIVED)) {
--
cgit v1.2.3
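The rule the fix enforces is an ownership one: a pointer into received
packet memory must be duplicated into heap memory the association owns
before it can outlive the packet, and the owner frees it exactly once.
A tiny userspace analogue of the kmemdup() helper the patch now calls
at parse time (sketch, not kernel code):

	#include <stdlib.h>
	#include <string.h>

	/* Take an owned copy of a buffer that currently points into a
	 * packet; the caller is responsible for exactly one free().
	 */
	static void *memdup(const void *src, size_t len)
	{
		void *p = malloc(len);

		if (p)
			memcpy(p, src, len);
		return p;
	}

With the fix, the duplicate is made once when SCTP_PARAM_STATE_COOKIE
is parsed and freed when the association reaches ESTABLISHED, so a
second pass through sctp_process_init() can no longer leak the first
copy.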
From 0ee4e76937d69128a6a66861ba393ebdc2ffc8a2 Mon Sep 17 00:00:00 2001
From: Vivien Didelot
Date: Mon, 3 Jun 2019 16:57:13 -0400
Subject: ethtool: fix potential userspace buffer overflow

ethtool_get_regs() allocates a buffer of size ops->get_regs_len(), and
passes it to the kernel driver via ops->get_regs() for filling.

There is no restriction about what the kernel drivers can or cannot do
with the open ethtool_regs structure.  They usually set regs->version
and ignore regs->len or set it to the same size as
ops->get_regs_len().

But if userspace allocates a smaller buffer for the registers dump, we
would cause a userspace buffer overflow in the final copy_to_user()
call, which uses the regs.len value potentially reset by the driver.

To fix this, make this case obvious and store regs.len before calling
ops->get_regs(), to only copy as much data as requested by userspace,
up to the value returned by ops->get_regs_len().

While at it, remove the redundant check for non-null regbuf.

Signed-off-by: Vivien Didelot
Reviewed-by: Michal Kubecek
Signed-off-by: David S. Miller
---
 net/core/ethtool.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 43e9add58340..1a0196fbb49c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1359,13 +1359,16 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 		if (!regbuf)
 			return -ENOMEM;
 
+	if (regs.len < reglen)
+		reglen = regs.len;
+
 	ops->get_regs(dev, &regs, regbuf);
 
 	ret = -EFAULT;
 	if (copy_to_user(useraddr, &regs, sizeof(regs)))
 		goto out;
 	useraddr += offsetof(struct ethtool_regs, data);
-	if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
+	if (copy_to_user(useraddr, regbuf, reglen))
 		goto out;
 	ret = 0;
--
cgit v1.2.3
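The general discipline behind the fix: when a callee may rewrite a
length field, capture the bound you intend to enforce before the call
and copy the smaller of the caller-supplied and kernel-known sizes.  A
hedged userspace sketch of that copy-out clamp (names invented for the
example):

	#include <string.h>

	/* n_user:   size of the buffer the caller provided
	 * n_kernel: size of the data we actually hold
	 * Both are captured before any callee can rewrite them.
	 */
	static size_t bounded_copy(void *dst, size_t n_user,
				   const void *src, size_t n_kernel)
	{
		size_t n = n_user < n_kernel ? n_user : n_kernel;

		memcpy(dst, src, n);
		return n;
	}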
From 4970b42d5c362bf873982db7d93245c5281e58f4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu
Date: Wed, 5 Jun 2019 12:27:14 +0800
Subject: Revert "fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit e9919a24d3022f72bcadc407e73a6ef17093a849.

Nathan reported that the new behaviour breaks Android, as Android just
adds new rules and deletes old ones.  If we return 0 without adding
the duplicate rule, Android will remove the newly added rule, causing
the system to soft-reboot.

Fixes: e9919a24d302 ("fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied")
Reported-by: Nathan Chancellor
Reported-by: Yaro Slav
Reported-by: Maciej Żenczykowski
Signed-off-by: Hangbin Liu
Reviewed-by: Nathan Chancellor
Tested-by: Nathan Chancellor
Signed-off-by: David S. Miller
---
 net/core/fib_rules.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 43f0115cce9c..18f8dd8329ed 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -757,9 +757,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		goto errout;
 
-	if (rule_exists(ops, frh, tb, rule)) {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
-			err = -EEXIST;
+	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
+	    rule_exists(ops, frh, tb, rule)) {
+		err = -EEXIST;
 		goto errout_free;
 	}
--
cgit v1.2.3
From 59e3e4b52663a9d97efbce7307f62e4bc5c9ce91 Mon Sep 17 00:00:00 2001
From: Olivier Matz
Date: Thu, 6 Jun 2019 09:15:18 +0200
Subject: ipv6: use READ_ONCE() for inet->hdrincl as in ipv4

As it was done in commit 8f659a03a0ba ("net: ipv4: fix for a race
condition in raw_sendmsg") and commit 20b50d79974e ("net: ipv4:
emulate READ_ONCE() on ->hdrincl bit-field in raw_sendmsg()") for
ipv4, copy the value of inet->hdrincl in a local variable, to avoid
introducing a race condition in the next commit.

Signed-off-by: Olivier Matz
Signed-off-by: David S. Miller
---
 net/ipv6/raw.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 96a3559f2a09..af2f9a833c4e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -783,6 +783,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	struct flowi6 fl6;
 	struct ipcm6_cookie ipc6;
 	int addr_len = msg->msg_namelen;
+	int hdrincl;
 	u16 proto;
 	int err;
 
@@ -796,6 +797,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
 
+	/* hdrincl should be READ_ONCE(inet->hdrincl)
+	 * but READ_ONCE() doesn't work with bit fields.
+	 * Doing this indirectly yields the same result.
+	 */
+	hdrincl = inet->hdrincl;
+	hdrincl = READ_ONCE(hdrincl);
+
 	/*
 	 *	Get and verify the address.
 	 */
@@ -908,7 +916,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		fl6.flowi6_oif = np->ucast_oif;
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	if (inet->hdrincl)
+	if (hdrincl)
 		fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
 
 	if (ipc6.tclass < 0)
@@ -931,7 +939,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		goto do_confirm;
 
 back_from_confirm:
-	if (inet->hdrincl)
+	if (hdrincl)
 		err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
 					msg->msg_flags, &ipc6.sockc);
 	else {
--
cgit v1.2.3

From b9aa52c4cb457e7416cc0c95f475e72ef4a61336 Mon Sep 17 00:00:00 2001
From: Olivier Matz
Date: Thu, 6 Jun 2019 09:15:19 +0200
Subject: ipv6: fix EFAULT on sendto with icmpv6 and hdrincl

The following code returns EFAULT (Bad address):

	s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
	setsockopt(s, SOL_IPV6, IPV6_HDRINCL, 1);
	sendto(ipv6_icmp6_packet, addr);   /* returns -1, errno = EFAULT */

The IPv4 equivalent code works.  A workaround is to use IPPROTO_RAW
instead of IPPROTO_ICMPV6.

The failure happens because 2 bytes are eaten from the msghdr by
rawv6_probe_proto_opt() starting from commit 19e3c66b52ca ("ipv6
equivalent of "ipv4: Avoid reading user iov twice after
raw_probe_proto_opt""), but at that time it was not a problem because
IPV6_HDRINCL was not yet introduced.

Only eat these 2 bytes if hdrincl == 0.

Fixes: 715f504b1189 ("ipv6: add IPV6_HDRINCL option for raw sockets")
Signed-off-by: Olivier Matz
Acked-by: Nicolas Dichtel
Signed-off-by: David S. Miller
---
 net/ipv6/raw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index af2f9a833c4e..1bb88b4b677b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -895,11 +895,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		opt = ipv6_fixup_options(&opt_space, opt);
 
 	fl6.flowi6_proto = proto;
-	rfv.msg = msg;
-	rfv.hlen = 0;
-	err = rawv6_probe_proto_opt(&rfv, &fl6);
-	if (err)
-		goto out;
+
+	if (!hdrincl) {
+		rfv.msg = msg;
+		rfv.hlen = 0;
+		err = rawv6_probe_proto_opt(&rfv, &fl6);
+		if (err)
+			goto out;
+	}
 
 	if (!ipv6_addr_any(daddr))
 		fl6.daddr = *daddr;
--
cgit v1.2.3
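The snippet from the commit message expands to roughly the following
runnable reproducer; the packet payload and the availability of the
IPV6_HDRINCL define in libc headers are assumptions of this sketch (on
older toolchains the constant may need to come from the kernel UAPI
headers):

	#include <netinet/in.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned char pkt[48] = { 0x60 };  /* placeholder IPv6 header + ICMPv6 body */
		struct sockaddr_in6 addr = { .sin6_family = AF_INET6,
					     .sin6_addr = IN6ADDR_LOOPBACK_INIT };
		int one = 1;
		int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

		if (s < 0)
			return 1;
		if (setsockopt(s, SOL_IPV6, IPV6_HDRINCL, &one, sizeof(one)) < 0)
			return 1;
		if (sendto(s, pkt, sizeof(pkt), 0,
			   (struct sockaddr *)&addr, sizeof(addr)) < 0)
			perror("sendto");  /* EFAULT before the fix */
		close(s);
		return 0;
	}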
From 85cb928787eab6a2f4ca9d2a798b6f3bed53ced1 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun
Date: Thu, 6 Jun 2019 04:00:03 -0400
Subject: net: rds: fix memory leak in rds_ib_flush_mr_pool

When the following tests last for several hours, the problem will
occur.

Server:
    rds-stress -r 1.1.1.16 -D 1M
Client:
    rds-stress -r 1.1.1.14 -s 1.1.1.16 -D 1M -T 30

The following will occur.

"
Starting up....
tsks  tx/s  rx/s  tx+rx K/s  mbi K/s  mbo K/s  tx us/c  rtt us  cpu %
   1     0     0       0.00     0.00     0.00     0.00    0.00  -1.00
   1     0     0       0.00     0.00     0.00     0.00    0.00  -1.00
   1     0     0       0.00     0.00     0.00     0.00    0.00  -1.00
   1     0     0       0.00     0.00     0.00     0.00    0.00  -1.00
"

From vmcore, we can find that clean_list is NULL.

From the source code, rds_mr_flushd calls rds_ib_mr_pool_flush_worker.
Then rds_ib_mr_pool_flush_worker calls

"
rds_ib_flush_mr_pool(pool, 0, NULL);
"

Then in function

"
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                         int free_all, struct rds_ib_mr **ibmr_ret)
"

ibmr_ret is NULL.

In the source code,

"
...
list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
if (ibmr_ret)
	*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);

/* more than one entry in llist nodes */
if (clean_nodes->next)
	llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
...
"

When ibmr_ret is NULL, llist_entry is not executed.  clean_nodes->next
instead of clean_nodes is added in clean_list.  So clean_nodes is
discarded.  It can not be used again.  The workqueue is executed
periodically.  So more and more clean_nodes are discarded.  Finally
the clean_list is NULL.  Then this problem will occur.

Fixes: 1bc144b62524 ("net, rds, Replace xlist in net/rds/xlist.h with llist")
Signed-off-by: Zhu Yanjun
Acked-by: Santosh Shilimkar
Signed-off-by: David S. Miller
---
 net/rds/ib_rdma.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index d664e9ade74d..0b347f46b2f4 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -428,12 +428,14 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		wait_clean_list_grace();
 
 		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
-		if (ibmr_ret)
+		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
-
+			clean_nodes = clean_nodes->next;
+		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes->next)
-			llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
+		if (clean_nodes)
+			llist_add_batch(clean_nodes, clean_tail,
+					&pool->clean_list);
 
 	}
--
cgit v1.2.3
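The hand-off bug generalizes to any intrusive singly-linked batch: if
the caller consumes the head node, the batch pushed back must start at
the node after it; unconditionally skipping the head (as the old code
did) leaks it whenever nobody consumes it.  A self-contained sketch of
the corrected hand-off (types and names invented here):

	#include <stddef.h>

	struct llnode { struct llnode *next; };

	/* Detach the head for the caller, if there is a caller; either
	 * way, *batch afterwards points at exactly the nodes that must
	 * be pushed back onto the clean list.
	 */
	static struct llnode *take_first(struct llnode **batch, int want_one)
	{
		struct llnode *first = NULL;

		if (want_one && *batch) {
			first = *batch;
			*batch = first->next;
		}
		return first;
	}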
From 720f1de4021f09898b8c8443f3b3e995991b6e3a Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Thu, 6 Jun 2019 15:45:03 +0200
Subject: pktgen: do not sleep with the thread lock held.

Currently, the process issuing a "start" command on the pktgen procfs
interface acquires the pktgen thread lock and never releases it until
all pktgen threads are completed.  The above can block indefinitely
any other pktgen command and any (even unrelated) netdevice removal -
as the pktgen netdev notifier acquires the same lock.

The issue is demonstrated by the following script, reported by Matteo:

ip -b - <<'EOF'
	link add type dummy
	link add type veth
	link set dummy0 up
EOF
modprobe pktgen
echo reset >/proc/net/pktgen/pgctrl
{
	echo rem_device_all
	echo add_device dummy0
} >/proc/net/pktgen/kpktgend_0
echo count 0 >/proc/net/pktgen/dummy0
echo start >/proc/net/pktgen/pgctrl &
sleep 1
rmmod veth

Fix the above by releasing the thread lock around the sleep call.

Additionally we must prevent racing with forceful rmmod - as the
thread lock no longer protects from them.  Instead, acquire a
self-reference before waiting for any thread.  As a side effect,
running rmmod pktgen while some thread is running now fails with
"module in use" error; before this patch such a command hung
indefinitely.

Note: the issue predates the commit reported in the fixes tag, but
this fix can't be applied before the mentioned commit.

v1 -> v2:
 - no need to check for thread existence after flipping the lock,
   pktgen threads are freed only at net exit time

Fixes: 6146e6a43b35 ("[PKTGEN]: Removes thread_{un,}lock() macros.")
Reported-and-tested-by: Matteo Croce
Signed-off-by: Paolo Abeni
Signed-off-by: David S. Miller
---
 net/core/pktgen.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 319ad5490fb3..a33d03c05ed6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3066,7 +3066,13 @@ static int pktgen_wait_thread_run(struct pktgen_thread *t)
 {
 	while (thread_is_running(t)) {
 
+		/* note: 't' will still be around even after the unlock/lock
+		 * cycle because pktgen_thread threads are only cleared at
+		 * net exit
+		 */
+		mutex_unlock(&pktgen_thread_lock);
 		msleep_interruptible(100);
+		mutex_lock(&pktgen_thread_lock);
 
 		if (signal_pending(current))
 			goto signal;
@@ -3081,6 +3087,10 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
 	struct pktgen_thread *t;
 	int sig = 1;
 
+	/* prevent from racing with rmmod */
+	if (!try_module_get(THIS_MODULE))
+		return sig;
+
 	mutex_lock(&pktgen_thread_lock);
 
 	list_for_each_entry(t, &pn->pktgen_threads, th_list) {
@@ -3094,6 +3104,7 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
 			t->control |= (T_STOP);
 
 	mutex_unlock(&pktgen_thread_lock);
+	module_put(THIS_MODULE);
 	return sig;
 }
--
cgit v1.2.3
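The locking change above follows a classic recipe: never sleep under a
mutex that a notifier or another control path also needs; drop it
across the sleep and re-take it before looking at shared state again.
A compact pthreads rendering of the fixed wait loop (illustrative
names, not kernel code):

	#include <pthread.h>
	#include <unistd.h>

	static pthread_mutex_t thread_lock = PTHREAD_MUTEX_INITIALIZER;

	/* 'running' is shared state normally inspected under thread_lock;
	 * the lock is released for the sleep so other control operations
	 * (or a netdev-notifier analogue) can make progress meanwhile.
	 */
	static void wait_for_stop(const volatile int *running)
	{
		pthread_mutex_lock(&thread_lock);
		while (*running) {
			pthread_mutex_unlock(&thread_lock);
			usleep(100 * 1000);        /* msleep(100) analogue */
			pthread_mutex_lock(&thread_lock);
		}
		pthread_mutex_unlock(&thread_lock);
	}

The try_module_get()/module_put() pair covers the second half of the
fix: once the lock can be dropped, module unload is instead fenced by
elevating the module refcount for the duration of the wait.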