From ef8d8ccdc216f797e66cb4a1372f5c4c285ce1e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Aug 2019 21:26:22 -0700 Subject: tcp: make sure EPOLLOUT wont be missed As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure"), it is crucial we properly set SOCK_NOSPACE when needed. However, Jason patch had a bug, because the 'nonblocking' status as far as sk_stream_wait_memory() is concerned is governed by MSG_DONTWAIT flag passed at sendmsg() time : long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(), and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE cleared, if sk->sk_sndtimeo has been set to a small (but not zero) value. This patch removes the 'noblock' variable since we must always set SOCK_NOSPACE if -EAGAIN is returned. It also renames the do_nonblock label since we might reach this code path even if we were in blocking mode. Fixes: 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") Signed-off-by: Eric Dumazet Cc: Jason Baron Reported-by: Vladimir Rutsky Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Jason Baron Signed-off-by: David S. Miller --- net/core/stream.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index e94bb02a5629..4f1d4aa5fb38 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; - bool noblock = (*timeo_p ? false : true); DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) @@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) { - if (noblock) - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - goto do_nonblock; - } + if (!*timeo_p) + goto do_eagain; if (signal_pending(current)) goto do_interrupted; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -169,7 +165,13 @@ out: do_error: err = -EPIPE; goto out; -do_nonblock: +do_eagain: + /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can + * be generated later. + * When TCP receives ACK packets that make room, tcp_check_space() + * only calls tcp_new_space() if SOCK_NOSPACE is set. + */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: -- cgit v1.2.3 From db38de39684dda2bf307f41797db2831deba64e9 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 21 Aug 2019 14:17:20 +0200 Subject: flow_dissector: Fix potential use-after-free on BPF_PROG_DETACH Call to bpf_prog_put(), with help of call_rcu(), queues an RCU-callback to free the program once a grace period has elapsed. The callback can run together with new RCU readers that started after the last grace period. New RCU readers can potentially see the "old" to-be-freed or already-freed pointer to the program object before the RCU update-side NULLs it. Reorder the operations so that the RCU update-side resets the protected pointer before the end of the grace period after which the program will be freed. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Reported-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Acked-by: Petar Penkov Signed-off-by: Daniel Borkmann --- net/core/flow_dissector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3e6fedb57bc1..2470b4b404e6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -142,8 +142,8 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return -ENOENT; } - bpf_prog_put(attached); RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + bpf_prog_put(attached); mutex_unlock(&flow_dissector_mutex); return 0; } -- cgit v1.2.3 From 2c238177bd7f4b14bdf7447cc1cd9bb791f147e6 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 20 Aug 2019 17:50:25 +0200 Subject: bpf: allow narrow loads of some sk_reuseport_md fields with offset > 0 test_select_reuseport fails on s390 due to verifier rejecting test_select_reuseport_kern.o with the following message: ; data_check.eth_protocol = reuse_md->eth_protocol; 18: (69) r1 = *(u16 *)(r6 +22) invalid bpf_context access off=22 size=2 This is because on big-endian machines casts from __u32 to __u16 are generated by referencing the respective variable as __u16 with an offset of 2 (as opposed to 0 on little-endian machines). The verifier already has all the infrastructure in place to allow such accesses, it's just that they are not explicitly enabled for eth_protocol field. Enable them for eth_protocol field by using bpf_ctx_range instead of offsetof. Ditto for ip_protocol, bind_inany and len, since they already allow narrowing, and the same problem can arise when working with them. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 7878f918b8c0..4c6a252d4212 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8757,13 +8757,13 @@ sk_reuseport_is_valid_access(int off, int size, return size == size_default; /* Fields that allow narrowing */ - case offsetof(struct sk_reuseport_md, eth_protocol): + case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; /* fall through */ - case offsetof(struct sk_reuseport_md, ip_protocol): - case offsetof(struct sk_reuseport_md, bind_inany): - case offsetof(struct sk_reuseport_md, len): + case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): + case bpf_ctx_range(struct sk_reuseport_md, bind_inany): + case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); -- cgit v1.2.3 From b45ce32135d1c82a5bf12aa56957c3fd27956057 Mon Sep 17 00:00:00 2001 From: zhanglin Date: Fri, 23 Aug 2019 09:14:11 +0800 Subject: sock: fix potential memory leak in proto_register() If protocols registered exceeded PROTO_INUSE_NR, prot will be added to proto_list, but no available bit left for prot in proto_inuse_idx. Changes since v2: * Propagate the error code properly Signed-off-by: zhanglin Signed-off-by: David S. Miller --- net/core/sock.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 6d08553f885c..545fac19a711 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3287,16 +3287,17 @@ static __init int net_inuse_init(void) core_initcall(net_inuse_init); -static void assign_proto_idx(struct proto *prot) +static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { pr_err("PROTO_INUSE_NR exhausted\n"); - return; + return -ENOSPC; } set_bit(prot->inuse_idx, proto_inuse_idx); + return 0; } static void release_proto_idx(struct proto *prot) @@ -3305,8 +3306,9 @@ static void release_proto_idx(struct proto *prot) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else -static inline void assign_proto_idx(struct proto *prot) +static inline int assign_proto_idx(struct proto *prot) { + return 0; } static inline void release_proto_idx(struct proto *prot) @@ -3355,6 +3357,8 @@ static int req_prot_init(const struct proto *prot) int proto_register(struct proto *prot, int alloc_slab) { + int ret = -ENOBUFS; + if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, @@ -3391,20 +3395,27 @@ int proto_register(struct proto *prot, int alloc_slab) } mutex_lock(&proto_list_mutex); + ret = assign_proto_idx(prot); + if (ret) { + mutex_unlock(&proto_list_mutex); + goto out_free_timewait_sock_slab_name; + } list_add(&prot->node, &proto_list); - assign_proto_idx(prot); mutex_unlock(&proto_list_mutex); - return 0; + return ret; out_free_timewait_sock_slab_name: - kfree(prot->twsk_prot->twsk_slab_name); + if (alloc_slab && prot->twsk_prot) + kfree(prot->twsk_prot->twsk_slab_name); out_free_request_sock_slab: - req_prot_cleanup(prot->rsk_prot); + if (alloc_slab) { + req_prot_cleanup(prot->rsk_prot); - kmem_cache_destroy(prot->slab); - prot->slab = NULL; + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } out: - return -ENOBUFS; + return ret; } EXPORT_SYMBOL(proto_register); -- cgit v1.2.3